From 8a849a2a567d4e519b246a16936b6e7519936d4b Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Thu, 10 Oct 2024 13:37:44 +0200 Subject: [PATCH 001/177] Revert "Reapply "[AMDGPU][GlobalISel] Fix load/store of pointer vectors, buffer.*.pN (#110714)" v2 (#111708)" This reverts commit 4b4a0d419c81b8b12a7dbb33dae1f7e9be91a88f. New test fails on buildbots https://lab.llvm.org/buildbot/#/builders/63/builds/2039 https://lab.llvm.org/buildbot/#/builders/127/builds/1055 --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 61 +- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 12 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +- .../GlobalISel/buffer-load-store-pointers.ll | 301 -- ...st-select-load-global-old-legalization.mir | 3300 ----------------- .../GlobalISel/inst-select-load-local.mir | 96 +- .../GlobalISel/legalize-load-constant.mir | 51 +- .../AMDGPU/GlobalISel/legalize-load-flat.mir | 152 +- .../GlobalISel/legalize-load-global.mir | 98 +- .../AMDGPU/GlobalISel/legalize-load-local.mir | 50 +- .../GlobalISel/legalize-load-private.mir | 83 +- .../GlobalISel/legalize-store-global.mir | 84 +- 12 files changed, 275 insertions(+), 4015 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 4409a0d50e553e..b35f9faf024bdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -494,8 +494,6 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { return false; const unsigned Size = Ty.getSizeInBits(); - if (Ty.isPointerVector()) - return true; if (Size <= 64) return false; // Address space 8 pointers get their own workaround. 
@@ -504,6 +502,9 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { if (!Ty.isVector()) return true; + if (Ty.isPointerVector()) + return true; + unsigned EltSize = Ty.getScalarSizeInBits(); return EltSize != 32 && EltSize != 64; } @@ -5817,9 +5818,8 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, return Reg; } -Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, - Register VData, LLT MemTy, - bool IsFormat) const { +Register AMDGPULegalizerInfo::fixStoreSourceType( + MachineIRBuilder &B, Register VData, bool IsFormat) const { MachineRegisterInfo *MRI = B.getMRI(); LLT Ty = MRI->getType(VData); @@ -5829,10 +5829,6 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, if (hasBufferRsrcWorkaround(Ty)) return castBufferRsrcToV4I32(VData, B); - if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { - Ty = getBitcastRegisterType(Ty); - VData = B.buildBitcast(Ty, VData).getReg(0); - } // Fixup illegal register types for i8 stores. if (Ty == LLT::scalar(8) || Ty == S16) { Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); @@ -5850,27 +5846,23 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, } bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, - LegalizerHelper &Helper, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsTyped, bool IsFormat) const { - MachineIRBuilder &B = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *B.getMRI(); - Register VData = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(VData); LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const LLT S32 = LLT::scalar(32); - MachineMemOperand *MMO = *MI.memoperands_begin(); - const int MemSize = MMO->getSize().getValue(); - LLT MemTy = MMO->getMemoryType(); - - VData = fixStoreSourceType(B, VData, MemTy, IsFormat); - + VData = fixStoreSourceType(B, VData, IsFormat); castBufferRsrcArgToV4I32(MI, B, 2); Register RSrc = MI.getOperand(2).getReg(); + 
MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize().getValue(); + unsigned ImmOffset; // The typed intrinsics add an immediate after the registers. @@ -5962,13 +5954,10 @@ static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, } bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, - LegalizerHelper &Helper, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat, bool IsTyped) const { - MachineIRBuilder &B = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *B.getMRI(); - GISelChangeObserver &Observer = Helper.Observer; - // FIXME: Verifier should enforce 1 MMO for these intrinsics. MachineMemOperand *MMO = *MI.memoperands_begin(); const LLT MemTy = MMO->getMemoryType(); @@ -6017,21 +6006,9 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the // logic doesn't have to handle that case. if (hasBufferRsrcWorkaround(Ty)) { - Observer.changingInstr(MI); Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); - Observer.changedInstr(MI); Dst = MI.getOperand(0).getReg(); - B.setInsertPt(B.getMBB(), MI); } - if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { - Ty = getBitcastRegisterType(Ty); - Observer.changingInstr(MI); - Helper.bitcastDst(MI, Ty, 0); - Observer.changedInstr(MI); - Dst = MI.getOperand(0).getReg(); - B.setInsertPt(B.getMBB(), MI); - } - LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const bool Unpacked = ST.hasUnpackedD16VMem(); @@ -7411,17 +7388,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_ptr_buffer_store: - return legalizeBufferStore(MI, Helper, false, false); + return legalizeBufferStore(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_store_format: case 
Intrinsic::amdgcn_raw_ptr_buffer_store_format: case Intrinsic::amdgcn_struct_buffer_store_format: case Intrinsic::amdgcn_struct_ptr_buffer_store_format: - return legalizeBufferStore(MI, Helper, false, true); + return legalizeBufferStore(MI, MRI, B, false, true); case Intrinsic::amdgcn_raw_tbuffer_store: case Intrinsic::amdgcn_raw_ptr_tbuffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: - return legalizeBufferStore(MI, Helper, true, true); + return legalizeBufferStore(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_atomic_buffer_load: @@ -7430,17 +7407,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_struct_ptr_buffer_load: case Intrinsic::amdgcn_struct_atomic_buffer_load: case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: - return legalizeBufferLoad(MI, Helper, false, false); + return legalizeBufferLoad(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_buffer_load_format: case Intrinsic::amdgcn_struct_ptr_buffer_load_format: - return legalizeBufferLoad(MI, Helper, true, false); + return legalizeBufferLoad(MI, MRI, B, true, false); case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_raw_ptr_tbuffer_load: case Intrinsic::amdgcn_struct_tbuffer_load: case Intrinsic::amdgcn_struct_ptr_tbuffer_load: - return legalizeBufferLoad(MI, Helper, true, true); + return legalizeBufferLoad(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_atomic_swap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: case Intrinsic::amdgcn_struct_buffer_atomic_swap: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 86c15197805d23..84470dc75b60ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -195,13 +195,15 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore = false) const; - Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, + Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const; - bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, - bool IsTyped, bool IsFormat) const; - bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, - bool IsFormat, bool IsTyped) const; + bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsTyped, + bool IsFormat) const; + bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat, + bool IsTyped) const; bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 902feacede83f4..ef9adde13348fe 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -590,7 +590,7 @@ class RegisterTypes reg_types> { def Reg16Types : RegisterTypes<[i16, f16, bf16]>; def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>; -def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, p1, p4, v4i16, v4f16, v4bf16]>; +def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>; def Reg96Types : RegisterTypes<[v3i32, v3f32]>; def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll deleted file mode 100644 index 091c9f143ce7ee..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll +++ 
/dev/null @@ -1,301 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck --check-prefix=GFX9 %s - -define ptr @buffer_load_p0(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p0 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 - %ret = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr %ret -} - -define void @buffer_store_p0(ptr %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p0 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE 
[[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define ptr addrspace(1) @buffer_load_p1(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p1 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: SI_RETURN 
implicit $vgpr0, implicit $vgpr1 - %ret = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr addrspace(1) %ret -} - -define void @buffer_store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p1 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define ptr addrspace(4) @buffer_load_p4(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p4 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 - %ret = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr addrspace(4) %ret -} - -define void @buffer_store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p4 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void 
@llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define ptr addrspace(5) @buffer_load_p5(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0 - %ret = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr addrspace(5) %ret -} - -define void @buffer_store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; 
GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define <2 x ptr addrspace(1)> @buffer_load_v2p1(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_v2p1 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s64>) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: $vgpr2 = COPY [[COPY6]] - ; GFX9-NEXT: $vgpr3 = COPY [[COPY7]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %ret = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret <2 x ptr addrspace(1)> %ret -} - -define 
void @buffer_store_v2p5(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_v2p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s64>) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define <3 x ptr addrspace(5)> @buffer_load_v3p5(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_v3p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: 
[[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: $vgpr2 = COPY [[COPY6]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %ret = call <3 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v3p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret <3 x ptr addrspace(5)> %ret -} - -define void @buffer_store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_v3p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY 
$sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define <4 x ptr addrspace(5)> @buffer_load_v4p5(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_v4p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: $vgpr2 = COPY [[COPY6]] - ; GFX9-NEXT: $vgpr3 = COPY [[COPY7]] - 
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %ret = call <4 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v4p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret <4 x ptr addrspace(5)> %ret -} - -define void @buffer_store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_v4p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir deleted file mode 100644 index a7e3a86024201b..00000000000000 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir +++ /dev/null @@ -1,3300 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7-FLAT %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s - ---- - -name: load_global_s32_from_4 - -legalized: true -regBankSelected: true -tracksRegLiveness: true - - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_4 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; 
GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_4 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_4 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX8-LABEL: name: load_global_s32_from_4 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = 
FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX9-LABEL: name: load_global_s32_from_4 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX10-LABEL: name: load_global_s32_from_4 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX11-LABEL: name: load_global_s32_from_4 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX12-LABEL: name: load_global_s32_from_4 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) - $vgpr0 = COPY %1 - -... 
- ---- - -name: load_global_s32_from_2 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_2 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_2 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_2 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] - ; - ; GFX8-LABEL: name: load_global_s32_from_2 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] - ; - ; GFX9-LABEL: name: load_global_s32_from_2 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] - ; - ; GFX10-LABEL: name: load_global_s32_from_2 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] - ; - ; GFX11-LABEL: name: load_global_s32_from_2 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] - ; - ; GFX12-LABEL: name: load_global_s32_from_2 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY 
[[GLOBAL_LOAD_USHORT]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_s32_from_1 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: 
load_global_s32_from_1 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_v2s32 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2s32 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2s32 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: 
(load (<2 x s32>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2s32 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_v2s32 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_v2s32 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: load_global_v2s32 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_v2s32 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: 
name: load_global_v2s32 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... - ---- - -name: load_global_v4s32 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v4s32 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v4s32 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v4s32 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v4s32 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX9-LABEL: name: load_global_v4s32 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX10-LABEL: name: load_global_v4s32 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - 
; GFX11-LABEL: name: load_global_v4s32 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX12-LABEL: name: load_global_v4s32 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... - ---- - -name: load_global_s64 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s64 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s64 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: 
{{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s64 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_s64 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_s64 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: load_global_s64 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_s64 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: name: load_global_s64 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... 
- ---- - -name: load_global_v2s64 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2s64 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2s64 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2s64 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ 
$}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v2s64 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX9-LABEL: name: load_global_v2s64 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX10-LABEL: name: load_global_v2s64 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX11-LABEL: name: load_global_v2s64 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX12-LABEL: name: load_global_v2s64 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... - ---- - -name: load_global_v2p1 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2p1 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2p1 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], 
%subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2p1 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v2p1 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX9-LABEL: name: load_global_v2p1 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX10-LABEL: name: load_global_v2p1 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX11-LABEL: name: load_global_v2p1 - ; GFX11: liveins: $vgpr0_vgpr1 
- ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX12-LABEL: name: load_global_v2p1 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 1) - %2:vgpr(<2 x p1>) = G_BITCAST %1(<2 x s64>) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 - -... - ---- - -name: load_global_s128 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s128 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX7-LABEL: name: load_global_s128 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX7-FLAT-LABEL: name: load_global_s128 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; 
GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX8-LABEL: name: load_global_s128 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX9-LABEL: name: load_global_s128 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX10-LABEL: name: load_global_s128 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX11-LABEL: name: load_global_s128 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX12-LABEL: name: load_global_s128 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... 
- ---- - -name: load_global_p3_from_4 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_p3_from_4 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_p3_from_4 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_p3_from_4 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 
- ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX8-LABEL: name: load_global_p3_from_4 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX9-LABEL: name: load_global_p3_from_4 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX10-LABEL: name: load_global_p3_from_4 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX11-LABEL: name: load_global_p3_from_4 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX12-LABEL: name: load_global_p3_from_4 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - 
%1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_p1_from_8 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_p1_from_8 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_p1_from_8 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_p1_from_8 - ; GFX7-FLAT: 
liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_p1_from_8 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_p1_from_8 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: load_global_p1_from_8 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_p1_from_8 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: name: load_global_p1_from_8 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: 
[[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... - ---- - -name: load_global_p999_from_8 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_p999_from_8 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX7-LABEL: name: load_global_p999_from_8 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX7-FLAT-LABEL: name: load_global_p999_from_8 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX8-LABEL: name: load_global_p999_from_8 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX9-LABEL: name: load_global_p999_from_8 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 
1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX10-LABEL: name: load_global_p999_from_8 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX11-LABEL: name: load_global_p999_from_8 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX12-LABEL: name: load_global_p999_from_8 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... 
- ---- - -name: load_global_v2p3 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2p3 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX7-LABEL: name: load_global_v2p3 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX7-FLAT-LABEL: name: load_global_v2p3 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX8-LABEL: name: load_global_v2p3 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX9-LABEL: name: load_global_v2p3 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX10-LABEL: name: load_global_v2p3 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x 
p3>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX11-LABEL: name: load_global_v2p3 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX12-LABEL: name: load_global_v2p3 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... - ---- - -name: load_global_v2s16 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2s16 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2s16 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; 
GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2s16 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX8-LABEL: name: load_global_v2s16 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX9-LABEL: name: load_global_v2s16 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX10-LABEL: name: load_global_v2s16 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX11-LABEL: name: load_global_v2s16 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX12-LABEL: name: load_global_v2s16 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_v4s16 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v4s16 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: 
load_global_v4s16 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v4s16 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_v4s16 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_v4s16 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: 
load_global_v4s16 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_v4s16 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: name: load_global_v4s16 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... 
- ---- - -name: load_global_v8s16 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v8s16 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v8s16 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v8s16 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ 
$}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v8s16 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX9-LABEL: name: load_global_v8s16 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX10-LABEL: name: load_global_v8s16 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX11-LABEL: name: load_global_v8s16 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX12-LABEL: name: load_global_v8s16 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX12-NEXT: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... - -################################################################################ -### Stress addressing modes -################################################################################ - ---- - -name: load_global_s32_from_1_gep_2047 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, 
[[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - 
; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 2047 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_2048 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 
- ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 2048 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_m2047 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; 
GFX9-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -2047 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_m2048 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; 
GFX9-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -2048 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_4095 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; 
GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; 
GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX11: liveins: $vgpr0_vgpr1 - ; 
GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 4095 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_4096 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX7: liveins: $vgpr0_vgpr1 - ; 
GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, 
implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX11-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 4096 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_m4095 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; 
GFX9-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX12: 
liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -4095 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_m4096 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] 
- ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: 
$vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -4096 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_8191 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: 
$vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, 
[[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 8191 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_8192 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = 
COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 
0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 8192 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_m8191 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -8191 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_m8192 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY 
[[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -8192 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_24bit_max -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX10-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 8388607 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_2x_24bit_max -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY 
[[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE 
[[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 16777214 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_24bit_min -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; 
GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit 
$exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; 
GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: 
load_global_s32_from_1_gep_24bit_min - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -8388608 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_2x_24bit_min -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: 
[[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 
0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -16777215 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir index 59c57a5fefbed9..280c7a5a492da8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s @@ -24,7 +24,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s32), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; 
GFX7-LABEL: name: load_local_s32_from_4 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -32,14 +31,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s32), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_s32_from_4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -70,7 +67,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U16_]] - ; ; GFX7-LABEL: name: load_local_s32_from_2 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -78,14 +74,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U16_]] - ; ; GFX9-LABEL: name: load_local_s32_from_2 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U16_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -119,7 +113,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1 ; 
GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -127,14 +120,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -165,7 +156,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_v2s32 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -173,14 +163,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_v2s32 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v2s32 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -211,7 +199,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) - ; ; GFX7-LABEL: name: load_local_v2s32_align4 ; GFX7: liveins: $vgpr0 ; 
GFX7-NEXT: {{ $}} @@ -219,14 +206,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_v2s32_align4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s32>), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v2s32_align4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -257,7 +242,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s64), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_s64 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -265,14 +249,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s64), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_s64 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -303,7 +285,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load (s64), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) - ; ; GFX7-LABEL: name: load_local_s64_align4 ; GFX7: liveins: $vgpr0 ; 
GFX7-NEXT: {{ $}} @@ -311,14 +292,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_s64_align4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64_align4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -349,7 +328,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p3), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX7-LABEL: name: load_local_p3_from_4 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -357,14 +335,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p3), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_p3_from_4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p3_from_4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -395,7 +371,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p5), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX7-LABEL: name: load_local_p5_from_4 ; GFX7: liveins: $vgpr0 ; 
GFX7-NEXT: {{ $}} @@ -403,14 +378,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p5), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_p5_from_4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (p5), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p5_from_4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -441,7 +414,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p1), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_p1_align8 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -449,14 +421,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p1), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_p1_align8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p1_align8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -487,7 +457,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load (p1), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) - ; ; GFX7-LABEL: name: load_local_p1_align4 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -495,14 +464,12 @@ body: | ; 
GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (p1), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_p1_align4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (p1), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p1_align4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -533,7 +500,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load (p999), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; ; GFX7-LABEL: name: load_local_p999_from_8 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -541,14 +507,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load (p999), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; ; GFX9-LABEL: name: load_local_p999_from_8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load (p999), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; ; GFX10-LABEL: name: load_local_p999_from_8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -575,36 +539,32 @@ body: | ; GFX6-LABEL: name: load_local_v2p3 ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; + ; 
GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX7-LABEL: name: load_local_v2p3 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX9-LABEL: name: load_local_v2p3 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX10-LABEL: name: load_local_v2p3 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:vgpr(p3) = COPY $vgpr0 - %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 3) - %2:vgpr(<2 x p3>) = G_BITCAST %1(<2 x s32>) - $vgpr0_vgpr1 
= COPY %2 + %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 3) + $vgpr0_vgpr1 = COPY %1 ... @@ -626,7 +586,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s16>), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX7-LABEL: name: load_local_v2s16 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -634,14 +593,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s16>), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_v2s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v2s16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -672,7 +629,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<4 x s16>), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_v4s16 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -680,14 +636,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<4 x s16>), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_v4s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY 
[[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v4s16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -744,7 +698,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -752,14 +705,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 65535, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -794,7 +745,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -804,7 +754,6 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -813,7 +762,6 @@ body: | ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], 
implicit $exec ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_AND_B32_e64_]], 65535, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -853,7 +801,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -863,7 +810,6 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -872,7 +818,6 @@ body: | ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -909,7 +854,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -919,7 +863,6 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 
3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -928,7 +871,6 @@ body: | ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -965,7 +907,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load (s64), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) - ; ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -973,14 +914,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 254, 255, 0, implicit $m0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1015,7 +954,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load (s64), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) - ; ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1025,7 +963,6 @@ body: | ; 
GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[V_ADD_CO_U32_e64_]], 0, 1, 0, implicit $m0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -1034,7 +971,6 @@ body: | ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir index ff1d3fe3796732..a63df136e003c3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir @@ -6604,25 +6604,22 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_constant_v2p3_align8 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: 
$vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_constant_v2p3_align8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 4) $vgpr0_vgpr1 = COPY %1 @@ -6638,25 +6635,22 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), align 4, addrspace 4) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), align 4, addrspace 4) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_constant_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), align 4, addrspace 4) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), align 4, addrspace 4) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-LABEL: name: 
test_load_constant_v2p3_align4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), align 4, addrspace 4) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), align 4, addrspace 4) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 4) $vgpr0_vgpr1 = COPY %1 @@ -6689,6 +6683,7 @@ body: | ; CI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; CI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; CI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; CI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -6704,9 +6699,9 @@ body: | ; CI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; CI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; CI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_constant_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1 @@ -6729,6 +6724,7 @@ body: | ; VI-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; VI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -6744,9 +6740,9 @@ body: | ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_constant_v2p3_align1 ; GFX9: liveins: $vgpr0_vgpr1 @@ -6769,6 +6765,7 @@ body: | ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -6784,9 +6781,9 @@ body: | ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] 
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 4) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index 3b166660a84b75..b1d7d36f9912e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -15192,73 +15192,65 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 8) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3), align 8) ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_flat_v2p3_align8 ; VI: liveins: 
$vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 8) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3), align 8) ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align8 ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align8 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = 
G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align8 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align8 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX9PLUS-NEXT: {{ $}} ; UNALIGNED_GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; UNALIGNED_GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX11PLUS-LABEL: name: test_load_flat_v2p3_align8 ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX11PLUS-NEXT: {{ $}} ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align8 ; 
UNALIGNED_GFX12: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -15274,73 +15266,65 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3)) ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_flat_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3)) ; VI-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align4 ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align4 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align4 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align4 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX9PLUS-NEXT: {{ $}} ; UNALIGNED_GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; UNALIGNED_GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX11PLUS-LABEL: name: test_load_flat_v2p3_align4 ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX11PLUS-NEXT: {{ $}} ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align4 ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX12-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -15373,6 +15357,7 @@ body: | ; CI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; CI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; CI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; CI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15388,9 +15373,9 @@ body: | ; CI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; CI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; CI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_flat_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1 @@ -15413,6 +15398,7 @@ body: | ; VI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR 
[[SHL2]], [[OR]] + ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; VI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15428,9 +15414,9 @@ body: | ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -15453,6 +15439,7 @@ body: | ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15468,9 +15455,9 @@ body: | ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX9PLUS-NEXT: 
[[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 @@ -15493,6 +15480,7 @@ body: | ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15508,9 +15496,9 @@ body: | ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 @@ -15533,6 +15521,7 @@ body: | ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], 
[[C3]](s32) ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15548,9 +15537,9 @@ body: | ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -15573,6 +15562,7 @@ body: | ; UNALIGNED_GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNALIGNED_GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; UNALIGNED_GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; UNALIGNED_GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; UNALIGNED_GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; UNALIGNED_GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15588,9 +15578,9 @@ body: | ; UNALIGNED_GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; UNALIGNED_GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL 
[[OR4]], [[C3]](s32) ; UNALIGNED_GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; UNALIGNED_GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; UNALIGNED_GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; UNALIGNED_GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1 @@ -15613,6 +15603,7 @@ body: | ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15628,9 +15619,9 @@ body: | ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11PLUS-NEXT: 
[[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1 @@ -15653,6 +15644,7 @@ body: | ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15668,9 +15660,9 @@ body: | ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 0) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir index f384114ee4cde7..d6acc6ecdfc660 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -13448,49 +13448,43 @@ body: | ; SI: liveins: $vgpr0_vgpr1 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-HSA-LABEL: name: test_load_global_v2p3_align8 ; CI-HSA: liveins: $vgpr0_vgpr1 ; CI-HSA-NEXT: {{ $}} ; CI-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; CI-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-MESA-LABEL: name: test_load_global_v2p3_align8 ; CI-MESA: liveins: $vgpr0_vgpr1 ; CI-MESA-NEXT: {{ $}} ; CI-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; CI-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_global_v2p3_align8 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; 
VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align8 ; GFX9-HSA: liveins: $vgpr0_vgpr1 ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-MESA-LABEL: name: test_load_global_v2p3_align8 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -13506,49 +13500,43 @@ body: | ; SI: liveins: $vgpr0_vgpr1 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; SI-NEXT: 
[[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-HSA-LABEL: name: test_load_global_v2p3_align4 ; CI-HSA: liveins: $vgpr0_vgpr1 ; CI-HSA-NEXT: {{ $}} ; CI-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; CI-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-MESA-LABEL: name: test_load_global_v2p3_align4 ; CI-MESA: liveins: $vgpr0_vgpr1 ; CI-MESA-NEXT: {{ $}} ; CI-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; CI-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_global_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 
x p3>) ; ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align4 ; GFX9-HSA: liveins: $vgpr0_vgpr1 ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-MESA-LABEL: name: test_load_global_v2p3_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -13581,6 +13569,7 @@ body: | ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; SI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; SI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; SI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13596,17 +13585,16 @@ body: | ; SI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = 
G_SHL [[OR4]], [[C3]](s32) ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; CI-HSA-LABEL: name: test_load_global_v2p3_align1 ; CI-HSA: liveins: $vgpr0_vgpr1 ; CI-HSA-NEXT: {{ $}} ; CI-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 1, addrspace 1) - ; CI-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 1, addrspace 1) + ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-MESA-LABEL: name: test_load_global_v2p3_align1 ; CI-MESA: liveins: $vgpr0_vgpr1 @@ -13629,6 +13617,7 @@ body: | ; CI-MESA-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CI-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; CI-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CI-MESA-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; CI-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; CI-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13644,9 +13633,9 @@ body: | ; CI-MESA-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; CI-MESA-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; CI-MESA-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR 
[[SHL5]], [[OR3]] - ; CI-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; CI-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-MESA-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_global_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1 @@ -13669,6 +13658,7 @@ body: | ; VI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; VI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13684,17 +13674,16 @@ body: | ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align1 ; GFX9-HSA: liveins: $vgpr0_vgpr1 ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 1, addrspace 1) - ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 1, addrspace 1) + ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-MESA-LABEL: name: test_load_global_v2p3_align1 ; GFX9-MESA: liveins: $vgpr0_vgpr1 @@ -13717,6 +13706,7 @@ body: | ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX9-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX9-MESA-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13732,9 +13722,9 @@ body: | ; GFX9-MESA-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX9-MESA-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX9-MESA-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-MESA-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 1) $vgpr0_vgpr1 = COPY %1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir index 1608234d6b2bc5..1249de647bb759 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -14595,81 +14595,71 @@ body: | ; SI: liveins: $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-LABEL: name: test_load_local_v2p3_align8 ; CI: liveins: $vgpr0 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-DS128-LABEL: name: test_load_local_v2p3_align8 ; CI-DS128: liveins: $vgpr0 ; CI-DS128-NEXT: {{ $}} ; CI-DS128-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; CI-DS128-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-DS128-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-DS128-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; CI-DS128-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_local_v2p3_align8 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: 
[[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_local_v2p3_align8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 ; GFX9-UNALIGNED: liveins: $vgpr0 ; GFX9-UNALIGNED-NEXT: {{ $}} ; GFX9-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX9-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX10-LABEL: name: test_load_local_v2p3_align8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX10-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 ; GFX10-UNALIGNED: liveins: $vgpr0 ; GFX10-UNALIGNED-NEXT: {{ $}} ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX10-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11-LABEL: name: test_load_local_v2p3_align8 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 ; GFX11-UNALIGNED: liveins: $vgpr0 ; GFX11-UNALIGNED-NEXT: {{ $}} ; GFX11-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX11-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x p3>) = 
G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 472cbe559e56f4..741f878c86f8b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -16526,117 +16526,106 @@ body: | ; SI: liveins: $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; SI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; CI-LABEL: name: test_load_private_v2p3_align8 ; CI: liveins: $vgpr0 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from 
unknown-address + 4, addrspace 5) - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_private_v2p3_align8 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_private_v2p3_align8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX10-LABEL: name: test_load_private_v2p3_align8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX11-LABEL: name: test_load_private_v2p3_align8 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 
- ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_private_v2p3_align8 ; GFX12: liveins: $vgpr0 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX9: liveins: $vgpr0 ; UNALIGNED_GFX9-NEXT: {{ $}} ; UNALIGNED_GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; UNALIGNED_GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; UNALIGNED_GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; UNALIGNED_GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; UNALIGNED_GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; UNALIGNED_GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from 
unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; UNALIGNED_GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX10-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX10: liveins: $vgpr0 ; UNALIGNED_GFX10-NEXT: {{ $}} ; UNALIGNED_GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; UNALIGNED_GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; UNALIGNED_GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; UNALIGNED_GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; UNALIGNED_GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; UNALIGNED_GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; UNALIGNED_GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX11-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) 
= G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 5) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir index b9c72d39ed45b6..f2a88a21a286ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -2310,9 +2310,9 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; SI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; SI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2332,7 
+2332,8 @@ body: | ; SI-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 3, addrspace 1) ; SI-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY4]], [[C]](s32) ; SI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) @@ -2352,17 +2353,16 @@ body: | ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 1, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 1, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2383,7 +2383,8 @@ body: | ; VI-NEXT: G_STORE 
[[ANYEXT1]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 3, addrspace 1) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; VI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) ; VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) @@ -2404,8 +2405,7 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 1, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 1, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 1, addrspace 1) @@ -2422,9 +2422,9 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; SI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; SI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2433,7 +2433,8 @@ 
body: | ; SI-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 2, addrspace 1) ; SI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) ; SI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; SI-NEXT: G_STORE [[COPY3]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -2444,17 +2445,16 @@ body: | ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 2, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 2, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align2 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2463,7 +2463,8 @@ 
body: | ; VI-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 2, addrspace 1) ; VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; VI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) ; VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; VI-NEXT: G_STORE [[COPY3]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -2474,8 +2475,7 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 2, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 2, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 2, addrspace 1) @@ -2492,32 +2492,28 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; SI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) ; ; CI-LABEL: name: test_store_global_v2p3_align4 ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x 
p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; VI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) ; ; GFX9-LABEL: name: test_store_global_v2p3_align4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 4, addrspace 1) @@ -2534,32 +2530,28 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; SI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) ; ; CI-LABEL: name: test_store_global_v2p3_align8 ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY 
$vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align8 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; VI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) ; ; GFX9-LABEL: name: test_store_global_v2p3_align8 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 8, addrspace 1) @@ -2576,32 +2568,28 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; SI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) ; ; CI-LABEL: name: test_store_global_v2p3_align16 ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 
; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align16 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; VI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) ; ; GFX9-LABEL: name: test_store_global_v2p3_align16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 16, addrspace 1) From 8306114ed2313a7febdb0d0d0c31df357ed53fdd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 10 Oct 2024 12:49:35 +0100 Subject: [PATCH 002/177] [clang][x86] Add constexpr support for _mm_cvtsi32_ss/_mm_cvt_si2ss/_mm_cvtsi64_ss SSE1 intrinsics Followup to #111001 --- clang/lib/Headers/xmmintrin.h | 15 ++++++--------- clang/test/CodeGen/X86/sse-builtins.c | 9 +++++++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/clang/lib/Headers/xmmintrin.h 
b/clang/lib/Headers/xmmintrin.h index 2aa688adefc25a..20e66d190113a3 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -1618,9 +1618,8 @@ _mm_cvtt_ps2pi(__m128 __a) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi32_ss(__m128 __a, int __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, + int __b) { __a[0] = __b; return __a; } @@ -1641,9 +1640,8 @@ _mm_cvtsi32_ss(__m128 __a, int __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvt_si2ss(__m128 __a, int __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, + int __b) { return _mm_cvtsi32_ss(__a, __b); } @@ -1665,9 +1663,8 @@ _mm_cvt_si2ss(__m128 __a, int __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi64_ss(__m128 __a, long long __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi64_ss(__m128 __a, long long __b) { __a[0] = __b; return __a; } diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c index 932d6f36b09b66..391e049a6ae3ef 100644 --- a/clang/test/CodeGen/X86/sse-builtins.c +++ b/clang/test/CodeGen/X86/sse-builtins.c @@ -948,6 +948,15 @@ void test_constexpr() { constexpr __m128 v_mm_movelh_ps = _mm_movelh_ps(k1, k2); static_assert(v_mm_movelh_ps[0] == +1.0f && v_mm_movelh_ps[1] == +0.0f && v_mm_movelh_ps[2] == +8.0f && v_mm_movelh_ps[3] == +4.0f); + constexpr __m128 v_mm_cvtsi32_ss = _mm_cvtsi32_ss(k1, 42); + static_assert(v_mm_cvtsi32_ss[0] == 42.0f && v_mm_cvtsi32_ss[1] == +0.0f && v_mm_cvtsi32_ss[2] == +2.0f && v_mm_cvtsi32_ss[3] == +4.0f); + + constexpr __m128 v_mm_cvt_si2ss = _mm_cvt_si2ss(k2, -99); + static_assert(v_mm_cvt_si2ss[0] == -99.0f && v_mm_cvt_si2ss[1] == +4.0f && v_mm_cvt_si2ss[2] == +2.0f && v_mm_cvt_si2ss[3] == +1.0f); + + constexpr __m128 v_mm_cvtsi64_ss = _mm_cvtsi64_ss(k3, 555); + static_assert(v_mm_cvtsi64_ss[0] == 555.0f && v_mm_cvtsi64_ss[1] == -5.0f && v_mm_cvtsi64_ss[2] == +6.0f && v_mm_cvtsi64_ss[3] == +7.0f); + static_assert(_mm_cvtss_f32(k2) == +8.0f); } From ea2b8976e69ad70220f71abf28d6781dc1e41fab Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 10 Oct 2024 07:57:45 -0400 Subject: [PATCH 003/177] [libc++] Remove nonexistent directory from check-generated-output (#111746) The libcxx/benchmarks directory was moved to libcxx/test/benchmarks, which is already checked by that grep command. 
--- libcxx/utils/ci/run-buildbot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 536d6270361307..0ce1def5f37224 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -238,7 +238,7 @@ check-generated-output) # Reject patches that introduce non-ASCII characters or hard tabs. # Depends on LC_COLLATE set at the top of this script. set -x - ! grep -rn '[^ -~]' libcxx/include libcxx/src libcxx/test libcxx/benchmarks \ + ! grep -rn '[^ -~]' libcxx/include libcxx/src libcxx/test \ --exclude '*.dat' \ --exclude '*unicode*.cpp' \ --exclude '*print*.sh.cpp' \ From b94c763b7c123995ae31a6ce44223e89ef9f226a Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 10 Oct 2024 04:59:34 -0700 Subject: [PATCH 004/177] [Fuchsia][CMake] Set output name for libc++ shared library (#111791) This is a dependency of #80007. --- clang/cmake/caches/Fuchsia-stage2.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 26ae30c71b4df3..5af98c7b3b3fba 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -345,6 +345,7 @@ foreach(target armv6m-none-eabi;armv7m-none-eabi;armv8m.main-none-eabi) set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_SHARED_OUTPUT_NAME "c++-shared" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_LIBC "llvm-libc" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") @@ -396,6 +397,7 @@ foreach(target riscv32-unknown-elf) set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") 
set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_SHARED_OUTPUT_NAME "c++-shared" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_LIBC "llvm-libc" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") From 917ada35cd937ad4104dff89c48398bd796ba6b7 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 10 Oct 2024 08:00:01 -0400 Subject: [PATCH 005/177] [runtimes] Always define cxx_shared, cxx_static & other targets (#80007) This patch always defines the cxx_shared, cxx_static & other top-level targets. However, they are marked as EXCLUDE_FROM_ALL when we don't want to build them. Simply declaring the targets should be of no harm, and it allows other projects to mention these targets regardless of whether they end up being built or not. This patch basically moves the definition of e.g. cxx_shared out of the `if (LIBCXX_ENABLE_SHARED)` and instead marks it as EXCLUDE_FROM_ALL conditionally on whether LIBCXX_ENABLE_SHARED is passed. It then does the same for libunwind and libc++abi targets. I purposefully avoided to reformat the files (which now has inconsistent indentation) because I wanted to keep the diff minimal, and I know this is an area of the code where folks may have downstream diffs. I will re-indent the code separately once this patch lands. This is a reapplication of 79ee0342dbf0, which was reverted in a3539090884c because it broke the TSAN and the Fuchsia builds. 
Resolves #77654 Differential Revision: https://reviews.llvm.org/D134221 --- libcxx/cmake/caches/AIX.cmake | 7 +++++++ libcxx/cmake/caches/Armv7M-picolibc.cmake | 11 +++++++++++ libcxx/src/CMakeLists.txt | 22 +++++++++++----------- libcxxabi/src/CMakeLists.txt | 20 +++++++++++--------- libunwind/src/CMakeLists.txt | 18 ++++++++++-------- 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/libcxx/cmake/caches/AIX.cmake b/libcxx/cmake/caches/AIX.cmake index 4ec78f9bbd5923..036fdfdae60725 100644 --- a/libcxx/cmake/caches/AIX.cmake +++ b/libcxx/cmake/caches/AIX.cmake @@ -16,3 +16,10 @@ set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") set(LIBUNWIND_ENABLE_SHARED ON CACHE BOOL "") set(LIBUNWIND_ENABLE_STATIC OFF CACHE BOOL "") set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE" CACHE STRING "") + +# On AIX, both shared and static libraries are archived. As a result, both the static and the shared targets end +# up with a `.a` suffix, which conflict. To workaround that, we set a different output name for the static +# libraries, which we never actually build anyway. For more information, see https://gitlab.kitware.com/cmake/cmake/-/issues/19494. +set(LIBCXX_STATIC_OUTPUT_NAME "c++-static" CACHE STRING "") +set(LIBCXXABI_STATIC_OUTPUT_NAME "c++abi-static" CACHE STRING "") +set(LIBUNWIND_STATIC_OUTPUT_NAME "unwind-static" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv7M-picolibc.cmake b/libcxx/cmake/caches/Armv7M-picolibc.cmake index b5f9089308d22e..0f8189b457285e 100644 --- a/libcxx/cmake/caches/Armv7M-picolibc.cmake +++ b/libcxx/cmake/caches/Armv7M-picolibc.cmake @@ -39,3 +39,14 @@ set(LIBUNWIND_IS_BAREMETAL ON CACHE BOOL "") set(LIBUNWIND_REMEMBER_HEAP_ALLOC ON CACHE BOOL "") set(LIBUNWIND_USE_COMPILER_RT ON CACHE BOOL "") find_program(QEMU_SYSTEM_ARM qemu-system-arm REQUIRED) + +# On embedded platforms that don't support shared library targets, CMake implicitly changes shared +# library targets to be static library targets. 
This results in duplicate definitions of the static +# library targets even though we might not ever build the shared library target, which breaks the +# build. To work around this, we change the output name of the shared library target so that it +# can't conflict with the static library target. +# +# This is tracked by https://gitlab.kitware.com/cmake/cmake/-/issues/25759. +set(LIBCXX_SHARED_OUTPUT_NAME "c++-shared" CACHE STRING "") +set(LIBCXXABI_SHARED_OUTPUT_NAME "c++abi-shared" CACHE STRING "") +set(LIBUNWIND_SHARED_OUTPUT_NAME "unwind-shared" CACHE STRING "") diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index b187677ff2db52..9f31822065be9d 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -143,10 +143,6 @@ if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) ) endif() -if(NOT LIBCXX_INSTALL_LIBRARY) - set(exclude_from_all EXCLUDE_FROM_ALL) -endif() - if (APPLE AND LLVM_USE_SANITIZER) if (("${LLVM_USE_SANITIZER}" STREQUAL "Address") OR ("${LLVM_USE_SANITIZER}" STREQUAL "Address;Undefined") OR @@ -177,13 +173,13 @@ split_list(LIBCXX_COMPILE_FLAGS) split_list(LIBCXX_LINK_FLAGS) # Build the shared library. 
-if (LIBCXX_ENABLE_SHARED) - add_library(cxx_shared SHARED ${exclude_from_all} ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) + add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared PRIVATE ${LIBCXX_LIBRARIES}) set_target_properties(cxx_shared PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" LINK_FLAGS "${LIBCXX_LINK_FLAGS}" OUTPUT_NAME "${LIBCXX_SHARED_OUTPUT_NAME}" @@ -247,7 +243,10 @@ if (LIBCXX_ENABLE_SHARED) ) endif() +if (LIBCXX_ENABLE_SHARED) list(APPEND LIBCXX_BUILD_TARGETS "cxx_shared") +endif() + if(WIN32 AND NOT MINGW AND NOT "${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Windows") # Since we most likely do not have a mt.exe replacement, disable the # manifest bundling. This allows a normal cmake invocation to pass which @@ -260,19 +259,18 @@ if (LIBCXX_ENABLE_SHARED) APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker /MANIFEST:NO") endif() endif() -endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") # Build the static library. 
-if (LIBCXX_ENABLE_STATIC) - add_library(cxx_static STATIC ${exclude_from_all} ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) + add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static PRIVATE ${LIBCXX_LIBRARIES} PRIVATE libcxx-abi-static) set_target_properties(cxx_static PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" LINK_FLAGS "${LIBCXX_LINK_FLAGS}" OUTPUT_NAME "${LIBCXX_STATIC_OUTPUT_NAME}" @@ -295,16 +293,18 @@ if (LIBCXX_ENABLE_STATIC) target_compile_definitions(cxx_static PRIVATE _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() - list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") + if (LIBCXX_ENABLE_STATIC) + list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") + endif() # Attempt to merge the libc++.a archive and the ABI library archive into one. if (LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY) target_link_libraries(cxx_static PRIVATE libcxx-abi-static-objects) endif() -endif() # Add a meta-target for both libraries. 
add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS}) +# Build the experimental static library set(LIBCXX_EXPERIMENTAL_SOURCES experimental/keep.cpp ) diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index 480e528b819bb9..e496cf3339164e 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -184,10 +184,10 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO endif() target_compile_options(cxxabi_shared_objects PRIVATE "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") -if (LIBCXXABI_ENABLE_SHARED) add_library(cxxabi_shared SHARED) set_target_properties(cxxabi_shared PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" OUTPUT_NAME "${LIBCXXABI_SHARED_OUTPUT_NAME}" SOVERSION "1" @@ -208,10 +208,12 @@ if (LIBCXXABI_ENABLE_SHARED) PUBLIC cxxabi_shared_objects PRIVATE ${LIBCXXABI_LIBRARIES}) +if (LIBCXXABI_ENABLE_SHARED) list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_shared") - if (LIBCXXABI_INSTALL_SHARED_LIBRARY) - list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") - endif() +endif() +if (LIBCXXABI_INSTALL_SHARED_LIBRARY) + list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") +endif() # TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control # what libc++ re-exports. @@ -254,7 +256,6 @@ if (LIBCXXABI_ENABLE_SHARED) reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") endif() endif() -endif() # Build the static library. 
add_library(cxxabi_static_objects OBJECT EXCLUDE_FROM_ALL ${LIBCXXABI_SOURCES} ${LIBCXXABI_HEADERS}) @@ -294,13 +295,13 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() -if (LIBCXXABI_ENABLE_STATIC) add_library(cxxabi_static STATIC) if (LIBCXXABI_USE_LLVM_UNWINDER AND NOT LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY) target_link_libraries(cxxabi_static PUBLIC unwind_static) endif() set_target_properties(cxxabi_static PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" OUTPUT_NAME "${LIBCXXABI_STATIC_OUTPUT_NAME}" ) @@ -308,10 +309,11 @@ if (LIBCXXABI_ENABLE_STATIC) PUBLIC cxxabi_static_objects PRIVATE ${LIBCXXABI_STATIC_LIBRARIES} ${LIBCXXABI_LIBRARIES}) +if (LIBCXXABI_ENABLE_STATIC) list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_static") - if (LIBCXXABI_INSTALL_STATIC_LIBRARY) - list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_static") - endif() +endif() +if (LIBCXXABI_INSTALL_STATIC_LIBRARY) + list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_static") endif() # Add a meta-target for both libraries. 
diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 125cf4ffe912a3..3065bfc8a07050 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -153,11 +153,11 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO set_target_properties(unwind_shared_objects PROPERTIES POSITION_INDEPENDENT_CODE ON) # must set manually because it's an object library endif() -if (LIBUNWIND_ENABLE_SHARED) add_library(unwind_shared SHARED) target_link_libraries(unwind_shared PUBLIC unwind_shared_objects) set_target_properties(unwind_shared PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" LINKER_LANGUAGE C OUTPUT_NAME "${LIBUNWIND_SHARED_OUTPUT_NAME}" @@ -165,10 +165,11 @@ if (LIBUNWIND_ENABLE_SHARED) SOVERSION "1" ) +if (LIBUNWIND_ENABLE_SHARED) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_shared") - if (LIBUNWIND_INSTALL_SHARED_LIBRARY) - list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_shared") - endif() +endif() +if (LIBUNWIND_INSTALL_SHARED_LIBRARY) + list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_shared") endif() # Build the static library. @@ -199,20 +200,21 @@ if(LIBUNWIND_HIDE_SYMBOLS) target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS) endif() -if (LIBUNWIND_ENABLE_STATIC) add_library(unwind_static STATIC) target_link_libraries(unwind_static PUBLIC unwind_static_objects) set_target_properties(unwind_static PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" LINKER_LANGUAGE C OUTPUT_NAME "${LIBUNWIND_STATIC_OUTPUT_NAME}" ) +if (LIBUNWIND_ENABLE_STATIC) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_static") - if (LIBUNWIND_INSTALL_STATIC_LIBRARY) - list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_static") - endif() +endif() +if (LIBUNWIND_INSTALL_STATIC_LIBRARY) + list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_static") endif() # Add a meta-target for both libraries. 
From f8b7a65395a07073feff367145965214d95ba99a Mon Sep 17 00:00:00 2001 From: Petr Kurapov Date: Thu, 10 Oct 2024 14:04:52 +0200 Subject: [PATCH 006/177] [MLIR][GPU-LLVM] Add in-pass signature update for opencl kernels (#105664) Default to Global address space for memrefs that do not have an explicit address space set in the IR. --------- Co-authored-by: Victor Perez Co-authored-by: Jakub Kuderski Co-authored-by: Victor Perez --- .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 49 ++++++++++++++++++ .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir | 50 ++++++++++++++++--- 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 544f1f4a4f6a79..bb6a38c0e76edf 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -34,6 +34,8 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" +#define DEBUG_TYPE "gpu-to-llvm-spv" + using namespace mlir; namespace mlir { @@ -316,6 +318,38 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern { } }; +class MemorySpaceToOpenCLMemorySpaceConverter final : public TypeConverter { +public: + MemorySpaceToOpenCLMemorySpaceConverter(MLIRContext *ctx) { + addConversion([](Type t) { return t; }); + addConversion([ctx](BaseMemRefType memRefType) -> std::optional { + // Attach global addr space attribute to memrefs with no addr space attr + Attribute memSpaceAttr = memRefType.getMemorySpace(); + if (memSpaceAttr) + return std::nullopt; + + unsigned globalAddrspace = storageClassToAddressSpace( + spirv::ClientAPI::OpenCL, spirv::StorageClass::CrossWorkgroup); + Attribute addrSpaceAttr = + IntegerAttr::get(IntegerType::get(ctx, 64), globalAddrspace); + if (auto rankedType = dyn_cast(memRefType)) { + return MemRefType::get(memRefType.getShape(), + memRefType.getElementType(), + rankedType.getLayout(), addrSpaceAttr); + } + return 
UnrankedMemRefType::get(memRefType.getElementType(), + addrSpaceAttr); + }); + addConversion([this](FunctionType type) { + auto inputs = llvm::map_to_vector( + type.getInputs(), [this](Type ty) { return convertType(ty); }); + auto results = llvm::map_to_vector( + type.getResults(), [this](Type ty) { return convertType(ty); }); + return FunctionType::get(type.getContext(), inputs, results); + }); + } +}; + //===----------------------------------------------------------------------===// // Subgroup query ops. //===----------------------------------------------------------------------===// @@ -382,6 +416,21 @@ struct GPUToLLVMSPVConversionPass final LLVMTypeConverter converter(context, options); LLVMConversionTarget target(*context); + // Force OpenCL address spaces when they are not present + { + MemorySpaceToOpenCLMemorySpaceConverter converter(context); + AttrTypeReplacer replacer; + replacer.addReplacement([&converter](BaseMemRefType origType) + -> std::optional { + return converter.convertType(origType); + }); + + replacer.recursivelyReplaceElementsIn(getOperation(), + /*replaceAttrs=*/true, + /*replaceLocs=*/false, + /*replaceTypes=*/true); + } + target.addIllegalOp, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_conv_args(%{{.*}}: i32, %{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i32) attributes {gpu.kernel} { gpu.func @kernel_with_conv_args(%arg0: index, %arg1: memref) kernel { gpu.return } - // CHECK-64: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { - // CHECK-32: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { + // CHECK-64: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) 
attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { gpu.func @kernel_with_sized_memref(%arg0: memref<1xindex>) kernel { gpu.return } - // CHECK-64: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { - // CHECK-32: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { + // CHECK-64: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { gpu.func @kernel_with_ND_memref(%arg0: memref<128x128x128xindex>) kernel { gpu.return } @@ -566,6 +566,44 @@ gpu.module @kernels { // ----- +gpu.module @kernels { +// CHECK: llvm.func spir_funccc @_Z12get_group_idj(i32) +// CHECK-LABEL: llvm.func spir_funccc @no_address_spaces( +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> + gpu.func @no_address_spaces(%arg0: memref, %arg1: memref>, %arg2: memref) { + gpu.return + } + +// CHECK-LABEL: llvm.func spir_kernelcc @no_address_spaces_complex( +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK: func.call @no_address_spaces_callee(%{{[0-9]+}}, 
%{{[0-9]+}}) +// CHECK-SAME: : (memref<2x2xf32, 1>, memref<4xf32, 1>) + gpu.func @no_address_spaces_complex(%arg0: memref<2x2xf32>, %arg1: memref<4xf32>) kernel { + func.call @no_address_spaces_callee(%arg0, %arg1) : (memref<2x2xf32>, memref<4xf32>) -> () + gpu.return + } +// CHECK-LABEL: func.func @no_address_spaces_callee( +// CHECK-SAME: [[ARG0:%.*]]: memref<2x2xf32, 1> +// CHECK-SAME: [[ARG1:%.*]]: memref<4xf32, 1> +// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: [[I0:%.*]] = llvm.call spir_funccc @_Z12get_group_idj([[C0]]) { +// CHECK-32: [[I1:%.*]] = builtin.unrealized_conversion_cast [[I0]] : i32 to index +// CHECK-64: [[I1:%.*]] = builtin.unrealized_conversion_cast [[I0]] : i64 to index +// CHECK: [[LD:%.*]] = memref.load [[ARG0]]{{\[}}[[I1]], [[I1]]{{\]}} : memref<2x2xf32, 1> +// CHECK: memref.store [[LD]], [[ARG1]]{{\[}}[[I1]]{{\]}} : memref<4xf32, 1> + func.func @no_address_spaces_callee(%arg0: memref<2x2xf32>, %arg1: memref<4xf32>) { + %block_id = gpu.block_id x + %0 = memref.load %arg0[%block_id, %block_id] : memref<2x2xf32> + memref.store %0, %arg1[%block_id] : memref<4xf32> + func.return + } +} + +// ----- + // Lowering of subgroup query operations // CHECK-DAG: llvm.func spir_funccc @_Z18get_sub_group_size() -> i32 attributes {no_unwind, will_return} From 55d51dd9dca8220ffaf9260d56dae9f5c34b7120 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 10 Oct 2024 14:10:38 +0200 Subject: [PATCH 007/177] [clang][bytecode] Fix temporary lvalue base expression (#111808) We need to use the MaterializeTemporaryExpr here so the checks in ExprConstant.cpp do the right thing. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 9 +++++---- clang/lib/AST/ByteCode/Compiler.h | 3 ++- clang/test/AST/ByteCode/cxx1z.cpp | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 clang/test/AST/ByteCode/cxx1z.cpp diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index fe44238ea11869..ba4c5600d613b0 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2728,7 +2728,7 @@ bool Compiler::VisitMaterializeTemporaryExpr( const Expr *Inner = E->getSubExpr()->skipRValueSubobjectAdjustments(); if (std::optional LocalIndex = - allocateLocal(Inner, E->getExtendingDecl())) { + allocateLocal(E, Inner->getType(), E->getExtendingDecl())) { InitLinkScope ILS(this, InitLink::Temp(*LocalIndex)); if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; @@ -4029,7 +4029,8 @@ unsigned Compiler::allocateLocalPrimitive(DeclTy &&Src, PrimType Ty, template std::optional -Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { +Compiler::allocateLocal(DeclTy &&Src, QualType Ty, + const ValueDecl *ExtendingDecl) { // Make sure we don't accidentally register the same decl twice. 
if ([[maybe_unused]] const auto *VD = dyn_cast_if_present(Src.dyn_cast())) { @@ -4037,7 +4038,6 @@ Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { assert(!Locals.contains(VD)); } - QualType Ty; const ValueDecl *Key = nullptr; const Expr *Init = nullptr; bool IsTemporary = false; @@ -4050,7 +4050,8 @@ Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { } if (auto *E = Src.dyn_cast()) { IsTemporary = true; - Ty = E->getType(); + if (Ty.isNull()) + Ty = E->getType(); } Descriptor *D = P.createDescriptor( diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 22e078f3fe546f..4253e7b3248c9f 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -302,7 +302,8 @@ class Compiler : public ConstStmtVisitor, bool>, /// Allocates a space storing a local given its type. std::optional - allocateLocal(DeclTy &&Decl, const ValueDecl *ExtendingDecl = nullptr); + allocateLocal(DeclTy &&Decl, QualType Ty = QualType(), + const ValueDecl *ExtendingDecl = nullptr); unsigned allocateTemporary(const Expr *E); private: diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp new file mode 100644 index 00000000000000..2b5d215f016548 --- /dev/null +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++17 -verify=expected,both %s +// RUN: %clang_cc1 -std=c++17 -verify=ref,both %s + +template struct A {}; +namespace Temp { + struct S { int n; }; + constexpr S &addr(S &&s) { return s; } + A a; // both-error {{reference to temporary object}} + A b; // both-error {{pointer to temporary object}} + A c; // both-error {{reference to subobject of temporary object}} + A d; // both-error {{pointer to subobject of temporary object}} +} From b773da0c5eed06f21f4caeea5eae47cacefb376c Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Thu, 10 Oct 2024 14:21:25 +0200 Subject: [PATCH 008/177] 
[lldb][test] Use $(STRIP) instead of strip in API tests (Darwin-only change) (#111816) This makes tests more portable. Make variables for LLVM utils are passed to `make` on Darwin as well. Co-authored-by: Vladimir Vereschaka --- .../Python/lldbsuite/test/builders/builder.py | 46 +++++++++---------- lldb/test/API/lang/objc/hidden-ivars/Makefile | 4 +- .../API/lang/objc/objc-ivar-stripped/Makefile | 2 +- .../objc/objc-static-method-stripped/Makefile | 2 +- lldb/test/API/macosx/add-dsym/Makefile | 2 +- lldb/test/API/tools/lldb-dap/module/Makefile | 2 +- .../tools/lldb-dap/terminated-event/Makefile | 2 +- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index f813d68e46e82a..d399a5b228c131 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -169,31 +169,31 @@ def getToolchainUtil(util_name): if not os.getenv("LLVM_AR"): utils.extend(["LLVM_AR=%s" % getToolchainUtil("llvm-ar")]) - if not lldbplatformutil.platformIsDarwin(): - if cc_type in ["clang", "cc", "gcc"]: - util_paths = {} - # Assembly a toolchain side tool cmd based on passed CC. - for var, name in util_names.items(): - # Do not override explicity specified tool from the cmd line. - if not os.getenv(var): - util_paths[var] = getToolchainUtil("llvm-" + name) - else: - util_paths[var] = os.getenv(var) - utils.extend(["AR=%s" % util_paths["ARCHIVER"]]) - - # Look for llvm-dwp or gnu dwp - if not lldbutil.which(util_paths["DWP"]): - util_paths["DWP"] = getToolchainUtil("llvm-dwp") - if not lldbutil.which(util_paths["DWP"]): - util_paths["DWP"] = lldbutil.which("llvm-dwp") + if cc_type in ["clang", "cc", "gcc"]: + util_paths = {} + # Assembly a toolchain side tool cmd based on passed CC. + for var, name in util_names.items(): + # Do not override explicity specified tool from the cmd line. 
+ if not os.getenv(var): + util_paths[var] = getToolchainUtil("llvm-" + name) + else: + util_paths[var] = os.getenv(var) + utils.extend(["AR=%s" % util_paths["ARCHIVER"]]) + + # Look for llvm-dwp or gnu dwp + if not lldbutil.which(util_paths["DWP"]): + util_paths["DWP"] = getToolchainUtil("llvm-dwp") + if not lldbutil.which(util_paths["DWP"]): + util_paths["DWP"] = lldbutil.which("llvm-dwp") + if not util_paths["DWP"]: + util_paths["DWP"] = lldbutil.which("dwp") if not util_paths["DWP"]: - util_paths["DWP"] = lldbutil.which("dwp") - if not util_paths["DWP"]: - del util_paths["DWP"] + del util_paths["DWP"] - for var, path in util_paths.items(): - utils.append("%s=%s" % (var, path)) - else: + for var, path in util_paths.items(): + utils.append("%s=%s" % (var, path)) + + if lldbplatformutil.platformIsDarwin(): utils.extend(["AR=%slibtool" % os.getenv("CROSS_COMPILE", "")]) return [ diff --git a/lldb/test/API/lang/objc/hidden-ivars/Makefile b/lldb/test/API/lang/objc/hidden-ivars/Makefile index 283e8a118fb16a..c94c0dee1b9ce9 100644 --- a/lldb/test/API/lang/objc/hidden-ivars/Makefile +++ b/lldb/test/API/lang/objc/hidden-ivars/Makefile @@ -14,8 +14,8 @@ endif stripped: a.out libInternalDefiner.dylib mkdir stripped - strip -Sx a.out -o stripped/a.out - strip -Sx libInternalDefiner.dylib -o stripped/libInternalDefiner.dylib + $(STRIP) -Sx a.out -o stripped/a.out + $(STRIP) -Sx libInternalDefiner.dylib -o stripped/libInternalDefiner.dylib ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - stripped/a.out endif diff --git a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile index 8b63215d6d9da6..eed66d2a965d11 100644 --- a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile +++ b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile @@ -6,7 +6,7 @@ all: a.out.stripped include Makefile.rules a.out.stripped: a.out.dSYM - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - 
a.out.stripped endif diff --git a/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile b/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile index ed312938c9cd11..4936553c56f7c0 100644 --- a/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile +++ b/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile @@ -4,7 +4,7 @@ LD_EXTRAS := -lobjc -framework Foundation default: a.out.stripped a.out.stripped: a.out.dSYM - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ln -sf a.out.dSYM a.out.stripped.dSYM include Makefile.rules diff --git a/lldb/test/API/macosx/add-dsym/Makefile b/lldb/test/API/macosx/add-dsym/Makefile index 4e1ec2202d0b09..b949b308d3acce 100644 --- a/lldb/test/API/macosx/add-dsym/Makefile +++ b/lldb/test/API/macosx/add-dsym/Makefile @@ -8,7 +8,7 @@ hide.app/Contents/a.out.dSYM: mkdir hide.app mkdir hide.app/Contents mv a.out.dSYM hide.app/Contents - strip -x a.out + $(STRIP) -x a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out endif diff --git a/lldb/test/API/tools/lldb-dap/module/Makefile b/lldb/test/API/tools/lldb-dap/module/Makefile index b30baf48b972ef..c7d626a1a7e4c1 100644 --- a/lldb/test/API/tools/lldb-dap/module/Makefile +++ b/lldb/test/API/tools/lldb-dap/module/Makefile @@ -10,7 +10,7 @@ include Makefile.rules all: a.out.stripped a.out.stripped: - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped diff --git a/lldb/test/API/tools/lldb-dap/terminated-event/Makefile b/lldb/test/API/tools/lldb-dap/terminated-event/Makefile index b30baf48b972ef..c7d626a1a7e4c1 100644 --- a/lldb/test/API/tools/lldb-dap/terminated-event/Makefile +++ b/lldb/test/API/tools/lldb-dap/terminated-event/Makefile @@ -10,7 +10,7 @@ include Makefile.rules all: a.out.stripped a.out.stripped: - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped From 36a0d442eb4d2f1e0782bc2a1b1715fc7631faec Mon 
Sep 17 00:00:00 2001 From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com> Date: Thu, 10 Oct 2024 20:45:40 +0800 Subject: [PATCH 009/177] [LLVM][DOCS] Add documentation for 'host' and 'Native' options in LLVM_TARGETS_TO_BUILD. (#111382) From https://github.com/llvm/llvm-project/issues/111356 --- llvm/docs/CMake.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 191230101c4d24..91e34781ef307e 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -847,6 +847,12 @@ enabled sub-projects. Nearly all of these variable names begin with The full list, as of March 2023, is: ``AArch64;AMDGPU;ARM;AVR;BPF;Hexagon;Lanai;LoongArch;Mips;MSP430;NVPTX;PowerPC;RISCV;Sparc;SystemZ;VE;WebAssembly;X86;XCore`` + You can also specify ``host`` or ``Native`` to automatically detect and + include the target corresponding to the host machine's architecture, or + use ``all`` to include all available targets. + For example, on an x86_64 machine, specifying ``-DLLVM_TARGETS_TO_BUILD=host`` + will include the ``X86`` target. + **LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN**:BOOL If enabled, the compiler version check will only warn when using a toolchain which is about to be deprecated, instead of emitting an error. From 1f919aa77805b951fb06b44732a87f1f83929247 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 10 Oct 2024 14:10:41 +0100 Subject: [PATCH 010/177] VectorCombine: lift one-use limitation in foldExtractedCmps (#110902) There are artificial one-use limitations on foldExtractedCmps. Adjust the costs to account for multi-use, and strip the one-use matcher, lifting the limitations. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 25 ++++----- .../VectorCombine/X86/extract-cmp-binop.ll | 54 +++++++++++++++++++ 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 627edb680dfa1e..58145c7e3c5913 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1038,23 +1038,20 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // The compare predicates should match, and each compare should have a // constant operand. - // TODO: Relax the one-use constraints. Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); Instruction *I0, *I1; Constant *C0, *C1; CmpInst::Predicate P0, P1; - if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || - !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || - P0 != P1) + if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) || + !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || P0 != P1) return false; // The compare operands must be extracts of the same vector with constant // extract indexes. - // TODO: Relax the one-use constraints. 
Value *X; uint64_t Index0, Index1; - if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || - !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) + if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) || + !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))) return false; auto *Ext0 = cast(I0); @@ -1073,14 +1070,16 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { return false; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost Ext0Cost = + TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0), + Ext1Cost = + TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); InstructionCost OldCost = - TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); - OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); - OldCost += + Ext0Cost + Ext1Cost + TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred) * - 2; - OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); + 2 + + TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); // The proposed vector pattern is: // vcmp = cmp Pred X, VecC @@ -1096,6 +1095,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { ShufMask); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); + NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost; + NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost; // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. 
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll index 462bb13ae7d12a..be5359f549ac94 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll @@ -92,6 +92,60 @@ define i1 @icmp_add_v8i32(<8 x i32> %a) { ret i1 %r } +declare void @use() + +define i1 @fcmp_and_v2f64_multiuse(<2 x double> %a) { +; SSE-LABEL: @fcmp_and_v2f64_multiuse( +; SSE-NEXT: [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 +; SSE-NEXT: call void @use(double [[E1]]) +; SSE-NEXT: [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1 +; SSE-NEXT: [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01 +; SSE-NEXT: [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00 +; SSE-NEXT: [[R:%.*]] = and i1 [[CMP1]], [[CMP2]] +; SSE-NEXT: call void @use(i1 [[R]]) +; SSE-NEXT: ret i1 [[R]] +; +; AVX-LABEL: @fcmp_and_v2f64_multiuse( +; AVX-NEXT: [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 +; AVX-NEXT: call void @use(double [[E1]]) +; AVX-NEXT: [[TMP1:%.*]] = fcmp olt <2 x double> [[A]], +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 +; AVX-NEXT: call void @use(i1 [[R]]) +; AVX-NEXT: ret i1 [[R]] +; + %e1 = extractelement <2 x double> %a, i32 0 + call void @use(double %e1) + %e2 = extractelement <2 x double> %a, i32 1 + %cmp1 = fcmp olt double %e1, 42.0 + %cmp2 = fcmp olt double %e2, -8.0 + %r = and i1 %cmp1, %cmp2 + call void @use(i1 %r) + ret i1 %r +} + +define i1 @icmp_xor_v4i32_multiuse(<4 x i32> %a) { +; CHECK-LABEL: @icmp_xor_v4i32_multiuse( +; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1 +; CHECK-NEXT: call void @use(i32 [[E2]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[A]], +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x 
i1> [[TMP1]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: call void @use(i1 [[R]]) +; CHECK-NEXT: ret i1 [[R]] +; + %e1 = extractelement <4 x i32> %a, i32 3 + %e2 = extractelement <4 x i32> %a, i32 1 + call void @use(i32 %e2) + %cmp1 = icmp sgt i32 %e1, 42 + %cmp2 = icmp sgt i32 %e2, -8 + %r = xor i1 %cmp1, %cmp2 + call void @use(i1 %r) + ret i1 %r +} + ; Negative test - this could CSE/simplify. define i1 @same_extract_index(<4 x i32> %a) { From 159d694c05500a656775f4cbd6931dae9aab290a Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Thu, 10 Oct 2024 21:14:05 +0800 Subject: [PATCH 011/177] [libc++] __uglify internal member names of iterators in `bitset` (#111127) [template.bitset.general] indicates that `bitset` shouldn't have member typedef-names `iterator` and `const_iterator`. Currently libc++'s typedef-names are causing ambiguity in name lookup, which isn't conforming. As these iterator types are themselves useful, I think we should just use __uglified member typedef-names for them. Fixes #111125 --- libcxx/docs/ReleaseNotes/20.rst | 4 ++ libcxx/include/bitset | 44 +++++++++--------- .../nonstdmem.uglified.compile.pass.cpp | 46 +++++++++++++++++++ 3 files changed, 72 insertions(+), 22 deletions(-) create mode 100644 libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index dcb1102d81d641..3a66aecaf57cb2 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -78,6 +78,10 @@ Deprecations and Removals supported as an extension anymore, please migrate any code that uses e.g. ``std::vector`` to be standards conforming. +- Non-conforming member typedefs ``iterator`` and ``const_iterator`` of ``std::bitset`` are removed. Previously, they + were private but could cause ambiguity in name lookup. 
Code that expects such ambiguity will possibly not compile in + LLVM 20. + Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/bitset b/libcxx/include/bitset index ce23d522168c4c..f90ceaab816cca 100644 --- a/libcxx/include/bitset +++ b/libcxx/include/bitset @@ -187,8 +187,8 @@ protected: typedef __bit_reference<__bitset> reference; typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; + typedef __bit_iterator<__bitset, false> __iterator; + typedef __bit_iterator<__bitset, true> __const_iterator; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long __v) _NOEXCEPT; @@ -199,11 +199,11 @@ protected: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t __pos) const _NOEXCEPT { return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t __pos) _NOEXCEPT { - return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __iterator __make_iter(size_t __pos) _NOEXCEPT { + return __iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t __pos) const _NOEXCEPT { - return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __const_iterator __make_iter(size_t __pos) const _NOEXCEPT { + return __const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset& __v) _NOEXCEPT; @@ -335,8 +335,8 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 
void __bitset<_N_words, _Siz template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = std::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); + __const_iterator __e = __make_iter(_Size); + __const_iterator __i = std::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); if (__i != __e) __throw_overflow_error("bitset to_ulong overflow error"); @@ -352,8 +352,8 @@ __bitset<_N_words, _Size>::to_ulong(true_type) const { template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = std::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); + __const_iterator __e = __make_iter(_Size); + __const_iterator __i = std::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); if (__i != __e) __throw_overflow_error("bitset to_ullong overflow error"); @@ -449,8 +449,8 @@ protected: typedef __bit_reference<__bitset> reference; typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; + typedef __bit_iterator<__bitset, false> __iterator; + typedef __bit_iterator<__bitset, true> __const_iterator; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long __v) _NOEXCEPT; @@ -461,11 +461,11 @@ protected: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t __pos) const _NOEXCEPT { return const_reference(&__first_, __storage_type(1) << __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t __pos) _NOEXCEPT { - return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX23 __iterator __make_iter(size_t __pos) _NOEXCEPT { + return __iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t __pos) const _NOEXCEPT { - return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __const_iterator __make_iter(size_t __pos) const _NOEXCEPT { + return __const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset& __v) _NOEXCEPT; @@ -564,8 +564,8 @@ protected: typedef __bit_reference<__bitset> reference; typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; + typedef __bit_iterator<__bitset, false> __iterator; + typedef __bit_iterator<__bitset, true> __const_iterator; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long) _NOEXCEPT; @@ -576,11 +576,11 @@ protected: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t) const _NOEXCEPT { return const_reference(nullptr, 1); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t) _NOEXCEPT { - return iterator(nullptr, 0); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __iterator __make_iter(size_t) _NOEXCEPT { + return __iterator(nullptr, 0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t) const _NOEXCEPT { - return const_iterator(nullptr, 0); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __const_iterator __make_iter(size_t) const _NOEXCEPT { + return __const_iterator(nullptr, 0); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset&) _NOEXCEPT {} diff 
--git a/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp new file mode 100644 index 00000000000000..c9dd923d7130f5 --- /dev/null +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// This test ensures that we don't use a non-uglified name 'iterator' and +// 'const_iterator' in the implementation of bitset. +// +// See https://github.com/llvm/llvm-project/issues/111125. + +#include +#include +#include + +struct my_base { + typedef int* iterator; + typedef const int* const_iterator; +}; + +template +struct my_derived : my_base, std::bitset {}; + +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); + +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const 
int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); From 90149204bd08c07eb672cd5b19d782fed3d96ddc Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 10 Oct 2024 14:26:46 +0100 Subject: [PATCH 012/177] [ci] Don't add check-all target when pstl project is enabled (#111803) Fixes #110265 Adding check-all causes us to run some tests twice if a project specific target like check-clang is also added. check-pstl is an alternative but as far as I can tell, check-all does not include this so we have not been running the tests in CI anyway. When I tried to run check-pstl locally I got a lot of compiler errors but have not found any instructions on how to setup a correct build environment. Even if such instructions exist, it's probably more than we want to do in CI. According to Louis Dionne, the project is probably not active. So if it's ever revived it'll be up to the new contributors to enable testing. --- .ci/generate-buildkite-pipeline-premerge | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index 53a43070bf1ca3..7676ff716c4185 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -191,6 +191,9 @@ function keep-modified-projects() { } function check-targets() { + # Do not use "check-all" here because if there is "check-all" plus a + # project specific target like "check-clang", that project's tests + # will be run twice. projects=${@} for project in ${projects}; do case ${project} in @@ -216,7 +219,7 @@ function check-targets() { echo "check-lldb" ;; pstl) - echo "check-all" + # Currently we do not run pstl tests in CI. 
;; libclc) # Currently there is no testing for libclc. From 480e7f0667794822f7f3a065bed73d9a2ecc2d58 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 10 Oct 2024 15:37:19 +0200 Subject: [PATCH 013/177] [flang] correctly deal with bind(c) derived type result ABI (#111678) Derived type results of BIND(C) function should be returned according the the C ABI for returning the related C struct type. This currently did not happen since the abstract-result pass was forcing the Fortran ABI for all derived type results. use the bind_c attribute that was added on call/func/dispatch in FIR to prevent such rewrite in the abstract result pass, and update the target-rewrite pass to deal with the struct return ABI. So far, the target specific part of the target-rewrite is only implemented for X86-64 according to the "System V Application Binary Interface AMD64 v1", the other targets will hit a TODO, just like for BIND(C), VALUE derived type arguments. This intends to deal with https://github.com/llvm/llvm-project/issues/102113. 
--- .../include/flang/Optimizer/CodeGen/Target.h | 5 + .../flang/Optimizer/Dialect/FIROpsSupport.h | 21 +++ flang/lib/Optimizer/CodeGen/Target.cpp | 68 ++++++++- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 137 ++++++++++++++---- .../Optimizer/Transforms/AbstractResult.cpp | 65 ++++++++- flang/test/Fir/abstract-results-bindc.fir | 43 ++++++ flang/test/Fir/struct-return-x86-64.fir | 120 +++++++++++++++ 7 files changed, 419 insertions(+), 40 deletions(-) create mode 100644 flang/test/Fir/abstract-results-bindc.fir create mode 100644 flang/test/Fir/struct-return-x86-64.fir diff --git a/flang/include/flang/Optimizer/CodeGen/Target.h b/flang/include/flang/Optimizer/CodeGen/Target.h index a7161152a5c323..3b38583511927a 100644 --- a/flang/include/flang/Optimizer/CodeGen/Target.h +++ b/flang/include/flang/Optimizer/CodeGen/Target.h @@ -126,6 +126,11 @@ class CodeGenSpecifics { structArgumentType(mlir::Location loc, fir::RecordType recTy, const Marshalling &previousArguments) const = 0; + /// Type representation of a `fir.type` type argument when returned by + /// value. Such value may need to be converted to a hidden reference argument. + virtual Marshalling structReturnType(mlir::Location loc, + fir::RecordType eleTy) const = 0; + /// Type representation of a `boxchar` type argument when passed by value. /// An argument value may need to be passed as a (safe) reference argument. 
/// diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h index cdbefdb2341485..fb7b1d16f62f3a 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h +++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h @@ -177,6 +177,27 @@ inline mlir::NamedAttribute getAdaptToByRefAttr(Builder &builder) { } bool isDummyArgument(mlir::Value v); + +template +inline bool hasProcedureAttr(fir::FortranProcedureFlagsEnumAttr flags) { + return flags && bitEnumContainsAny(flags.getValue(), Flag); +} + +template +inline bool hasProcedureAttr(mlir::Operation *op) { + if (auto firCallOp = mlir::dyn_cast(op)) + return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); + if (auto firCallOp = mlir::dyn_cast(op)) + return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); + return hasProcedureAttr( + op->getAttrOfType( + getFortranProcedureFlagsAttrName())); +} + +inline bool hasBindcAttr(mlir::Operation *op) { + return hasProcedureAttr(op); +} + } // namespace fir #endif // FORTRAN_OPTIMIZER_DIALECT_FIROPSSUPPORT_H diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index a12b59413f4456..6c148dffb0e55a 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -100,6 +100,11 @@ struct GenericTarget : public CodeGenSpecifics { TODO(loc, "passing VALUE BIND(C) derived type for this target"); } + CodeGenSpecifics::Marshalling + structReturnType(mlir::Location loc, fir::RecordType ty) const override { + TODO(loc, "returning BIND(C) derived type for this target"); + } + CodeGenSpecifics::Marshalling integerArgumentType(mlir::Location loc, mlir::IntegerType argTy) const override { @@ -533,7 +538,8 @@ struct TargetX86_64 : public GenericTarget { /// When \p recTy is a one field record type that can be passed /// like the field on its own, returns the field type. Returns /// a null type otherwise. 
- mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy) const { + mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy, + bool allowComplex = false) const { auto typeList = recTy.getTypeList(); if (typeList.size() != 1) return {}; @@ -541,6 +547,8 @@ struct TargetX86_64 : public GenericTarget { if (mlir::isa( fieldType)) return fieldType; + if (allowComplex && mlir::isa(fieldType)) + return fieldType; if (mlir::isa(fieldType)) { // Only CHARACTER(1) are expected in BIND(C) contexts, which is the only // contexts where derived type may be passed in registers. @@ -593,7 +601,7 @@ struct TargetX86_64 : public GenericTarget { postMerge(byteOffset, Lo, Hi); if (Lo == ArgClass::Memory || Lo == ArgClass::X87 || Lo == ArgClass::ComplexX87) - return passOnTheStack(loc, recTy); + return passOnTheStack(loc, recTy, /*isResult=*/false); int neededIntRegisters = 0; int neededSSERegisters = 0; if (Lo == ArgClass::SSE) @@ -609,7 +617,7 @@ struct TargetX86_64 : public GenericTarget { // all in registers or all on the stack). if (!hasEnoughRegisters(loc, neededIntRegisters, neededSSERegisters, previousArguments)) - return passOnTheStack(loc, recTy); + return passOnTheStack(loc, recTy, /*isResult=*/false); if (auto fieldType = passAsFieldIfOneFieldStruct(recTy)) { CodeGenSpecifics::Marshalling marshal; @@ -641,9 +649,57 @@ struct TargetX86_64 : public GenericTarget { return marshal; } + CodeGenSpecifics::Marshalling + structReturnType(mlir::Location loc, fir::RecordType recTy) const override { + std::uint64_t byteOffset = 0; + ArgClass Lo, Hi; + Lo = Hi = ArgClass::NoClass; + byteOffset = classifyStruct(loc, recTy, byteOffset, Lo, Hi); + mlir::MLIRContext *context = recTy.getContext(); + postMerge(byteOffset, Lo, Hi); + if (Lo == ArgClass::Memory) + return passOnTheStack(loc, recTy, /*isResult=*/true); + + // Note that X87/ComplexX87 are passed in memory, but returned via %st0 + // %st1 registers. 
Here, they are returned as fp80 or {fp80, fp80} by + // passAsFieldIfOneFieldStruct, and LLVM will use the expected registers. + + // Note that {_Complex long double} is not 100% clear from an ABI + // perspective because the aggregate post merger rules say it should be + // passed in memory because it is bigger than 2 eight bytes. This has the + // funny effect of + // {_Complex long double} return to be dealt with differently than + // _Complex long double. + + if (auto fieldType = + passAsFieldIfOneFieldStruct(recTy, /*allowComplex=*/true)) { + if (auto complexType = mlir::dyn_cast(fieldType)) + return complexReturnType(loc, complexType.getElementType()); + CodeGenSpecifics::Marshalling marshal; + marshal.emplace_back(fieldType, AT{}); + return marshal; + } + + if (Hi == ArgClass::NoClass || Hi == ArgClass::SSEUp) { + // Return a single integer or floating point argument. + mlir::Type lowType = pickLLVMArgType(loc, context, Lo, byteOffset); + CodeGenSpecifics::Marshalling marshal; + marshal.emplace_back(lowType, AT{}); + return marshal; + } + // Will be returned in two different registers. Generate {lowTy, HiTy} for + // the LLVM IR result type. + CodeGenSpecifics::Marshalling marshal; + mlir::Type lowType = pickLLVMArgType(loc, context, Lo, 8u); + mlir::Type hiType = pickLLVMArgType(loc, context, Hi, byteOffset - 8u); + marshal.emplace_back(mlir::TupleType::get(context, {lowType, hiType}), + AT{}); + return marshal; + } + /// Marshal an argument that must be passed on the stack. 
- CodeGenSpecifics::Marshalling passOnTheStack(mlir::Location loc, - mlir::Type ty) const { + CodeGenSpecifics::Marshalling + passOnTheStack(mlir::Location loc, mlir::Type ty, bool isResult) const { CodeGenSpecifics::Marshalling marshal; auto sizeAndAlign = fir::getTypeSizeAndAlignmentOrCrash(loc, ty, getDataLayout(), kindMap); @@ -651,7 +707,7 @@ struct TargetX86_64 : public GenericTarget { unsigned short align = std::max(sizeAndAlign.second, static_cast(8)); marshal.emplace_back(fir::ReferenceType::get(ty), - AT{align, /*byval=*/true, /*sret=*/false}); + AT{align, /*byval=*/!isResult, /*sret=*/isResult}); return marshal; } }; diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index fd56fd6bf50f44..04a3ea684642c8 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -142,20 +142,16 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { mlir::ModuleOp getModule() { return getOperation(); } - template + template std::optional> - rewriteCallComplexResultType( - mlir::Location loc, A ty, B &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, C &newOpers, - mlir::Value &savedStackPtr) { - if (noComplexConversion) { - newResTys.push_back(ty); - return std::nullopt; - } - auto m = specifics->complexReturnType(loc, ty.getElementType()); - // Currently targets mandate COMPLEX is a single aggregate or packed - // scalar, including the sret case. - assert(m.size() == 1 && "target of complex return not supported"); + rewriteCallResultType(mlir::Location loc, mlir::Type originalResTy, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + Callback &newOpers, mlir::Value &savedStackPtr, + fir::CodeGenSpecifics::Marshalling &m) { + // Currently, targets mandate COMPLEX or STRUCT is a single aggregate or + // packed scalar, including the sret case. 
+ assert(m.size() == 1 && "return type not supported on this target"); auto resTy = std::get(m[0]); auto attr = std::get(m[0]); if (attr.isSRet()) { @@ -170,7 +166,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs.push_back(m[0]); newOpers.push_back(stack); return [=](mlir::Operation *) -> mlir::Value { - auto memTy = fir::ReferenceType::get(ty); + auto memTy = fir::ReferenceType::get(originalResTy); auto cast = rewriter->create(loc, memTy, stack); return rewriter->create(loc, cast); }; @@ -180,11 +176,41 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { // We are going to generate an alloca, so save the stack pointer. if (!savedStackPtr) savedStackPtr = genStackSave(loc); - return this->convertValueInMemory(loc, call->getResult(0), ty, + return this->convertValueInMemory(loc, call->getResult(0), originalResTy, /*inputMayBeBigger=*/true); }; } + template + std::optional> + rewriteCallComplexResultType( + mlir::Location loc, mlir::ComplexType ty, Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, + mlir::Value &savedStackPtr) { + if (noComplexConversion) { + newResTys.push_back(ty); + return std::nullopt; + } + auto m = specifics->complexReturnType(loc, ty.getElementType()); + return rewriteCallResultType(loc, ty, newResTys, newInTyAndAttrs, newOpers, + savedStackPtr, m); + } + + template + std::optional> + rewriteCallStructResultType( + mlir::Location loc, fir::RecordType recTy, Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, + mlir::Value &savedStackPtr) { + if (noStructConversion) { + newResTys.push_back(recTy); + return std::nullopt; + } + auto m = specifics->structReturnType(loc, recTy); + return rewriteCallResultType(loc, recTy, newResTys, newInTyAndAttrs, + newOpers, savedStackPtr, m); + } + void passArgumentOnStackOrWithNewType( mlir::Location loc, fir::CodeGenSpecifics::TypeAndAttr newTypeAndAttr, mlir::Type oldType, 
mlir::Value oper, @@ -356,6 +382,11 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs, newOpers, savedStackPtr); }) + .template Case([&](fir::RecordType recTy) { + wrap = rewriteCallStructResultType(loc, recTy, newResTys, + newInTyAndAttrs, newOpers, + savedStackPtr); + }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } else if (fnTy.getResults().size() > 1) { TODO(loc, "multiple results not supported yet"); @@ -562,6 +593,24 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } } + template + void + lowerStructSignatureRes(mlir::Location loc, fir::RecordType recTy, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { + if (noComplexConversion) { + newResTys.push_back(recTy); + return; + } else { + for (auto &tup : specifics->structReturnType(loc, recTy)) { + if (std::get(tup).isSRet()) + newInTyAndAttrs.push_back(tup); + else + newResTys.push_back(std::get(tup)); + } + } + } + void lowerStructSignatureArg(mlir::Location loc, fir::RecordType recTy, fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { @@ -595,6 +644,9 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { .Case([&](mlir::ComplexType ty) { lowerComplexSignatureRes(loc, ty, newResTys, newInTyAndAttrs); }) + .Case([&](fir::RecordType ty) { + lowerStructSignatureRes(loc, ty, newResTys, newInTyAndAttrs); + }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } llvm::SmallVector trailingInTys; @@ -696,7 +748,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { for (auto ty : func.getResults()) if ((mlir::isa(ty) && !noCharacterConversion) || (fir::isa_complex(ty) && !noComplexConversion) || - (mlir::isa(ty) && hasCCallingConv)) { + (mlir::isa(ty) && hasCCallingConv) || + (mlir::isa(ty) && !noStructConversion)) { LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n"); return false; } @@ -770,6 +823,9 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { 
rewriter->getUnitAttr())); newResTys.push_back(retTy); }) + .Case([&](fir::RecordType recTy) { + doStructReturn(func, recTy, newResTys, newInTyAndAttrs, fixups); + }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); // Saved potential shift in argument. Handling of result can add arguments @@ -1062,21 +1118,12 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { return false; } - /// Convert a complex return value. This can involve converting the return - /// value to a "hidden" first argument or packing the complex into a wide - /// GPR. template - void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups) { - if (noComplexConversion) { - newResTys.push_back(cmplx); - return; - } - auto m = - specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); - assert(m.size() == 1); + void doReturn(mlir::func::FuncOp func, Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups, fir::CodeGenSpecifics::Marshalling &m) { + assert(m.size() == 1 && + "expect result to be turned into single argument or result so far"); auto &tup = m[0]; auto attr = std::get(tup); auto argTy = std::get(tup); @@ -1117,6 +1164,36 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newResTys.push_back(argTy); } + /// Convert a complex return value. This can involve converting the return + /// value to a "hidden" first argument or packing the complex into a wide + /// GPR. 
+ template + void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups) { + if (noComplexConversion) { + newResTys.push_back(cmplx); + return; + } + auto m = + specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); + doReturn(func, newResTys, newInTyAndAttrs, fixups, m); + } + + template + void doStructReturn(mlir::func::FuncOp func, fir::RecordType recTy, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups) { + if (noStructConversion) { + newResTys.push_back(recTy); + return; + } + auto m = specifics->structReturnType(func.getLoc(), recTy); + doReturn(func, newResTys, newInTyAndAttrs, fixups, m); + } + template void createFuncOpArgFixups(mlir::func::FuncOp func, diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp index 7299ff80121e13..c0ec820d87ed44 100644 --- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -32,6 +32,33 @@ using namespace mlir; namespace fir { namespace { +// Helper to only build the symbol table if needed because its build time is +// linear on the number of symbols in the module. 
+struct LazySymbolTable { + LazySymbolTable(mlir::Operation *op) + : module{op->getParentOfType()} {} + void build() { + if (table) + return; + table = std::make_unique(module); + } + + template + T lookup(llvm::StringRef name) { + build(); + return table->lookup(name); + } + +private: + std::unique_ptr table; + mlir::ModuleOp module; +}; + +bool hasScalarDerivedResult(mlir::FunctionType funTy) { + return funTy.getNumResults() == 1 && + mlir::isa(funTy.getResult(0)); +} + static mlir::Type getResultArgumentType(mlir::Type resultType, bool shouldBoxResult) { return llvm::TypeSwitch(resultType) @@ -190,7 +217,14 @@ class SaveResultOpConversion llvm::LogicalResult matchAndRewrite(fir::SaveResultOp op, mlir::PatternRewriter &rewriter) const override { - rewriter.eraseOp(op); + mlir::Operation *call = op.getValue().getDefiningOp(); + if (mlir::isa(op.getValue().getType()) && call && + fir::hasBindcAttr(call)) { + rewriter.replaceOpWithNewOp(op, op.getValue(), + op.getMemref()); + } else { + rewriter.eraseOp(op); + } return mlir::success(); } }; @@ -300,6 +334,12 @@ class AbstractResultOpt auto *context = &getContext(); // Convert function type itself if it has an abstract result. auto funcTy = mlir::cast(func.getFunctionType()); + // Scalar derived result of BIND(C) function must be returned according + // to the C struct return ABI which is target dependent and implemented in + // the target-rewrite pass. 
+ if (hasScalarDerivedResult(funcTy) && + fir::hasBindcAttr(func.getOperation())) + return; if (hasAbstractResult(funcTy)) { if (fir::isa_builtin_cptr_type(funcTy.getResult(0))) { func.setType(getCPtrFunctionType(funcTy)); @@ -395,6 +435,8 @@ class AbstractResultOpt return; } + LazySymbolTable symbolTable(op); + mlir::RewritePatternSet patterns(context); mlir::ConversionTarget target = *context; const bool shouldBoxResult = this->passResultAsBox.getValue(); @@ -409,14 +451,29 @@ class AbstractResultOpt mlir::func::FuncDialect>(); target.addIllegalOp(); target.addDynamicallyLegalOp([](fir::CallOp call) { - return !hasAbstractResult(call.getFunctionType()); + mlir::FunctionType funTy = call.getFunctionType(); + if (hasScalarDerivedResult(funTy) && + fir::hasBindcAttr(call.getOperation())) + return true; + return !hasAbstractResult(funTy); }); - target.addDynamicallyLegalOp([](fir::AddrOfOp addrOf) { - if (auto funTy = mlir::dyn_cast(addrOf.getType())) + target.addDynamicallyLegalOp([&symbolTable]( + fir::AddrOfOp addrOf) { + if (auto funTy = mlir::dyn_cast(addrOf.getType())) { + if (hasScalarDerivedResult(funTy)) { + auto func = symbolTable.lookup( + addrOf.getSymbol().getRootReference().getValue()); + return func && fir::hasBindcAttr(func.getOperation()); + } return !hasAbstractResult(funTy); + } return true; }); target.addDynamicallyLegalOp([](fir::DispatchOp dispatch) { + mlir::FunctionType funTy = dispatch.getFunctionType(); + if (hasScalarDerivedResult(funTy) && + fir::hasBindcAttr(dispatch.getOperation())) + return true; return !hasAbstractResult(dispatch.getFunctionType()); }); diff --git a/flang/test/Fir/abstract-results-bindc.fir b/flang/test/Fir/abstract-results-bindc.fir new file mode 100644 index 00000000000000..9b26730f7d2923 --- /dev/null +++ b/flang/test/Fir/abstract-results-bindc.fir @@ -0,0 +1,43 @@ +// Test that bind_c derived type results are not moved to a hidden argument +// by the abstract-result pass. 
They will be dealt with according to the C +// struct returning ABI for the target in the target-rewrite pass. +// RUN: fir-opt %s --abstract-result | FileCheck %s + +!t = !fir.type + +func.func private @foo() -> !t attributes {fir.proc_attrs = #fir.proc_attrs} + +func.func @test_call(%x: !fir.ref) { + %0 = fir.call @foo() proc_attrs : () -> !t + fir.save_result %0 to %x : !t, !fir.ref + return +} + +func.func @test_addr_of() -> (() -> !t) { + %0 = fir.address_of(@foo) : () -> !t + return %0 : () -> !t +} + +func.func @test_dispatch(%x: !fir.ref, %y : !fir.class>) { + %0 = fir.dispatch "bar"(%y : !fir.class>) (%y : !fir.class>) -> !t proc_attrs {pass_arg_pos = 0 : i32} + fir.save_result %0 to %x : !t, !fir.ref + return +} + +// CHECK-LABEL: func.func @test_call( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { +// CHECK: %[[VAL_1:.*]] = fir.call @foo() proc_attrs : () -> !fir.type +// CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } +// CHECK-LABEL: func.func @test_addr_of() -> (() -> !fir.type) { +// CHECK: %[[VAL_0:.*]] = fir.address_of(@foo) : () -> !fir.type +// CHECK: return %[[VAL_0]] : () -> !fir.type +// CHECK: } +// CHECK-LABEL: func.func @test_dispatch( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { +// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> !fir.type proc_attrs {pass_arg_pos = 0 : i32} +// CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } diff --git a/flang/test/Fir/struct-return-x86-64.fir b/flang/test/Fir/struct-return-x86-64.fir new file mode 100644 index 00000000000000..f4c2add69ff7e9 --- /dev/null +++ b/flang/test/Fir/struct-return-x86-64.fir @@ -0,0 +1,120 @@ +// Test X86-64 ABI rewrite of struct returned by value (BIND(C), VALUE derived types). 
+// REQUIRES: x86-registered-target +// RUN: fir-opt --target-rewrite %s | FileCheck %s + +!fits_in_reg = !fir.type +!too_big = !fir.type}> + +module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + + func.func private @test_inreg() -> !fits_in_reg + func.func @test_call_inreg(%arg0: !fir.ref) { + %0 = fir.call @test_inreg() : () -> !fits_in_reg + fir.store %0 to %arg0 : !fir.ref + return + } + func.func @test_addr_of_inreg() -> (() -> ()) { + %0 = fir.address_of(@test_inreg) : () -> !fits_in_reg + %1 = fir.convert %0 : (() -> !fits_in_reg) -> (() -> ()) + return %1 : () -> () + } + func.func @test_dispatch_inreg(%arg0: !fir.ref, %arg1: !fir.class>) { + %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !fits_in_reg {pass_arg_pos = 0 : i32} + fir.store %0 to %arg0 : !fir.ref + return + } + + func.func private @test_sret() -> !too_big + func.func @test_call_sret(%arg0: !fir.ref) { + %0 = fir.call @test_sret() : () -> !too_big + fir.store %0 to %arg0 : !fir.ref + return + } + func.func @test_addr_of_sret() -> (() -> ()) { + %0 = fir.address_of(@test_sret) : () -> !too_big + %1 = fir.convert %0 : (() -> !too_big) -> (() -> ()) + return %1 : () -> () + } + func.func @test_dispatch_sret(%arg0: !fir.ref, %arg1: !fir.class>) { + %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !too_big {pass_arg_pos = 0 : i32} + fir.store %0 to %arg0 : !fir.ref + return + } + func.func private @test_fp_80() -> !fir.type + func.func private @test_complex_80() -> !fir.type}> + func.func private @test_two_fp_80() -> !fir.type + func.func private @test_fp128() -> !fir.type +} + +// CHECK-LABEL: func.func private @test_inreg() -> tuple + +// CHECK-LABEL: func.func @test_call_inreg( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { +// CHECK: %[[VAL_1:.*]] = fir.call @test_inreg() : () -> tuple 
+// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_3:.*]] = fir.alloca tuple +// CHECK: fir.store %[[VAL_1]] to %[[VAL_3]] : !fir.ref> +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.ref> +// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref> +// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr +// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @test_addr_of_inreg() -> (() -> ()) { +// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_inreg) : () -> tuple +// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> tuple) -> (() -> ()) +// CHECK: return %[[VAL_1]] : () -> () +// CHECK: } + +// CHECK-LABEL: func.func @test_dispatch_inreg( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { +// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> tuple {pass_arg_pos = 0 : i32} +// CHECK: %[[VAL_3:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_4:.*]] = fir.alloca tuple +// CHECK: fir.store %[[VAL_2]] to %[[VAL_4]] : !fir.ref> +// CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref> +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref> +// CHECK: llvm.intr.stackrestore %[[VAL_3]] : !llvm.ptr +// CHECK: fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } +// CHECK: func.func private @test_sret(!fir.ref}>> {llvm.align = 8 : i32, llvm.sret = !fir.type}>}) + +// CHECK-LABEL: func.func @test_call_sret( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>) { +// CHECK: %[[VAL_1:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_2:.*]] = fir.alloca !fir.type}> +// CHECK: fir.call @test_sret(%[[VAL_2]]) : (!fir.ref}>>) -> () +// CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref}>>) -> !fir.ref}>> +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref}>> +// CHECK: llvm.intr.stackrestore %[[VAL_1]] : !llvm.ptr +// 
CHECK: fir.store %[[VAL_4]] to %[[VAL_0]] : !fir.ref}>> +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @test_addr_of_sret() -> (() -> ()) { +// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_sret) : (!fir.ref}>>) -> () +// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : ((!fir.ref}>>) -> ()) -> (() -> ()) +// CHECK: return %[[VAL_1]] : () -> () +// CHECK: } + +// CHECK-LABEL: func.func @test_dispatch_sret( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { +// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type}> +// CHECK: fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_3]], %[[VAL_1]] : !fir.ref}>>, !fir.class>) {pass_arg_pos = 1 : i32} +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref}>>) -> !fir.ref}>> +// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref}>> +// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr +// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref}>> +// CHECK: return +// CHECK: } + + +// CHECK: func.func private @test_fp_80() -> f80 +// CHECK: func.func private @test_complex_80(!fir.ref}>> {llvm.align = 16 : i32, llvm.sret = !fir.type}>}) +// CHECK: func.func private @test_two_fp_80(!fir.ref> {llvm.align = 16 : i32, llvm.sret = !fir.type}) +// CHECK: func.func private @test_fp128() -> f128 From 6779376ee917279b16e256839d236cfdf8fd9ab9 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 10 Oct 2024 14:38:07 +0100 Subject: [PATCH 014/177] [Dexter] Remove outdated imp dependency (#111833) Fixes: https://github.com/llvm/llvm-project/issues/111815 This patch replaces usage of the python `imp` library, which is deprecated since python3.4 and removed in python3.12, with the `importlib` library. As part of this update the repeated find_module+load_module pattern is moved into a utility function, since the importlib equivalent is much more verbose. 
--- .../dexter/dex/debugger/lldb/LLDB.py | 5 ++--- .../dex/debugger/visualstudio/VisualStudio.py | 8 +++----- .../debuginfo-tests/dexter/dex/tools/Main.py | 6 ++---- .../debuginfo-tests/dexter/dex/tools/help/Tool.py | 8 +++----- .../debuginfo-tests/dexter/dex/utils/Imports.py | 13 +++++++++++++ 5 files changed, 23 insertions(+), 17 deletions(-) create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py index 2307550aca047b..e8bc65cd3fbe88 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py @@ -7,7 +7,6 @@ """Interface for communicating with the LLDB debugger via its python interface. """ -import imp import os import shlex from subprocess import CalledProcessError, check_output, STDOUT @@ -18,6 +17,7 @@ from dex.dextIR import StackFrame, SourceLocation, ProgramState from dex.utils.Exceptions import DebuggerException, LoadDebuggerException from dex.utils.ReturnCode import ReturnCode +from dex.utils.Imports import load_module class LLDB(DebuggerBase): @@ -82,8 +82,7 @@ def _load_interface(self): ) try: - module_info = imp.find_module("lldb", [pythonpath]) - return imp.load_module("lldb", *module_info) + return load_module("lldb", pythonpath) except ImportError as e: msg = str(e) if msg.endswith("not a valid Win32 application."): diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index 17587b3f3e18d6..7cb56ec0c25a76 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -7,7 +7,6 @@ """Interface for communicating with the 
Visual Studio debugger via DTE.""" import abc -import imp import os import sys from enum import IntEnum @@ -19,15 +18,14 @@ from dex.dextIR import FrameIR, LocIR, StepIR, StopReason, ValueIR from dex.dextIR import StackFrame, SourceLocation, ProgramState from dex.utils.Exceptions import Error, LoadDebuggerException +from dex.utils.Imports import load_module from dex.utils.ReturnCode import ReturnCode - def _load_com_module(): try: - module_info = imp.find_module( - "ComInterface", [os.path.join(os.path.dirname(__file__), "windows")] + return load_module( + "ComInterface", os.path.join(os.path.dirname(__file__), "windows") ) - return imp.load_module("ComInterface", *module_info) except ImportError as e: raise LoadDebuggerException(e, sys.exc_info()) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py index b6c146ad784062..512958d20f4bbc 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py @@ -10,7 +10,6 @@ subtool. 
""" -import imp import os import sys @@ -18,6 +17,7 @@ from dex.utils import ExtArgParse as argparse from dex.utils import get_root_directory from dex.utils.Exceptions import Error, ToolArgumentError +from dex.utils.Imports import load_module from dex.utils.Logging import Logger from dex.utils.UnitTests import unit_tests_ok from dex.utils.Version import version @@ -135,9 +135,7 @@ def _import_tool_module(tool_name): tool_name = tool_name.replace("-", "_") tools_directory = get_tools_directory() - module_info = imp.find_module(tool_name, [tools_directory]) - - return imp.load_module(tool_name, *module_info) + return load_module(tool_name, tools_directory) def tool_main(context, tool, args): diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py index 520bf9f59917af..44e0a0e65c4bac 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py @@ -6,10 +6,10 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Help tool.""" -import imp import textwrap from dex.tools import ToolBase, get_tool_names, get_tools_directory, tool_main +from dex.utils.Imports import load_module from dex.utils.ReturnCode import ReturnCode @@ -39,8 +39,7 @@ def _default_text(self): tools_directory = get_tools_directory() for tool_name in sorted(self._visible_tool_names): internal_name = tool_name.replace("-", "_") - module_info = imp.find_module(internal_name, [tools_directory]) - tool_doc = imp.load_module(internal_name, *module_info).Tool.__doc__ + tool_doc = load_module(internal_name, tools_directory).Tool.__doc__ tool_doc = tool_doc.strip() if tool_doc else "" tool_doc = textwrap.fill(" ".join(tool_doc.split()), 80) s += "{}\n{}\n\n".format(tool_name, tool_doc) @@ -53,6 +52,5 @@ def go(self) -> ReturnCode: tool_name = self.context.options.tool.replace("-", "_") tools_directory = get_tools_directory() 
- module_info = imp.find_module(tool_name, [tools_directory]) - module = imp.load_module(tool_name, *module_info) + module = load_module(tool_name, tools_directory) return tool_main(self.context, module.Tool(self.context), ["--help"]) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py new file mode 100644 index 00000000000000..ea052c21a18498 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py @@ -0,0 +1,13 @@ +import importlib +import os +import sys + + +def load_module(name, path): + spec = importlib.util.spec_from_file_location( + name, os.path.join(path, name, "__init__.py") + ) + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module + spec.loader.exec_module(module) + return module From a3cd269fbebecb6971e216a9c29ad8933ad7b0fc Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 10 Oct 2024 21:40:24 +0800 Subject: [PATCH 015/177] [RISCV] Remove {s,u}int_to_fp custom op action for f16/bf16 (#111471) It turns out that {s,u}int_to_fp nodes get their operation action from their operand's type, not the result type, so we don't need to set it for fp16 or bf16. vp_{s,u}int_to_fp uses the result type though so we need to keep it. This also means that we can lower int_to_fp for fixed length bf16 vectors already, so this adds tests for that. The cost model test changes are due to BasicTTIImpl's getCastInstrCost not taking into account that int_to_fp needs its legal type swapped. 
This can be fixed in a later patch, but its worth noting that the affected types in the tests currently crash when lowered anyway (due to them needing split at LMUL > 8) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- .../Analysis/CostModel/RISCV/cast-half.ll | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll | 66 +++++++++- .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll | 124 +++++++++++++++++- 4 files changed, 189 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 01fa418e4dbdf4..230ccd8209e1f2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1071,9 +1071,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::VP_SINT_TO_FP, - ISD::VP_UINT_TO_FP}, - VT, Custom); + setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, @@ -1343,9 +1341,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, - ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, - VT, Custom); + setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, + Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); if (Subtarget.hasStdExtZfhmin()) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll index 84b5486eb2de1c..244c42cc94ba03 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll +++ 
b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll @@ -842,7 +842,7 @@ define void @sitofp() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> @@ -988,7 +988,7 @@ define void @sitofp() { ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> -; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: 
%v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> @@ -1208,7 +1208,7 @@ define void @uitofp() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> @@ -1354,7 +1354,7 @@ define void @uitofp() { ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> -; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index bfcc7017178e31..a4a491989c7f02 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d 
-mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 define void @fp2si_v2f32_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f32_v2i32: @@ -432,6 +432,64 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) { ret void } +define void @fp2si_v2bf16_v2i64(ptr %x, ptr %y) { +; CHECK-LABEL: fp2si_v2bf16_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.rtz.x.f.v v8, v9 +; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x bfloat>, ptr %x + %d = fptosi <2 x bfloat> %a to <2 x i64> + store <2 x i64> %d, ptr %y + ret void +} + +define void @fp2ui_v2bf16_v2i64(ptr %x, ptr %y) { +; CHECK-LABEL: fp2ui_v2bf16_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.rtz.xu.f.v v8, v9 +; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x bfloat>, ptr %x + %d = fptoui <2 x bfloat> %a to <2 x i64> + store <2 x i64> %d, ptr %y + ret void +} + +define <2 x i1> @fp2si_v2bf16_v2i1(<2 x bfloat> %x) { +; CHECK-LABEL: fp2si_v2bf16_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %z = fptosi <2 x bfloat> %x to <2 x i1> + ret <2 x i1> %z +} + +define <2 x i1> @fp2ui_v2bf16_v2i1(<2 x bfloat> %x) { +; CHECK-LABEL: fp2ui_v2bf16_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %z = fptoui <2 x bfloat> %x to <2 x i1> + ret <2 x i1> %z +} + define void @fp2si_v2f16_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f16_v2i64: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index 7333067e9205e0..9cdc9b81c9530a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 define void 
@si2fp_v2i32_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: si2fp_v2i32_v2f32: @@ -418,6 +418,122 @@ define <8 x double> @ui2fp_v8i1_v8f64(<8 x i1> %x) { ret <8 x double> %z } +define void @si2fp_v2i64_v2bf16(ptr %x, ptr %y) { +; CHECK-LABEL: si2fp_v2i64_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.x.w v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %x + %d = sitofp <2 x i64> %a to <2 x bfloat> + store <2 x bfloat> %d, ptr %y + ret void +} + +define void @ui2fp_v2i64_v2bf16(ptr %x, ptr %y) { +; CHECK-LABEL: ui2fp_v2i64_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.xu.w v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %x + %d = uitofp <2 x i64> %a to <2 x bfloat> + store <2 x bfloat> %d, ptr %y + ret void +} + +define <2 x bfloat> @si2fp_v2i1_v2bf16(<2 x i1> %x) { +; CHECK-LABEL: si2fp_v2i1_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 +; CHECK-NEXT: vfwcvt.f.x.v v9, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %z = sitofp <2 x i1> %x to <2 x bfloat> + ret <2 x bfloat> %z +} + +define <2 x bfloat> @ui2fp_v2i1_v2bf16(<2 x i1> %x) { +; CHECK-LABEL: ui2fp_v2i1_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %z = uitofp <2 x i1> %x to <2 x bfloat> + ret <2 x bfloat> %z +} + +define void @si2fp_v8i64_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: 
si2fp_v8i64_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.x.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <8 x i64>, ptr %x + %d = sitofp <8 x i64> %a to <8 x bfloat> + store <8 x bfloat> %d, ptr %y + ret void +} + +define void @ui2fp_v8i64_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: ui2fp_v8i64_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <8 x i64>, ptr %x + %d = uitofp <8 x i64> %a to <8 x bfloat> + store <8 x bfloat> %d, ptr %y + ret void +} + +define <8 x bfloat> @si2fp_v8i1_v8bf16(<8 x i1> %x) { +; CHECK-LABEL: si2fp_v8i1_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 +; CHECK-NEXT: vfwcvt.f.x.v v10, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %z = sitofp <8 x i1> %x to <8 x bfloat> + ret <8 x bfloat> %z +} + +define <8 x bfloat> @ui2fp_v8i1_v8bf16(<8 x i1> %x) { +; CHECK-LABEL: ui2fp_v8i1_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %z = uitofp <8 x i1> %x to <8 x bfloat> + ret <8 x bfloat> %z +} + define void @si2fp_v2i64_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: si2fp_v2i64_v2f16: ; CHECK: # %bb.0: From 0a0f100a70583725428ec317138b09f935a2b9bb Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 10 Oct 2024 15:03:01 +0100 Subject: [PATCH 016/177] Revert "[LTO] Run Argument Promotion before IPSCCP" 
(#111839) Reverts llvm/llvm-project#111163, as this was merged prematurely. --- llvm/lib/Passes/PassBuilderPipelines.cpp | 9 --------- llvm/test/Other/new-pm-lto-defaults.ll | 9 +++------ 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index cdb9431c755bce..8f151a99b11709 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1831,15 +1831,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(PGOIndirectCallPromotion( true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); - // Promoting by-reference arguments to by-value exposes more constants to - // IPSCCP. - MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( - PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true))); - MPM.addPass( - createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); - MPM.addPass( - createModuleToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG))); - // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index 2dd754ecef4d7b..5543472df685b0 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -41,17 +41,14 @@ ; CHECK-O23SZ-NEXT: PGOIndirectCallPromotion ; CHECK-O23SZ-NEXT: Running analysis: ProfileSummaryAnalysis ; CHECK-O23SZ-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +; CHECK-O23SZ-NEXT: Running pass: IPSCCPPass +; CHECK-O23SZ-NEXT: Running analysis: AssumptionAnalysis on foo +; CHECK-O23SZ-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}SCC ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis ; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> -; CHECK-O23SZ-NEXT: Running pass: PostOrderFunctionAttrsPass -; CHECK-O23SZ-NEXT: Running pass: ArgumentPromotionPass -; CHECK-O23SZ-NEXT: Running pass: SROAPass -; CHECK-O23SZ-NEXT: Running analysis: AssumptionAnalysis on foo -; CHECK-O23SZ-NEXT: Running pass: IPSCCPPass -; CHECK-O23SZ-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: AAManager ; CHECK-O-NEXT: Running analysis: BasicAA From dabb0ddbd7a7229855156c61df1d35ad845361ac Mon Sep 17 00:00:00 2001 From: Vladimir Radosavljevic <129192835+vladimirradosavljevic@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:05:42 +0200 Subject: [PATCH 017/177] [MCP] Skip invalidating def constant regs during forward propagation (#111129) Before this patch, redundant COPY couldn't be removed for the following case: ``` %reg1 = COPY %const-reg ... // There is a def of %const-reg %reg2 = COPY killed %reg1 ``` where this can be optimized to: ``` ... 
// There is a def of %const-reg %reg2 = COPY %const-reg ``` This patch allows for such optimization by not invalidating defined constant registers. This is safe, as architectures like AArch64 and RISCV replace a dead definition of a GPR with a zero constant register for certain instructions. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 7 +++++-- .../AArch64/machine-cp-constant-reg.mir | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 8bcc437cbfb865..fb4da2c11cda77 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -886,8 +886,11 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { "MachineCopyPropagation should be run after register allocation!"); if (MO.isDef() && !MO.isEarlyClobber()) { - Defs.push_back(Reg.asMCReg()); - continue; + // Skip invalidating constant registers. + if (!MRI->isConstantPhysReg(Reg)) { + Defs.push_back(Reg.asMCReg()); + continue; + } } else if (MO.readsReg()) ReadRegister(Reg.asMCReg(), MI, MO.isDebug() ? 
DebugUse : RegularUse); } diff --git a/llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir new file mode 100644 index 00000000000000..cad55b9daffafd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir @@ -0,0 +1,19 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: test +body: | + bb.0: + liveins: $w2 + ; CHECK-LABEL: name: test + ; CHECK: liveins: $w2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wzr = SUBSWri killed renamable $w2, 0, 0, implicit-def $nzcv + ; CHECK-NEXT: renamable $w0 = COPY $wzr + ; CHECK-NEXT: RET_ReallyLR implicit killed $w0 + renamable $w1 = COPY $wzr + $wzr = SUBSWri killed renamable $w2, 0, 0, implicit-def $nzcv + renamable $w0 = COPY killed renamable $w1 + RET_ReallyLR implicit killed $w0 +... From 3737a5321901574b3f4b2cf0d798faea5c4a2302 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Thu, 10 Oct 2024 10:08:45 -0400 Subject: [PATCH 018/177] [Coroutines] Support for Custom ABIs (#111755) This change extends the current method for creating ABI object to allow users (plugin libraries) to create custom ABI objects for their needs. This is accomplished by inheriting one of the common ABIs and overriding one or more of the methods to create a custom ABI. To use a custom ABI for a given coroutine the coro.begin.custom.abi intrinsic is used in place of the coro.begin intrinsic. This takes an additional i32 arg that specifies the index of an ABI generator for the custom ABI object in a SmallVector passed to the CoroSplitPass ctor. The detailed changes include: * Add the llvm.coro.begin.custom intrinsic used to specify the index of the custom ABI to use for the given coroutine. * Add constructors to CoroSplit that take a list of generators that create the custom ABI object. 
* Extend the CreateNewABI function used by CoroSplit to return a unique_ptr to an ABI object. * Add has/getCustomABI methods to CoroBeginInst class. * Add a unittest for a custom ABI. See doc update here: https://github.com/llvm/llvm-project/pull/111781 --- .../llvm/Analysis/TargetTransformInfoImpl.h | 1 + llvm/include/llvm/IR/Intrinsics.td | 3 +- llvm/include/llvm/Transforms/Coroutines/ABI.h | 8 +- .../llvm/Transforms/Coroutines/CoroInstr.h | 19 +++- .../llvm/Transforms/Coroutines/CoroSplit.h | 13 ++- .../lib/Transforms/Coroutines/CoroCleanup.cpp | 4 +- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 38 +++++++- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 4 +- .../Transforms/Coroutines/ExtraRematTest.cpp | 87 +++++++++++++++++++ 9 files changed, 164 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 6d3ce93acbe451..3d0140ad7ad7a3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -778,6 +778,7 @@ class TargetTransformInfoImplBase { case Intrinsic::experimental_gc_relocate: case Intrinsic::coro_alloc: case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: case Intrinsic::coro_free: case Intrinsic::coro_end: case Intrinsic::coro_frame: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 20dd921ddbd230..8a0721cf23f538 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1719,7 +1719,8 @@ def int_coro_prepare_async : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_coro_begin : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [WriteOnly>]>; - +def int_coro_begin_custom_abi : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty, llvm_i32_ty], + [WriteOnly>]>; def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly, ReadOnly>, 
diff --git a/llvm/include/llvm/Transforms/Coroutines/ABI.h b/llvm/include/llvm/Transforms/Coroutines/ABI.h index e7568d275c1615..8b83c5308056eb 100644 --- a/llvm/include/llvm/Transforms/Coroutines/ABI.h +++ b/llvm/include/llvm/Transforms/Coroutines/ABI.h @@ -29,7 +29,13 @@ namespace coro { // This interface/API is to provide an object oriented way to implement ABI // functionality. This is intended to replace use of the ABI enum to perform // ABI operations. The ABIs (e.g. Switch, Async, Retcon{Once}) are the common -// ABIs. +// ABIs. However, specific users may need to modify the behavior of these. This +// can be accomplished by inheriting one of the common ABIs and overriding one +// or more of the methods to create a custom ABI. To use a custom ABI for a +// given coroutine the coro.begin.custom.abi intrinsic is used in place of the +// coro.begin intrinsic. This takes an additional i32 arg that specifies the +// index of an ABI generator for the custom ABI object in a SmallVector passed +// to CoroSplitPass ctor. class BaseABI { public: diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h index a329a06bf13891..3aa30bec85c3a5 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h @@ -124,7 +124,8 @@ class AnyCoroIdInst : public IntrinsicInst { IntrinsicInst *getCoroBegin() { for (User *U : users()) if (auto *II = dyn_cast(U)) - if (II->getIntrinsicID() == Intrinsic::coro_begin) + if (II->getIntrinsicID() == Intrinsic::coro_begin || + II->getIntrinsicID() == Intrinsic::coro_begin_custom_abi) return II; llvm_unreachable("no coro.begin associated with coro.id"); } @@ -442,20 +443,30 @@ class CoroFreeInst : public IntrinsicInst { } }; -/// This class represents the llvm.coro.begin instructions. +/// This class represents the llvm.coro.begin or llvm.coro.begin.custom.abi +/// instructions. 
class CoroBeginInst : public IntrinsicInst { - enum { IdArg, MemArg }; + enum { IdArg, MemArg, CustomABIArg }; public: AnyCoroIdInst *getId() const { return cast(getArgOperand(IdArg)); } + bool hasCustomABI() const { + return getIntrinsicID() == Intrinsic::coro_begin_custom_abi; + } + + int getCustomABI() const { + return cast(getArgOperand(CustomABIArg))->getZExtValue(); + } + Value *getMem() const { return getArgOperand(MemArg); } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { - return I->getIntrinsicID() == Intrinsic::coro_begin; + return I->getIntrinsicID() == Intrinsic::coro_begin || + I->getIntrinsicID() == Intrinsic::coro_begin_custom_abi; } static bool classof(const Value *V) { return isa(V) && classof(cast(V)); diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h b/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h index a5fd57f8f9dfab..6c6a982e828050 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h @@ -28,17 +28,26 @@ struct Shape; } // namespace coro struct CoroSplitPass : PassInfoMixin { + using BaseABITy = + std::function(Function &, coro::Shape &)>; CoroSplitPass(bool OptimizeFrame = false); + + CoroSplitPass(SmallVector GenCustomABIs, + bool OptimizeFrame = false); + CoroSplitPass(std::function MaterializableCallback, bool OptimizeFrame = false); + CoroSplitPass(std::function MaterializableCallback, + SmallVector GenCustomABIs, + bool OptimizeFrame = false); + PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR); + static bool isRequired() { return true; } - using BaseABITy = - std::function(Function &, coro::Shape &)>; // Generator for an ABI transformer BaseABITy CreateAndInitABI; diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index dd92b3593af92e..1cda7f93f72a2c 100644 --- 
a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -53,6 +53,7 @@ bool Lowerer::lower(Function &F) { default: continue; case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: II->replaceAllUsesWith(II->getArgOperand(1)); break; case Intrinsic::coro_free: @@ -112,7 +113,8 @@ static bool declaresCoroCleanupIntrinsics(const Module &M) { M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr", "llvm.coro.free", "llvm.coro.id", "llvm.coro.id.retcon", "llvm.coro.id.async", "llvm.coro.id.retcon.once", - "llvm.coro.async.size.replace", "llvm.coro.async.resume"}); + "llvm.coro.async.size.replace", "llvm.coro.async.resume", + "llvm.coro.begin.custom.abi"}); } PreservedAnalyses CoroCleanupPass::run(Module &M, diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index ef1f27118bc14b..88ce331c8cfb64 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -2200,7 +2200,15 @@ static void addPrepareFunction(const Module &M, static std::unique_ptr CreateNewABI(Function &F, coro::Shape &S, - std::function IsMatCallback) { + std::function IsMatCallback, + const SmallVector GenCustomABIs) { + if (S.CoroBegin->hasCustomABI()) { + unsigned CustomABI = S.CoroBegin->getCustomABI(); + if (CustomABI >= GenCustomABIs.size()) + llvm_unreachable("Custom ABI not found amoung those specified"); + return GenCustomABIs[CustomABI](F, S); + } + switch (S.ABI) { case coro::ABI::Switch: return std::unique_ptr( @@ -2221,7 +2229,17 @@ CreateNewABI(Function &F, coro::Shape &S, CoroSplitPass::CoroSplitPass(bool OptimizeFrame) : CreateAndInitABI([](Function &F, coro::Shape &S) { std::unique_ptr ABI = - CreateNewABI(F, S, coro::isTriviallyMaterializable); + CreateNewABI(F, S, coro::isTriviallyMaterializable, {}); + ABI->init(); + return ABI; + }), + OptimizeFrame(OptimizeFrame) {} + +CoroSplitPass::CoroSplitPass( + 
SmallVector GenCustomABIs, bool OptimizeFrame) + : CreateAndInitABI([=](Function &F, coro::Shape &S) { + std::unique_ptr ABI = + CreateNewABI(F, S, coro::isTriviallyMaterializable, GenCustomABIs); ABI->init(); return ABI; }), @@ -2232,7 +2250,21 @@ CoroSplitPass::CoroSplitPass(bool OptimizeFrame) CoroSplitPass::CoroSplitPass(std::function IsMatCallback, bool OptimizeFrame) : CreateAndInitABI([=](Function &F, coro::Shape &S) { - std::unique_ptr ABI = CreateNewABI(F, S, IsMatCallback); + std::unique_ptr ABI = + CreateNewABI(F, S, IsMatCallback, {}); + ABI->init(); + return ABI; + }), + OptimizeFrame(OptimizeFrame) {} + +// For back compatibility, constructor takes a materializable callback and +// creates a generator for an ABI with a modified materializable callback. +CoroSplitPass::CoroSplitPass( + std::function IsMatCallback, + SmallVector GenCustomABIs, bool OptimizeFrame) + : CreateAndInitABI([=](Function &F, coro::Shape &S) { + std::unique_ptr ABI = + CreateNewABI(F, S, IsMatCallback, GenCustomABIs); ABI->init(); return ABI; }), diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index f4d9a7a8aa8569..1c45bcd7f6a837 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -73,6 +73,7 @@ static const char *const CoroIntrinsics[] = { "llvm.coro.await.suspend.handle", "llvm.coro.await.suspend.void", "llvm.coro.begin", + "llvm.coro.begin.custom.abi", "llvm.coro.destroy", "llvm.coro.done", "llvm.coro.end", @@ -247,7 +248,8 @@ void coro::Shape::analyze(Function &F, } break; } - case Intrinsic::coro_begin: { + case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: { auto CB = cast(II); // Ignore coro id's that aren't pre-split. 
diff --git a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp index 1d55889a32d7aa..c3394fdaa940ba 100644 --- a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp +++ b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp @@ -182,4 +182,91 @@ TEST_F(ExtraRematTest, TestCoroRematWithCallback) { CallInst *CI = getCallByName(Resume1, "should.remat"); ASSERT_TRUE(CI); } + +StringRef TextCoroBeginCustomABI = R"( + define ptr @f(i32 %n) presplitcoroutine { + entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %hdl = call ptr @llvm.coro.begin.custom.abi(token %id, ptr %alloc, i32 0) + + %inc1 = add i32 %n, 1 + %val2 = call i32 @should.remat(i32 %inc1) + %sp1 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp1, label %suspend [i8 0, label %resume1 + i8 1, label %cleanup] + resume1: + %inc2 = add i32 %val2, 1 + %sp2 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp1, label %suspend [i8 0, label %resume2 + i8 1, label %cleanup] + + resume2: + call void @print(i32 %val2) + call void @print(i32 %inc2) + br label %cleanup + + cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + suspend: + call i1 @llvm.coro.end(ptr %hdl, i1 0) + ret ptr %hdl + } + + declare ptr @llvm.coro.free(token, ptr) + declare i32 @llvm.coro.size.i32() + declare i8 @llvm.coro.suspend(token, i1) + declare void @llvm.coro.resume(ptr) + declare void @llvm.coro.destroy(ptr) + + declare token @llvm.coro.id(i32, ptr, ptr, ptr) + declare i1 @llvm.coro.alloc(token) + declare ptr @llvm.coro.begin.custom.abi(token, ptr, i32) + declare i1 @llvm.coro.end(ptr, i1) + + declare i32 @should.remat(i32) + + declare noalias ptr @malloc(i32) + declare void @print(i32) + declare void @free(ptr) + )"; + +// SwitchABI with overridden isMaterializable +class 
ExtraCustomABI : public coro::SwitchABI { +public: + ExtraCustomABI(Function &F, coro::Shape &S) + : coro::SwitchABI(F, S, ExtraMaterializable) {} +}; + +TEST_F(ExtraRematTest, TestCoroRematWithCustomABI) { + ParseAssembly(TextCoroBeginCustomABI); + + ASSERT_TRUE(M); + + CoroSplitPass::BaseABITy GenCustomABI = [](Function &F, coro::Shape &S) { + return std::unique_ptr(new ExtraCustomABI(F, S)); + }; + + CGSCCPassManager CGPM; + CGPM.addPass(CoroSplitPass({GenCustomABI})); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + MPM.run(*M, MAM); + + // Verify that extra rematerializable instruction has been rematerialized + Function *F = M->getFunction("f.resume"); + ASSERT_TRUE(F) << "could not find split function f.resume"; + + BasicBlock *Resume1 = getBasicBlockByName(F, "resume1"); + ASSERT_TRUE(Resume1) + << "could not find expected BB resume1 in split function"; + + // With callback the extra rematerialization of the function should have + // happened + CallInst *CI = getCallByName(Resume1, "should.remat"); + ASSERT_TRUE(CI); +} + } // namespace From 005e601611095f1bed4ca7e6c37c17645e75ca0c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 10 Oct 2024 10:11:49 -0400 Subject: [PATCH 019/177] [gn] port 0e913237871e (LLDB_TEST_MAKE) --- llvm/utils/gn/secondary/lldb/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn index 749fda78b4f171..cb6380882e7cd7 100644 --- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn @@ -68,6 +68,7 @@ write_lit_cfg("lit_api_site_cfg") { "LLDB_TEST_BUILD_DIRECTORY=" + rebase_path("$target_gen_dir/test_build"), "LLDB_TEST_DSYMUTIL=" + rebase_path("$root_build_dir/bin/dsymutil"), "LLDB_TEST_EXECUTABLE=" + rebase_path("$root_build_dir/bin/lldb"), + "LLDB_TEST_MAKE=make", "LLDB_TEST_MODULE_CACHE_CLANG=" + rebase_path( 
"$target_gen_dir/lldb-test-build.noindex/module-cache-clang"), From 545e0593f8c59376a7ec8c6eb558babf6c9f93c1 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 07:22:28 -0700 Subject: [PATCH 020/177] [libc] Clean up 'vasprintf' implementation (#111761) Summary: This had some leftover references to the old namespace and didn't put restrict on it. --- libc/src/stdio/asprintf.cpp | 7 ++++--- libc/src/stdio/asprintf.h | 2 +- libc/src/stdio/printf_core/vasprintf_internal.h | 6 +++--- libc/src/stdio/vasprintf.cpp | 7 ++++--- libc/src/stdio/vasprintf.h | 3 ++- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index 88b458a9e103bf..f8cfb74ce48ea2 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -11,10 +11,11 @@ #include "src/__support/macros/config.h" #include "src/stdio/printf_core/vasprintf_internal.h" -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, asprintf, - (char **__restrict buffer, const char *format, ...)) { + (char **__restrict buffer, const char *__restrict format, + ...)) { va_list vlist; va_start(vlist, format); internal::ArgList args(vlist); // This holder class allows for easier copying @@ -25,4 +26,4 @@ LLVM_LIBC_FUNCTION(int, asprintf, return ret; } -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/asprintf.h b/libc/src/stdio/asprintf.h index 0c0d5a350829e7..222dfdee9d4fd7 100644 --- a/libc/src/stdio/asprintf.h +++ b/libc/src/stdio/asprintf.h @@ -13,7 +13,7 @@ namespace LIBC_NAMESPACE { -int asprintf(char **__restrict s, const char *format, ...); +int asprintf(char **__restrict s, const char *__restrict format, ...); } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 24ebc02a0b33f2..e3448eebd302b7 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ 
b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -13,7 +13,7 @@ #include "src/stdio/printf_core/writer.h" #include // malloc, realloc, free -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { namespace printf_core { LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { @@ -40,7 +40,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE int vasprintf_internal(char **ret, const char *format, +LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer wb(init_buff_on_stack, DEFAULT_BUFFER_SIZE, @@ -64,4 +64,4 @@ LIBC_INLINE int vasprintf_internal(char **ret, const char *format, return ret_val; } } // namespace printf_core -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 7fa4cc6f127dda..4a44d4a0f88426 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -10,14 +10,15 @@ #include "src/__support/arg_list.h" #include "src/stdio/printf_core/vasprintf_internal.h" -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, vasprintf, - (char **__restrict ret, const char *format, va_list vlist)) { + (char **__restrict ret, const char *__restrict format, + va_list vlist)) { internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
return printf_core::vasprintf_internal(ret, format, args); } -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h index 792e948cf1850c..8b286fe69bf203 100644 --- a/libc/src/stdio/vasprintf.h +++ b/libc/src/stdio/vasprintf.h @@ -13,7 +13,8 @@ namespace LIBC_NAMESPACE { -int vasprintf(char **__restrict s, const char *format, va_list vlist); +int vasprintf(char **__restrict s, const char *__restrict format, + va_list vlist); } // namespace LIBC_NAMESPACE From e023d0270eb32cacdc720bbeea262b2869f7e9e4 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Thu, 10 Oct 2024 10:43:26 -0400 Subject: [PATCH 021/177] [AMDGPU][test]update error dasm test for update-mc-test-check script (#111760) The previous error test line is using a 16bit instruction to indicate an error. However this is a poor pick. The 16bit instructions on AMDGPU is under development and thus, some downstream branches are not showing this exact error message. Changing it to another error dasm code. 
--- .../update_mc_test_checks/Inputs/amdgpu_dasm.txt | 2 +- .../update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt index 9f5fba6e50df25..e8338577cfc47c 100644 --- a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt @@ -2,4 +2,4 @@ 0x00,0x00,0x00,0x7e -0xfd,0xb8,0x0a,0x7f +0x00,0x00,0x00,0x01 diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected index 1b64695fc29408..a6f7abcb1774ac 100644 --- a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected @@ -4,5 +4,5 @@ 0x00,0x00,0x00,0x7e # CHECK: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -0xfd,0xb8,0x0a,0x7f +0x00,0x00,0x00,0x01 # CHECK: :[[@LINE-1]]:1: warning: invalid instruction encoding From f59b0c76030aff268b78d475e219708d06b982b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 10 Oct 2024 15:45:52 +0100 Subject: [PATCH 022/177] [mlir][linalg][nfc] Delete references to args_in/args_out (#111517) After the refactor in: * ed229132f1c4ea2ba0644fc345d8279e47a00565, the `args_in` and `args_out` attributes are no longer used by `linalg.generic`. This patch removes most the remaining references. I've left out BufferDeallocationInternals.md, which doesn't seem maintained anymore and is quite out of sync with other bits of MLIR (e.g. `test.generic` instead of `linalg.generic`). 
--- .../Dialect/Bufferization/Transforms/Passes.td | 7 ------- .../lib/Dialect/Linalg/Transforms/DropUnitDims.cpp | 4 ---- mlir/test/Dialect/Linalg/loops.mlir | 14 -------------- mlir/test/Dialect/Linalg/transform-patterns.mlir | 2 -- .../Linalg/vectorization-with-patterns.mlir | 8 -------- 5 files changed, 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index a610ddcc9899ed..a683a905cd2d6b 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -32,8 +32,6 @@ def BufferDeallocation : Pass<"buffer-deallocation", "func::FuncOp"> { ^bb2: %0 = memref.alloc() : memref<2xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): @@ -63,8 +61,6 @@ def BufferDeallocation : Pass<"buffer-deallocation", "func::FuncOp"> { ^bb2: // pred: ^bb0 %1 = memref.alloc() : memref<2xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %1 { ^bb0(%arg3: f32, %arg4: f32): @@ -143,8 +139,6 @@ def OwnershipBasedBufferDeallocation : Pass< ^bb2: %0 = memref.alloc() : memref<2xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} outs(%arg1, %0 : memref<2xf32>, memref<2xf32>) { @@ -179,7 +173,6 @@ def OwnershipBasedBufferDeallocation : Pass< indexing_maps = [#map, #map], iterator_types = ["parallel"]} outs(%arg1, %alloc : memref<2xf32>, memref<2xf32>) - attrs = {args_in = 1 : i64, args_out = 1 : i64} { ^bb0(%out: f32, %out_0: f32): %2 = math.exp %out : f32 linalg.yield %2, %out_0 : f32, f32 diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index 
90ee0fb3bf0b6b..bacc634f5ee554 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -178,8 +178,6 @@ struct MoveInitOperandsToInput : public OpRewritePattern { /// ] /// /// #trait = { -/// args_in = 2, -/// args_out = 1, /// indexing_maps = #accesses, /// iterator_types = ["parallel", "parallel"], /// library_call = "some_external_fn" @@ -210,8 +208,6 @@ struct MoveInitOperandsToInput : public OpRewritePattern { /// ] /// /// #trait = { -/// args_in = 2, -/// args_out = 1, /// indexing_maps = #accesses, /// iterator_types = ["parallel", "parallel"], /// library_call = "some_external_fn" diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir index 6ddbd06389f5eb..6286a11c11a21f 100644 --- a/mlir/test/Dialect/Linalg/loops.mlir +++ b/mlir/test/Dialect/Linalg/loops.mlir @@ -254,8 +254,6 @@ func.func @copy_view(%arg0: memref>, %arg1: memre affine_map<(i, j, k) -> (i, k, j)> ] #trait2 = { - args_in = 1, - args_out = 2, iterator_types = ["parallel", "parallel", "parallel"], indexing_maps = #accesses, library_call = "some_external_function_name_2", @@ -296,8 +294,6 @@ func.func @generic_region(%arg0: memref>, %a // CHECKPARALLEL: store %[[e]], %{{.*}}[%[[i]], %[[k]], %[[j]]] : memref> #trait4 = { - args_in = 1, - args_out = 2, iterator_types = ["parallel", "parallel", "parallel"], indexing_maps = #accesses, library_call = "some_external_function_name_2", @@ -366,8 +362,6 @@ func.func @generic_index_region( ] #trait_broadcast = { - args_in = 1, - args_out = 1, indexing_maps = #broadcast_access, iterator_types = ["parallel", "parallel"], library_call = "some_broadcast_external_fn" @@ -466,8 +460,6 @@ func.func @generic_index_op_zero_rank(%arg0: memref, %arg1: memref<3x4xi32> ] #trait_reduce_1D = { - args_in = 1, - args_out = 1, indexing_maps = #reduce_1D_access, iterator_types = ["reduction"], library_call = "some_reduce_external_fn" @@ -510,8 +502,6 @@ func.func 
@generic_op_1D_reduce(%arg0: memref, %arg1: memref) ] #trait_reduce_init_1D = { - args_in = 2, - args_out = 1, indexing_maps = #reduce_init_1D_access, iterator_types = ["reduction"], library_call = "some_reduce_external_fn" @@ -559,8 +549,6 @@ func.func @generic_index_op_1D_reduce(%arg0: memref, // CHECKPARALLEL: store %[[e]], %[[ARG2]][] #trait_const_fill = { - args_in = 0, - args_out = 1, indexing_maps = [affine_map<(i) -> (i)>], iterator_types = ["parallel"], library_call = "some_external_fn" @@ -591,8 +579,6 @@ func.func @generic_const_init(%arg0: memref) { affine_map<() -> ()> ] #scalar_trait = { - args_in = 2, - args_out = 1, iterator_types = [], indexing_maps = #scalar_access, library_call = "some_external_fn" diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir index 87b7664198dae1..176e55e3e6c4aa 100644 --- a/mlir/test/Dialect/Linalg/transform-patterns.mlir +++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir @@ -118,8 +118,6 @@ module attributes {transform.with_named_sequence} { affine_map<(m, n, k) -> (m, n)> ] #generic_matmul_trait = { - args_in = 2, - args_out = 1, indexing_maps = #matmul_accesses, library_call = "linalg_matmul", iterator_types = ["parallel", "parallel", "reduction"] diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir index e7beb725471123..1c6a786bfa436d 100644 --- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir @@ -83,8 +83,6 @@ module attributes {transform.with_named_sequence} { // ----- #matmul_trait = { - args_in = 2, - args_out = 1, indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, @@ -125,8 +123,6 @@ module attributes {transform.with_named_sequence} { // ----- #matmul_transpose_out_trait = { - args_in = 2, - args_out = 1, indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, 
affine_map<(m, n, k) -> (k, n)>, @@ -196,8 +192,6 @@ module attributes {transform.with_named_sequence} { // ----- #matmul_trait = { - args_in = 2, - args_out = 1, indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, @@ -528,8 +522,6 @@ func.func @generic_vectorize(%arg0: memref<4x256xf32>, // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index %c1_f32 = arith.constant 1.0 : f32 linalg.generic { - args_in = 0 : i64, - args_out = 10 : i64, indexing_maps = [ affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, From 058ede06c4ffd4e3c9f54d947e3bfb027c2d0557 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Thu, 10 Oct 2024 17:14:13 +0200 Subject: [PATCH 023/177] [lldb][test] Use `xcrun -f strip` for API tests on Darwin (#111842) A follow-up for https://github.com/llvm/llvm-project/pull/111816. This is to fix buildbot failure https://lab.llvm.org/staging/#/builders/195/builds/4242. TestSymbolFileJSON.py doesn't pass with llvm-strip on macOS. Apparently, llvm-strip/llvm-objcopy can't clean symbols from Mach-O nlists. 
--- lldb/packages/Python/lldbsuite/test/builders/builder.py | 4 ++++ lldb/test/API/functionalities/json/symbol-file/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index d399a5b228c131..de057324694486 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -10,6 +10,7 @@ import lldbsuite.test.lldbutil as lldbutil from lldbsuite.test import configuration from lldbsuite.test_event import build_exception +from lldbsuite.support import seven class Builder: @@ -190,6 +191,9 @@ def getToolchainUtil(util_name): if not util_paths["DWP"]: del util_paths["DWP"] + if lldbplatformutil.platformIsDarwin(): + util_paths["STRIP"] = seven.get_command_output("xcrun -f strip") + for var, path in util_paths.items(): utils.append("%s=%s" % (var, path)) diff --git a/lldb/test/API/functionalities/json/symbol-file/Makefile b/lldb/test/API/functionalities/json/symbol-file/Makefile index aff841c364299c..13bc164582eeee 100644 --- a/lldb/test/API/functionalities/json/symbol-file/Makefile +++ b/lldb/test/API/functionalities/json/symbol-file/Makefile @@ -3,6 +3,6 @@ C_SOURCES := main.c all: stripped.out stripped.out : a.out - strip a.out -o stripped.out + $(STRIP) a.out -o stripped.out include Makefile.rules From 77c842f44cc06951975fd4a85761e0bc830d185a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:15:48 -0700 Subject: [PATCH 024/177] [clang-apply-replacements] Avoid repeated hash lookups (NFC) (#111783) --- .../lib/Tooling/ApplyReplacements.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp index 9e0da82dfd3806..b895075e4f31cc 100644 --- 
a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp +++ b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp @@ -148,11 +148,8 @@ groupReplacements(const TUReplacements &TUs, const TUDiagnostics &TUDs, if (auto Entry = SM.getFileManager().getOptionalFileRef(Path)) { if (SourceTU) { - auto &Replaces = DiagReplacements[*Entry]; - auto It = Replaces.find(R); - if (It == Replaces.end()) - Replaces.emplace(R, SourceTU); - else if (It->second != SourceTU) + auto [It, Inserted] = DiagReplacements[*Entry].try_emplace(R, SourceTU); + if (!Inserted && It->second != SourceTU) // This replacement is a duplicate of one suggested by another TU. return; } From d2a96d170a4faa0a6c42fe5f23c073891d6118b8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:16:51 -0700 Subject: [PATCH 025/177] [clang-change-namespace] Avoid repeated hash lookups (NFC) (#111784) --- clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp b/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp index 879c0d26d472a8..850df7daf5c038 100644 --- a/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp +++ b/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp @@ -606,9 +606,8 @@ void ChangeNamespaceTool::run( Result.Nodes.getNodeAs("func_ref")) { // If this reference has been processed as a function call, we do not // process it again. 
- if (ProcessedFuncRefs.count(FuncRef)) + if (!ProcessedFuncRefs.insert(FuncRef).second) return; - ProcessedFuncRefs.insert(FuncRef); const auto *Func = Result.Nodes.getNodeAs("func_decl"); assert(Func); const auto *Context = Result.Nodes.getNodeAs("dc"); From 670a4613fc5f29036f23fe357b0dbf017d019717 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:17:25 -0700 Subject: [PATCH 026/177] [clang-tidy] Avoid repeated hash lookups (NFC) (#111785) --- .../clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp index d77df50f8fea24..080454287f28b5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp @@ -146,12 +146,13 @@ void ForwardDeclarationNamespaceCheck::onEndOfTranslationUnit() { } // Check if a definition in another namespace exists. const auto DeclName = CurDecl->getName(); - if (!DeclNameToDefinitions.contains(DeclName)) { + auto It = DeclNameToDefinitions.find(DeclName); + if (It == DeclNameToDefinitions.end()) { continue; // No definition in this translation unit, we can skip it. } // Make a warning for each definition with the same name (in other // namespaces). 
- const auto &Definitions = DeclNameToDefinitions[DeclName]; + const auto &Definitions = It->second; for (const auto *Def : Definitions) { diag(CurDecl->getLocation(), "no definition found for %0, but a definition with " From 35bbfbc7c0d0782bad5160662c9683b38329c7c1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:17:58 -0700 Subject: [PATCH 027/177] [XRay] Simplify code with DenseMap::operator[] (NFC) (#111786) --- llvm/lib/XRay/BlockIndexer.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp index a99a6815f0d16b..f4ba0eb5bda9cf 100644 --- a/llvm/lib/XRay/BlockIndexer.cpp +++ b/llvm/lib/XRay/BlockIndexer.cpp @@ -80,12 +80,9 @@ Error BlockIndexer::visit(FunctionRecord &R) { } Error BlockIndexer::flush() { - Index::iterator It; - std::tie(It, std::ignore) = - Indices.insert({{CurrentBlock.ProcessID, CurrentBlock.ThreadID}, {}}); - It->second.push_back({CurrentBlock.ProcessID, CurrentBlock.ThreadID, - CurrentBlock.WallclockTime, - std::move(CurrentBlock.Records)}); + Indices[{CurrentBlock.ProcessID, CurrentBlock.ThreadID}].push_back( + {CurrentBlock.ProcessID, CurrentBlock.ThreadID, + CurrentBlock.WallclockTime, std::move(CurrentBlock.Records)}); CurrentBlock.ProcessID = 0; CurrentBlock.ThreadID = 0; CurrentBlock.Records = {}; From fc467b477545c9f8ef4dc36ecee4dcd2a7457787 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:18:28 -0700 Subject: [PATCH 028/177] [AMDGPU] Avoid repeated hash lookups (NFC) (#111787) --- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 7e4d9d21a0b397..1b88fdd3ab2e1c 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1647,16 +1647,18 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue 
BuildVector, SDValue Swz[], BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = Swz[i]->getAsZExtVal(); - if (SwizzleRemap.contains(Idx)) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + auto It = SwizzleRemap.find(Idx); + if (It != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(It->second, DL, MVT::i32); } SwizzleRemap.clear(); BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = Swz[i]->getAsZExtVal(); - if (SwizzleRemap.contains(Idx)) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + auto It = SwizzleRemap.find(Idx); + if (It != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(It->second, DL, MVT::i32); } return BuildVector; From 97a43242246bf4a55e68bddf3e6a0500c07803cc Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:22:48 -0700 Subject: [PATCH 029/177] [lld-macho] Fix ICF differentiation of safe_thunks relocs (#111811) In `--icf=safe_thunks` mode, the linker differentiates `keepUnique` functions by creating thunks during a post-processing step after Identical Code Folding (ICF). While this ensures that `keepUnique` functions themselves are not incorrectly merged, it overlooks functions that reference these `keepUnique` symbols. If two functions are identical except for references to different `keepUnique` functions, the current ICF algorithm incorrectly considers them identical because it doesn't account for the future differentiation introduced by thunks. This leads to incorrect deduplication of functions that should remain distinct. To address this issue, we modify the ICF comparison to explicitly check for references to `keepUnique` functions during deduplication. By doing so, functions that reference different `keepUnique` symbols are correctly identified as distinct, preventing erroneous merging and ensuring the correctness of the linked output. 
--- lld/MachO/ICF.cpp | 11 +++++++ lld/test/MachO/icf-safe-thunks.ll | 49 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index 2ff962b06e3679..aedaecfdeb2c01 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -147,6 +147,17 @@ bool ICF::equalsConstant(const ConcatInputSection *ia, isecB = rb.referent.get(); } + // Typically, we should not encounter sections marked with `keepUnique` at + // this point as they would have resulted in different hashes and therefore + // no need for a full comparison. + // However, in `safe_thunks` mode, it's possible for two different + // relocations to reference identical `keepUnique` functions that will be + // distinguished later via thunks - so we need to handle this case + // explicitly. + if ((isecA != isecB) && ((isecA->keepUnique && isCodeSection(isecA)) || + (isecB->keepUnique && isCodeSection(isecB)))) + return false; + if (isecA->parent != isecB->parent) return false; // Sections with identical parents should be of the same kind. 
diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll index 238e90f952e160..95e00a5b98385b 100644 --- a/lld/test/MachO/icf-safe-thunks.ll +++ b/lld/test/MachO/icf-safe-thunks.ll @@ -22,6 +22,13 @@ ; CHECK-ARM64-NEXT: _func_3identical_v3_canmerge: ; CHECK-ARM64-NEXT: mov {{.*}}, #0x21 ; +; CHECK-ARM64: _func_call_thunked_1_nomerge: +; CHECK-ARM64-NEXT: stp x29 +; +; CHECK-ARM64: _func_call_thunked_2_nomerge: +; CHECK-ARM64-NEXT: _func_call_thunked_2_merge: +; CHECK-ARM64-NEXT: stp x29 +; ; CHECK-ARM64: _call_all_funcs: ; CHECK-ARM64-NEXT: stp x29 ; @@ -43,6 +50,9 @@ ; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_3identical_v1_canmerge ; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v2_canmerge ; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v3_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000020 [ 2] _func_call_thunked_1_nomerge +; CHECK-ARM64-MAP-NEXT: 0x00000020 [ 2] _func_call_thunked_2_nomerge +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_call_thunked_2_merge ; CHECK-ARM64-MAP-NEXT: 0x00000034 [ 2] _call_all_funcs ; CHECK-ARM64-MAP-NEXT: 0x00000050 [ 2] _take_func_addr ; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_2identical_v2 @@ -125,6 +135,30 @@ entry: ret void } +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_call_thunked_1_nomerge() local_unnamed_addr #0 { +entry: + tail call void @func_2identical_v1() + store volatile i8 77, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_call_thunked_2_nomerge() local_unnamed_addr #0 { +entry: + tail call void @func_2identical_v2() + store volatile i8 77, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_call_thunked_2_merge() 
local_unnamed_addr #0 { +entry: + tail call void @func_2identical_v2() + store volatile i8 77, ptr @g_val, align 1, !tbaa !5 + ret void +} + ; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) define void @call_all_funcs() local_unnamed_addr #1 { entry: @@ -227,6 +261,21 @@ attributes #1 = { mustprogress nofree noinline norecurse nounwind ssp uwtable(sy ; g_val = 33; ; } ; +; ATTR void func_call_thunked_1_nomerge() { +; func_2identical_v1(); +; g_val = 77; +; } +; +; ATTR void func_call_thunked_2_nomerge() { +; func_2identical_v2(); +; g_val = 77; +; } +; +; ATTR void func_call_thunked_2_merge() { +; func_2identical_v2(); +; g_val = 77; +; } +; ; ATTR void call_all_funcs() { ; func_unique_1(); ; func_unique_2_canmerge(); From 4ddc756bccb34f3d07e30c9ca96bba32cb0cf4f9 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 10 Oct 2024 17:25:57 +0200 Subject: [PATCH 030/177] Revert "[flang] correctly deal with bind(c) derived type result ABI" (#111858) Reverts llvm/llvm-project#111678 Causes ARM failure in test suite. TYPE(C_PTR) result should not regress even if struct ABI no implemented for the target. https://lab.llvm.org/buildbot/#/builders/143/builds/2731 I need to revisit this. 
--- .../include/flang/Optimizer/CodeGen/Target.h | 5 - .../flang/Optimizer/Dialect/FIROpsSupport.h | 21 --- flang/lib/Optimizer/CodeGen/Target.cpp | 68 +-------- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 137 ++++-------------- .../Optimizer/Transforms/AbstractResult.cpp | 65 +-------- flang/test/Fir/abstract-results-bindc.fir | 43 ------ flang/test/Fir/struct-return-x86-64.fir | 120 --------------- 7 files changed, 40 insertions(+), 419 deletions(-) delete mode 100644 flang/test/Fir/abstract-results-bindc.fir delete mode 100644 flang/test/Fir/struct-return-x86-64.fir diff --git a/flang/include/flang/Optimizer/CodeGen/Target.h b/flang/include/flang/Optimizer/CodeGen/Target.h index 3b38583511927a..a7161152a5c323 100644 --- a/flang/include/flang/Optimizer/CodeGen/Target.h +++ b/flang/include/flang/Optimizer/CodeGen/Target.h @@ -126,11 +126,6 @@ class CodeGenSpecifics { structArgumentType(mlir::Location loc, fir::RecordType recTy, const Marshalling &previousArguments) const = 0; - /// Type representation of a `fir.type` type argument when returned by - /// value. Such value may need to be converted to a hidden reference argument. - virtual Marshalling structReturnType(mlir::Location loc, - fir::RecordType eleTy) const = 0; - /// Type representation of a `boxchar` type argument when passed by value. /// An argument value may need to be passed as a (safe) reference argument. 
/// diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h index fb7b1d16f62f3a..cdbefdb2341485 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h +++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h @@ -177,27 +177,6 @@ inline mlir::NamedAttribute getAdaptToByRefAttr(Builder &builder) { } bool isDummyArgument(mlir::Value v); - -template -inline bool hasProcedureAttr(fir::FortranProcedureFlagsEnumAttr flags) { - return flags && bitEnumContainsAny(flags.getValue(), Flag); -} - -template -inline bool hasProcedureAttr(mlir::Operation *op) { - if (auto firCallOp = mlir::dyn_cast(op)) - return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); - if (auto firCallOp = mlir::dyn_cast(op)) - return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); - return hasProcedureAttr( - op->getAttrOfType( - getFortranProcedureFlagsAttrName())); -} - -inline bool hasBindcAttr(mlir::Operation *op) { - return hasProcedureAttr(op); -} - } // namespace fir #endif // FORTRAN_OPTIMIZER_DIALECT_FIROPSSUPPORT_H diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 6c148dffb0e55a..a12b59413f4456 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -100,11 +100,6 @@ struct GenericTarget : public CodeGenSpecifics { TODO(loc, "passing VALUE BIND(C) derived type for this target"); } - CodeGenSpecifics::Marshalling - structReturnType(mlir::Location loc, fir::RecordType ty) const override { - TODO(loc, "returning BIND(C) derived type for this target"); - } - CodeGenSpecifics::Marshalling integerArgumentType(mlir::Location loc, mlir::IntegerType argTy) const override { @@ -538,8 +533,7 @@ struct TargetX86_64 : public GenericTarget { /// When \p recTy is a one field record type that can be passed /// like the field on its own, returns the field type. Returns /// a null type otherwise. 
- mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy, - bool allowComplex = false) const { + mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy) const { auto typeList = recTy.getTypeList(); if (typeList.size() != 1) return {}; @@ -547,8 +541,6 @@ struct TargetX86_64 : public GenericTarget { if (mlir::isa( fieldType)) return fieldType; - if (allowComplex && mlir::isa(fieldType)) - return fieldType; if (mlir::isa(fieldType)) { // Only CHARACTER(1) are expected in BIND(C) contexts, which is the only // contexts where derived type may be passed in registers. @@ -601,7 +593,7 @@ struct TargetX86_64 : public GenericTarget { postMerge(byteOffset, Lo, Hi); if (Lo == ArgClass::Memory || Lo == ArgClass::X87 || Lo == ArgClass::ComplexX87) - return passOnTheStack(loc, recTy, /*isResult=*/false); + return passOnTheStack(loc, recTy); int neededIntRegisters = 0; int neededSSERegisters = 0; if (Lo == ArgClass::SSE) @@ -617,7 +609,7 @@ struct TargetX86_64 : public GenericTarget { // all in registers or all on the stack). if (!hasEnoughRegisters(loc, neededIntRegisters, neededSSERegisters, previousArguments)) - return passOnTheStack(loc, recTy, /*isResult=*/false); + return passOnTheStack(loc, recTy); if (auto fieldType = passAsFieldIfOneFieldStruct(recTy)) { CodeGenSpecifics::Marshalling marshal; @@ -649,57 +641,9 @@ struct TargetX86_64 : public GenericTarget { return marshal; } - CodeGenSpecifics::Marshalling - structReturnType(mlir::Location loc, fir::RecordType recTy) const override { - std::uint64_t byteOffset = 0; - ArgClass Lo, Hi; - Lo = Hi = ArgClass::NoClass; - byteOffset = classifyStruct(loc, recTy, byteOffset, Lo, Hi); - mlir::MLIRContext *context = recTy.getContext(); - postMerge(byteOffset, Lo, Hi); - if (Lo == ArgClass::Memory) - return passOnTheStack(loc, recTy, /*isResult=*/true); - - // Note that X87/ComplexX87 are passed in memory, but returned via %st0 - // %st1 registers. 
Here, they are returned as fp80 or {fp80, fp80} by - // passAsFieldIfOneFieldStruct, and LLVM will use the expected registers. - - // Note that {_Complex long double} is not 100% clear from an ABI - // perspective because the aggregate post merger rules say it should be - // passed in memory because it is bigger than 2 eight bytes. This has the - // funny effect of - // {_Complex long double} return to be dealt with differently than - // _Complex long double. - - if (auto fieldType = - passAsFieldIfOneFieldStruct(recTy, /*allowComplex=*/true)) { - if (auto complexType = mlir::dyn_cast(fieldType)) - return complexReturnType(loc, complexType.getElementType()); - CodeGenSpecifics::Marshalling marshal; - marshal.emplace_back(fieldType, AT{}); - return marshal; - } - - if (Hi == ArgClass::NoClass || Hi == ArgClass::SSEUp) { - // Return a single integer or floating point argument. - mlir::Type lowType = pickLLVMArgType(loc, context, Lo, byteOffset); - CodeGenSpecifics::Marshalling marshal; - marshal.emplace_back(lowType, AT{}); - return marshal; - } - // Will be returned in two different registers. Generate {lowTy, HiTy} for - // the LLVM IR result type. - CodeGenSpecifics::Marshalling marshal; - mlir::Type lowType = pickLLVMArgType(loc, context, Lo, 8u); - mlir::Type hiType = pickLLVMArgType(loc, context, Hi, byteOffset - 8u); - marshal.emplace_back(mlir::TupleType::get(context, {lowType, hiType}), - AT{}); - return marshal; - } - /// Marshal an argument that must be passed on the stack. 
- CodeGenSpecifics::Marshalling - passOnTheStack(mlir::Location loc, mlir::Type ty, bool isResult) const { + CodeGenSpecifics::Marshalling passOnTheStack(mlir::Location loc, + mlir::Type ty) const { CodeGenSpecifics::Marshalling marshal; auto sizeAndAlign = fir::getTypeSizeAndAlignmentOrCrash(loc, ty, getDataLayout(), kindMap); @@ -707,7 +651,7 @@ struct TargetX86_64 : public GenericTarget { unsigned short align = std::max(sizeAndAlign.second, static_cast(8)); marshal.emplace_back(fir::ReferenceType::get(ty), - AT{align, /*byval=*/!isResult, /*sret=*/isResult}); + AT{align, /*byval=*/true, /*sret=*/false}); return marshal; } }; diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index 04a3ea684642c8..fd56fd6bf50f44 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -142,16 +142,20 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { mlir::ModuleOp getModule() { return getOperation(); } - template + template std::optional> - rewriteCallResultType(mlir::Location loc, mlir::Type originalResTy, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - Callback &newOpers, mlir::Value &savedStackPtr, - fir::CodeGenSpecifics::Marshalling &m) { - // Currently, targets mandate COMPLEX or STRUCT is a single aggregate or - // packed scalar, including the sret case. - assert(m.size() == 1 && "return type not supported on this target"); + rewriteCallComplexResultType( + mlir::Location loc, A ty, B &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, C &newOpers, + mlir::Value &savedStackPtr) { + if (noComplexConversion) { + newResTys.push_back(ty); + return std::nullopt; + } + auto m = specifics->complexReturnType(loc, ty.getElementType()); + // Currently targets mandate COMPLEX is a single aggregate or packed + // scalar, including the sret case. 
+ assert(m.size() == 1 && "target of complex return not supported"); auto resTy = std::get(m[0]); auto attr = std::get(m[0]); if (attr.isSRet()) { @@ -166,7 +170,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs.push_back(m[0]); newOpers.push_back(stack); return [=](mlir::Operation *) -> mlir::Value { - auto memTy = fir::ReferenceType::get(originalResTy); + auto memTy = fir::ReferenceType::get(ty); auto cast = rewriter->create(loc, memTy, stack); return rewriter->create(loc, cast); }; @@ -176,41 +180,11 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { // We are going to generate an alloca, so save the stack pointer. if (!savedStackPtr) savedStackPtr = genStackSave(loc); - return this->convertValueInMemory(loc, call->getResult(0), originalResTy, + return this->convertValueInMemory(loc, call->getResult(0), ty, /*inputMayBeBigger=*/true); }; } - template - std::optional> - rewriteCallComplexResultType( - mlir::Location loc, mlir::ComplexType ty, Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, - mlir::Value &savedStackPtr) { - if (noComplexConversion) { - newResTys.push_back(ty); - return std::nullopt; - } - auto m = specifics->complexReturnType(loc, ty.getElementType()); - return rewriteCallResultType(loc, ty, newResTys, newInTyAndAttrs, newOpers, - savedStackPtr, m); - } - - template - std::optional> - rewriteCallStructResultType( - mlir::Location loc, fir::RecordType recTy, Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, - mlir::Value &savedStackPtr) { - if (noStructConversion) { - newResTys.push_back(recTy); - return std::nullopt; - } - auto m = specifics->structReturnType(loc, recTy); - return rewriteCallResultType(loc, recTy, newResTys, newInTyAndAttrs, - newOpers, savedStackPtr, m); - } - void passArgumentOnStackOrWithNewType( mlir::Location loc, fir::CodeGenSpecifics::TypeAndAttr newTypeAndAttr, mlir::Type oldType, mlir::Value 
oper, @@ -382,11 +356,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs, newOpers, savedStackPtr); }) - .template Case([&](fir::RecordType recTy) { - wrap = rewriteCallStructResultType(loc, recTy, newResTys, - newInTyAndAttrs, newOpers, - savedStackPtr); - }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } else if (fnTy.getResults().size() > 1) { TODO(loc, "multiple results not supported yet"); @@ -593,24 +562,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } } - template - void - lowerStructSignatureRes(mlir::Location loc, fir::RecordType recTy, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { - if (noComplexConversion) { - newResTys.push_back(recTy); - return; - } else { - for (auto &tup : specifics->structReturnType(loc, recTy)) { - if (std::get(tup).isSRet()) - newInTyAndAttrs.push_back(tup); - else - newResTys.push_back(std::get(tup)); - } - } - } - void lowerStructSignatureArg(mlir::Location loc, fir::RecordType recTy, fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { @@ -644,9 +595,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { .Case([&](mlir::ComplexType ty) { lowerComplexSignatureRes(loc, ty, newResTys, newInTyAndAttrs); }) - .Case([&](fir::RecordType ty) { - lowerStructSignatureRes(loc, ty, newResTys, newInTyAndAttrs); - }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } llvm::SmallVector trailingInTys; @@ -748,8 +696,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { for (auto ty : func.getResults()) if ((mlir::isa(ty) && !noCharacterConversion) || (fir::isa_complex(ty) && !noComplexConversion) || - (mlir::isa(ty) && hasCCallingConv) || - (mlir::isa(ty) && !noStructConversion)) { + (mlir::isa(ty) && hasCCallingConv)) { LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n"); return false; } @@ -823,9 +770,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { 
rewriter->getUnitAttr())); newResTys.push_back(retTy); }) - .Case([&](fir::RecordType recTy) { - doStructReturn(func, recTy, newResTys, newInTyAndAttrs, fixups); - }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); // Saved potential shift in argument. Handling of result can add arguments @@ -1118,12 +1062,21 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { return false; } + /// Convert a complex return value. This can involve converting the return + /// value to a "hidden" first argument or packing the complex into a wide + /// GPR. template - void doReturn(mlir::func::FuncOp func, Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups, fir::CodeGenSpecifics::Marshalling &m) { - assert(m.size() == 1 && - "expect result to be turned into single argument or result so far"); + void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups) { + if (noComplexConversion) { + newResTys.push_back(cmplx); + return; + } + auto m = + specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); + assert(m.size() == 1); auto &tup = m[0]; auto attr = std::get(tup); auto argTy = std::get(tup); @@ -1164,36 +1117,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newResTys.push_back(argTy); } - /// Convert a complex return value. This can involve converting the return - /// value to a "hidden" first argument or packing the complex into a wide - /// GPR. 
- template - void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups) { - if (noComplexConversion) { - newResTys.push_back(cmplx); - return; - } - auto m = - specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); - doReturn(func, newResTys, newInTyAndAttrs, fixups, m); - } - - template - void doStructReturn(mlir::func::FuncOp func, fir::RecordType recTy, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups) { - if (noStructConversion) { - newResTys.push_back(recTy); - return; - } - auto m = specifics->structReturnType(func.getLoc(), recTy); - doReturn(func, newResTys, newInTyAndAttrs, fixups, m); - } - template void createFuncOpArgFixups(mlir::func::FuncOp func, diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp index c0ec820d87ed44..7299ff80121e13 100644 --- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -32,33 +32,6 @@ using namespace mlir; namespace fir { namespace { -// Helper to only build the symbol table if needed because its build time is -// linear on the number of symbols in the module. 
-struct LazySymbolTable { - LazySymbolTable(mlir::Operation *op) - : module{op->getParentOfType()} {} - void build() { - if (table) - return; - table = std::make_unique(module); - } - - template - T lookup(llvm::StringRef name) { - build(); - return table->lookup(name); - } - -private: - std::unique_ptr table; - mlir::ModuleOp module; -}; - -bool hasScalarDerivedResult(mlir::FunctionType funTy) { - return funTy.getNumResults() == 1 && - mlir::isa(funTy.getResult(0)); -} - static mlir::Type getResultArgumentType(mlir::Type resultType, bool shouldBoxResult) { return llvm::TypeSwitch(resultType) @@ -217,14 +190,7 @@ class SaveResultOpConversion llvm::LogicalResult matchAndRewrite(fir::SaveResultOp op, mlir::PatternRewriter &rewriter) const override { - mlir::Operation *call = op.getValue().getDefiningOp(); - if (mlir::isa(op.getValue().getType()) && call && - fir::hasBindcAttr(call)) { - rewriter.replaceOpWithNewOp(op, op.getValue(), - op.getMemref()); - } else { - rewriter.eraseOp(op); - } + rewriter.eraseOp(op); return mlir::success(); } }; @@ -334,12 +300,6 @@ class AbstractResultOpt auto *context = &getContext(); // Convert function type itself if it has an abstract result. auto funcTy = mlir::cast(func.getFunctionType()); - // Scalar derived result of BIND(C) function must be returned according - // to the C struct return ABI which is target dependent and implemented in - // the target-rewrite pass. 
- if (hasScalarDerivedResult(funcTy) && - fir::hasBindcAttr(func.getOperation())) - return; if (hasAbstractResult(funcTy)) { if (fir::isa_builtin_cptr_type(funcTy.getResult(0))) { func.setType(getCPtrFunctionType(funcTy)); @@ -435,8 +395,6 @@ class AbstractResultOpt return; } - LazySymbolTable symbolTable(op); - mlir::RewritePatternSet patterns(context); mlir::ConversionTarget target = *context; const bool shouldBoxResult = this->passResultAsBox.getValue(); @@ -451,29 +409,14 @@ class AbstractResultOpt mlir::func::FuncDialect>(); target.addIllegalOp(); target.addDynamicallyLegalOp([](fir::CallOp call) { - mlir::FunctionType funTy = call.getFunctionType(); - if (hasScalarDerivedResult(funTy) && - fir::hasBindcAttr(call.getOperation())) - return true; - return !hasAbstractResult(funTy); + return !hasAbstractResult(call.getFunctionType()); }); - target.addDynamicallyLegalOp([&symbolTable]( - fir::AddrOfOp addrOf) { - if (auto funTy = mlir::dyn_cast(addrOf.getType())) { - if (hasScalarDerivedResult(funTy)) { - auto func = symbolTable.lookup( - addrOf.getSymbol().getRootReference().getValue()); - return func && fir::hasBindcAttr(func.getOperation()); - } + target.addDynamicallyLegalOp([](fir::AddrOfOp addrOf) { + if (auto funTy = mlir::dyn_cast(addrOf.getType())) return !hasAbstractResult(funTy); - } return true; }); target.addDynamicallyLegalOp([](fir::DispatchOp dispatch) { - mlir::FunctionType funTy = dispatch.getFunctionType(); - if (hasScalarDerivedResult(funTy) && - fir::hasBindcAttr(dispatch.getOperation())) - return true; return !hasAbstractResult(dispatch.getFunctionType()); }); diff --git a/flang/test/Fir/abstract-results-bindc.fir b/flang/test/Fir/abstract-results-bindc.fir deleted file mode 100644 index 9b26730f7d2923..00000000000000 --- a/flang/test/Fir/abstract-results-bindc.fir +++ /dev/null @@ -1,43 +0,0 @@ -// Test that bind_c derived type results are not moved to a hidden argument -// by the abstract-result pass. 
They will be dealt with according to the C -// struct returning ABI for the target in the target-rewrite pass. -// RUN: fir-opt %s --abstract-result | FileCheck %s - -!t = !fir.type - -func.func private @foo() -> !t attributes {fir.proc_attrs = #fir.proc_attrs} - -func.func @test_call(%x: !fir.ref) { - %0 = fir.call @foo() proc_attrs : () -> !t - fir.save_result %0 to %x : !t, !fir.ref - return -} - -func.func @test_addr_of() -> (() -> !t) { - %0 = fir.address_of(@foo) : () -> !t - return %0 : () -> !t -} - -func.func @test_dispatch(%x: !fir.ref, %y : !fir.class>) { - %0 = fir.dispatch "bar"(%y : !fir.class>) (%y : !fir.class>) -> !t proc_attrs {pass_arg_pos = 0 : i32} - fir.save_result %0 to %x : !t, !fir.ref - return -} - -// CHECK-LABEL: func.func @test_call( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { -// CHECK: %[[VAL_1:.*]] = fir.call @foo() proc_attrs : () -> !fir.type -// CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } -// CHECK-LABEL: func.func @test_addr_of() -> (() -> !fir.type) { -// CHECK: %[[VAL_0:.*]] = fir.address_of(@foo) : () -> !fir.type -// CHECK: return %[[VAL_0]] : () -> !fir.type -// CHECK: } -// CHECK-LABEL: func.func @test_dispatch( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { -// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> !fir.type proc_attrs {pass_arg_pos = 0 : i32} -// CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } diff --git a/flang/test/Fir/struct-return-x86-64.fir b/flang/test/Fir/struct-return-x86-64.fir deleted file mode 100644 index f4c2add69ff7e9..00000000000000 --- a/flang/test/Fir/struct-return-x86-64.fir +++ /dev/null @@ -1,120 +0,0 @@ -// Test X86-64 ABI rewrite of struct returned by value (BIND(C), VALUE derived types). 
-// REQUIRES: x86-registered-target -// RUN: fir-opt --target-rewrite %s | FileCheck %s - -!fits_in_reg = !fir.type -!too_big = !fir.type}> - -module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { - - func.func private @test_inreg() -> !fits_in_reg - func.func @test_call_inreg(%arg0: !fir.ref) { - %0 = fir.call @test_inreg() : () -> !fits_in_reg - fir.store %0 to %arg0 : !fir.ref - return - } - func.func @test_addr_of_inreg() -> (() -> ()) { - %0 = fir.address_of(@test_inreg) : () -> !fits_in_reg - %1 = fir.convert %0 : (() -> !fits_in_reg) -> (() -> ()) - return %1 : () -> () - } - func.func @test_dispatch_inreg(%arg0: !fir.ref, %arg1: !fir.class>) { - %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !fits_in_reg {pass_arg_pos = 0 : i32} - fir.store %0 to %arg0 : !fir.ref - return - } - - func.func private @test_sret() -> !too_big - func.func @test_call_sret(%arg0: !fir.ref) { - %0 = fir.call @test_sret() : () -> !too_big - fir.store %0 to %arg0 : !fir.ref - return - } - func.func @test_addr_of_sret() -> (() -> ()) { - %0 = fir.address_of(@test_sret) : () -> !too_big - %1 = fir.convert %0 : (() -> !too_big) -> (() -> ()) - return %1 : () -> () - } - func.func @test_dispatch_sret(%arg0: !fir.ref, %arg1: !fir.class>) { - %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !too_big {pass_arg_pos = 0 : i32} - fir.store %0 to %arg0 : !fir.ref - return - } - func.func private @test_fp_80() -> !fir.type - func.func private @test_complex_80() -> !fir.type}> - func.func private @test_two_fp_80() -> !fir.type - func.func private @test_fp128() -> !fir.type -} - -// CHECK-LABEL: func.func private @test_inreg() -> tuple - -// CHECK-LABEL: func.func @test_call_inreg( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { -// CHECK: %[[VAL_1:.*]] = fir.call @test_inreg() : () -> tuple 
-// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_3:.*]] = fir.alloca tuple -// CHECK: fir.store %[[VAL_1]] to %[[VAL_3]] : !fir.ref> -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.ref> -// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref> -// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr -// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } - -// CHECK-LABEL: func.func @test_addr_of_inreg() -> (() -> ()) { -// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_inreg) : () -> tuple -// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> tuple) -> (() -> ()) -// CHECK: return %[[VAL_1]] : () -> () -// CHECK: } - -// CHECK-LABEL: func.func @test_dispatch_inreg( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { -// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> tuple {pass_arg_pos = 0 : i32} -// CHECK: %[[VAL_3:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_4:.*]] = fir.alloca tuple -// CHECK: fir.store %[[VAL_2]] to %[[VAL_4]] : !fir.ref> -// CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref> -// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref> -// CHECK: llvm.intr.stackrestore %[[VAL_3]] : !llvm.ptr -// CHECK: fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } -// CHECK: func.func private @test_sret(!fir.ref}>> {llvm.align = 8 : i32, llvm.sret = !fir.type}>}) - -// CHECK-LABEL: func.func @test_call_sret( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>) { -// CHECK: %[[VAL_1:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_2:.*]] = fir.alloca !fir.type}> -// CHECK: fir.call @test_sret(%[[VAL_2]]) : (!fir.ref}>>) -> () -// CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref}>>) -> !fir.ref}>> -// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref}>> -// CHECK: llvm.intr.stackrestore %[[VAL_1]] : !llvm.ptr -// 
CHECK: fir.store %[[VAL_4]] to %[[VAL_0]] : !fir.ref}>> -// CHECK: return -// CHECK: } - -// CHECK-LABEL: func.func @test_addr_of_sret() -> (() -> ()) { -// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_sret) : (!fir.ref}>>) -> () -// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : ((!fir.ref}>>) -> ()) -> (() -> ()) -// CHECK: return %[[VAL_1]] : () -> () -// CHECK: } - -// CHECK-LABEL: func.func @test_dispatch_sret( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { -// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type}> -// CHECK: fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_3]], %[[VAL_1]] : !fir.ref}>>, !fir.class>) {pass_arg_pos = 1 : i32} -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref}>>) -> !fir.ref}>> -// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref}>> -// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr -// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref}>> -// CHECK: return -// CHECK: } - - -// CHECK: func.func private @test_fp_80() -> f80 -// CHECK: func.func private @test_complex_80(!fir.ref}>> {llvm.align = 16 : i32, llvm.sret = !fir.type}>}) -// CHECK: func.func private @test_two_fp_80(!fir.ref> {llvm.align = 16 : i32, llvm.sret = !fir.type}) -// CHECK: func.func private @test_fp128() -> f128 From a3638f19bc04468c6db28a9cca50975229bfd45a Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Thu, 10 Oct 2024 17:31:53 +0200 Subject: [PATCH 031/177] [clang] Update string and string_view in lifetimebound tests (#111737) Removes pragmas like `# 1 "" 1 3` to make line numbers in failing tests more accurate. Use `basic_string_view` instead `string_view` to kick in GSL owner/pointer auto inference. 
--- clang/test/SemaCXX/attr-lifetimebound.cpp | 33 +++++++++++------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index 0fb997a5671085..bdc58171917375 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -75,23 +75,26 @@ namespace usage_ok { } } -# 1 "" 1 3 namespace std { using size_t = __SIZE_TYPE__; - struct string { - string(); - string(const char*); + template + struct basic_string { + basic_string(); + basic_string(const T*); char &operator[](size_t) const [[clang::lifetimebound]]; }; - string operator""s(const char *, size_t); - - struct string_view { - string_view(); - string_view(const char *p [[clang::lifetimebound]]); - string_view(const string &s [[clang::lifetimebound]]); + using string = basic_string; + string operator""s(const char *, size_t); // expected-warning {{user-defined literal suffixes not starting with '_' are reserved}} + + template + struct basic_string_view { + basic_string_view(); + basic_string_view(const T *p); + basic_string_view(const string &s [[clang::lifetimebound]]); }; - string_view operator""sv(const char *, size_t); + using string_view = basic_string_view; + string_view operator""sv(const char *, size_t); // expected-warning {{user-defined literal suffixes not starting with '_' are reserved}} struct vector { int *data(); @@ -100,7 +103,6 @@ namespace std { template struct map {}; } -# 68 "attr-lifetimebound.cpp" 2 using std::operator""s; using std::operator""sv; @@ -112,7 +114,7 @@ namespace p0936r0_examples { void f() { std::string_view sv = "hi"; std::string_view sv2 = sv + sv; // expected-warning {{temporary}} - sv2 = sv + sv; // FIXME: can we infer that we should warn here too? 
+ sv2 = sv + sv; // expected-warning {{object backing the pointer}} } struct X { int a, b; }; @@ -238,11 +240,6 @@ template T *addressof(T &arg) { &const_cast(reinterpret_cast(arg))); } -template -struct basic_string_view { - basic_string_view(const T *); -}; - template struct span { template span(const T (&__arr)[_ArrayExtent]) noexcept; From 9839b8cfb477866b8610714976cc6599f32f63e6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 10 Oct 2024 19:32:02 +0400 Subject: [PATCH 032/177] llvm-reduce: Fix assert on invokes with catchswitch (#111838) This is the minimal change to avoid the assert. There's an API flaw in invoke instructions where getLandingPad assumes all invoke unwind blocks have landingpads, when some have catchswitch instead. Fixes #111817 --- .../issue111817-catchswitch-assert.ll | 53 +++++++++++++++++++ .../llvm-reduce/deltas/ReduceBasicBlocks.cpp | 13 ++++- 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll diff --git a/llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll b/llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll new file mode 100644 index 00000000000000..cf20c8607ab2f3 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll @@ -0,0 +1,53 @@ +; RUN: llvm-reduce -abort-on-invalid-reduction --delta-passes=basic-blocks --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefix=CHECK-FINAL %s < %t + +; Make sure there's no assertion for invoke destinations that don't +; use landingpad (and use catchswitch instead) + +; CHECK-INTERESTINGNESS: invoke + +; CHECK-FINAL: bb: +; CHECK-FINAL-NEXT: invoke void @llvm.seh.try.begin() +; CHECK-FINAL-NEXT: to label %bb7 unwind label %bb1 +; CHECK-FINAL: bb1: +; CHECK-FINAL-NEXT: %i = catchswitch within none [label %bb2] unwind to caller + +; CHECK-FINAL: bb2: +; CHECK-FINAL-NEXT: %i3 = 
catchpad within %i [ptr null] +; CHECK-FINAL-NEXT: ret ptr null + +; CHECK-FINAL-NOT: bb4 +; CHECK-FINAL-NOT: bb5 + +; CHECK-FINAL: bb7: +; CHECK-FINAL-NEXT: ret ptr null +define ptr @func() personality ptr @__C_specific_handler { +bb: + invoke void @llvm.seh.try.begin() + to label %bb7 unwind label %bb1 + +bb1: ; preds = %bb + %i = catchswitch within none [label %bb2] unwind to caller + +bb2: ; preds = %bb1 + %i3 = catchpad within %i [ptr null] + catchret from %i3 to label %bb4 + +bb4: ; preds = %bb2 + invoke void @llvm.seh.try.end() + to label %bb7 unwind label %bb5 + +bb5: ; preds = %bb4 + %i6 = cleanuppad within none [] + cleanupret from %i6 unwind to caller + +bb7: ; preds = %bb4, %bb + ret ptr null +} + +declare void @llvm.seh.try.begin() #0 +declare void @llvm.seh.try.end() #0 +declare i32 @__C_specific_handler(...) + +attributes #0 = { nounwind willreturn memory(write) } + diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp index 6858dac9aeac41..41e3ffd963f5ba 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -45,12 +45,21 @@ static void replaceBranchTerminator(BasicBlock &BB, if (ChunkSuccessors.size() == Term->getNumSuccessors()) return; + // TODO: Handle these without failing verifier. + if (isa(Term)) + return; + bool IsBranch = isa(Term); if (InvokeInst *Invoke = dyn_cast(Term)) { - LandingPadInst *LP = Invoke->getLandingPadInst(); + BasicBlock *UnwindDest = Invoke->getUnwindDest(); + Instruction *LP = UnwindDest->getFirstNonPHI(); + // Remove landingpad instruction if the containing block isn't used by other // invokes. 
- if (none_of(LP->getParent()->users(), [Invoke](User *U) { + + // TODO: Handle catchswitch, catchpad, catchret, and cleanupret + if (isa(LP) && + none_of(UnwindDest->users(), [Invoke](User *U) { return U != Invoke && isa(U); })) { LP->replaceAllUsesWith(getDefaultValue(LP->getType())); From c042d8f7b35ccb7add9c873c7e5d74f568cca115 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 10 Oct 2024 08:43:30 -0700 Subject: [PATCH 033/177] [RISCV] Use RISCVAsmPrinter::EmitToStreamer for EmitHwasanMemaccessSymbols. (#111792) Add a MCSubtargetInfo& operand so we can control the subtarget for the new calls. The old signature is kept as a wrapper to pass *STI to maintain compatibility. By using EmitToStreamer we are able to compress the instructions when possible. --- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 184 ++++++++++-------- .../CodeGen/RISCV/hwasan-check-memaccess.ll | 45 +++++ 2 files changed, 148 insertions(+), 81 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 384a7cf59f0632..5ad09ae7290fc5 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -86,7 +86,11 @@ class RISCVAsmPrinter : public AsmPrinter { const char *ExtraCode, raw_ostream &OS) override; // Returns whether Inst is compressed. 
- bool EmitToStreamer(MCStreamer &S, const MCInst &Inst); + bool EmitToStreamer(MCStreamer &S, const MCInst &Inst, + const MCSubtargetInfo &SubtargetInfo); + bool EmitToStreamer(MCStreamer &S, const MCInst &Inst) { + return EmitToStreamer(S, Inst, *STI); + } bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst); @@ -242,12 +246,13 @@ void RISCVAsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, SM.recordStatepoint(*MILabel, MI); } -bool RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { +bool RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst, + const MCSubtargetInfo &SubtargetInfo) { MCInst CInst; - bool Res = RISCVRVC::compress(CInst, Inst, *STI); + bool Res = RISCVRVC::compress(CInst, Inst, SubtargetInfo); if (Res) ++RISCVNumInstrsCompressed; - S.emitInstruction(Res ? CInst : Inst, *STI); + S.emitInstruction(Res ? CInst : Inst, SubtargetInfo); return Res; } @@ -662,87 +667,100 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { OutStreamer->emitLabel(Sym); // Extract shadow offset from ptr - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::SLLI).addReg(RISCV::X6).addReg(Reg).addImm(8), MCSTI); - OutStreamer->emitInstruction(MCInstBuilder(RISCV::SRLI) - .addReg(RISCV::X6) - .addReg(RISCV::X6) - .addImm(12), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::SRLI) + .addReg(RISCV::X6) + .addReg(RISCV::X6) + .addImm(12), + MCSTI); // load shadow tag in X6, X5 contains shadow base - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADD) - .addReg(RISCV::X6) - .addReg(RISCV::X5) - .addReg(RISCV::X6), - MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADD) + .addReg(RISCV::X6) + .addReg(RISCV::X5) + .addReg(RISCV::X6), + MCSTI); + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::LBU).addReg(RISCV::X6).addReg(RISCV::X6).addImm(0), MCSTI); // Extract tag from pointer and compare it with loaded tag 
from shadow - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::SRLI).addReg(RISCV::X7).addReg(Reg).addImm(56), MCSTI); MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol(); // X7 contains tag from the pointer, while X6 contains tag from memory - OutStreamer->emitInstruction( - MCInstBuilder(RISCV::BNE) - .addReg(RISCV::X7) - .addReg(RISCV::X6) - .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym, - OutContext)), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::BNE) + .addReg(RISCV::X7) + .addReg(RISCV::X6) + .addExpr(MCSymbolRefExpr::create( + HandleMismatchOrPartialSym, OutContext)), + MCSTI); MCSymbol *ReturnSym = OutContext.createTempSymbol(); OutStreamer->emitLabel(ReturnSym); - OutStreamer->emitInstruction(MCInstBuilder(RISCV::JALR) - .addReg(RISCV::X0) - .addReg(RISCV::X1) - .addImm(0), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::JALR) + .addReg(RISCV::X0) + .addReg(RISCV::X1) + .addImm(0), + MCSTI); OutStreamer->emitLabel(HandleMismatchOrPartialSym); - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X28) - .addReg(RISCV::X0) - .addImm(16), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X28) + .addReg(RISCV::X0) + .addImm(16), + MCSTI); MCSymbol *HandleMismatchSym = OutContext.createTempSymbol(); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::BGEU) .addReg(RISCV::X6) .addReg(RISCV::X28) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::ANDI).addReg(RISCV::X28).addReg(Reg).addImm(0xF), MCSTI); if (Size != 1) - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X28) - .addReg(RISCV::X28) - .addImm(Size - 1), - MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X28) + 
.addReg(RISCV::X28) + .addImm(Size - 1), + MCSTI); + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::BGE) .addReg(RISCV::X28) .addReg(RISCV::X6) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::ORI).addReg(RISCV::X6).addReg(Reg).addImm(0xF), MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::LBU).addReg(RISCV::X6).addReg(RISCV::X6).addImm(0), MCSTI); - OutStreamer->emitInstruction( - MCInstBuilder(RISCV::BEQ) - .addReg(RISCV::X6) - .addReg(RISCV::X7) - .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::BEQ) + .addReg(RISCV::X6) + .addReg(RISCV::X7) + .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), + MCSTI); OutStreamer->emitLabel(HandleMismatchSym); @@ -781,50 +799,54 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { // +---------------------------------+ <-- [x2 / SP] // Adjust sp - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X2) - .addReg(RISCV::X2) - .addImm(-256), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X2) + .addReg(RISCV::X2) + .addImm(-256), + MCSTI); // store x10(arg0) by new sp - OutStreamer->emitInstruction(MCInstBuilder(RISCV::SD) - .addReg(RISCV::X10) - .addReg(RISCV::X2) - .addImm(8 * 10), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::SD) + .addReg(RISCV::X10) + .addReg(RISCV::X2) + .addImm(8 * 10), + MCSTI); // store x11(arg1) by new sp - OutStreamer->emitInstruction(MCInstBuilder(RISCV::SD) - .addReg(RISCV::X11) - .addReg(RISCV::X2) - .addImm(8 * 11), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::SD) + .addReg(RISCV::X11) + .addReg(RISCV::X2) + .addImm(8 * 11), + MCSTI); // store x8(fp) by new sp - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, 
MCInstBuilder(RISCV::SD).addReg(RISCV::X8).addReg(RISCV::X2).addImm(8 * 8), MCSTI); // store x1(ra) by new sp - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::SD).addReg(RISCV::X1).addReg(RISCV::X2).addImm(1 * 8), MCSTI); if (Reg != RISCV::X10) - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X10) - .addReg(Reg) - .addImm(0), - MCSTI); - OutStreamer->emitInstruction( - MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X11) - .addReg(RISCV::X0) - .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask), - MCSTI); - - OutStreamer->emitInstruction(MCInstBuilder(RISCV::PseudoCALL).addExpr(Expr), - MCSTI); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(RISCV::ADDI).addReg(RISCV::X10).addReg(Reg).addImm(0), + MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X11) + .addReg(RISCV::X0) + .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask), + MCSTI); + + EmitToStreamer(*OutStreamer, MCInstBuilder(RISCV::PseudoCALL).addExpr(Expr), + MCSTI); } } diff --git a/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll b/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll index 12c95206d21bed..dfd526c8964137 100644 --- a/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll +++ b/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s ; RUN: llc -mtriple=riscv64 --relocation-model=pic < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+c --riscv-no-aliases < %s \ +; RUN: | FileCheck %s --check-prefix=COMPRESS define ptr @f2(ptr %x0, ptr %x1) { ; CHECK-LABEL: f2: @@ -14,6 +16,18 @@ define ptr @f2(ptr %x0, ptr %x1) { ; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret +; +; COMPRESS-LABEL: f2: +; COMPRESS: # %bb.0: +; COMPRESS-NEXT: c.addi sp, -16 +; COMPRESS-NEXT: .cfi_def_cfa_offset 16 +; COMPRESS-NEXT: c.sdsp ra, 8(sp) 
# 8-byte Folded Spill +; COMPRESS-NEXT: .cfi_offset ra, -8 +; COMPRESS-NEXT: c.mv t0, a1 +; COMPRESS-NEXT: call __hwasan_check_x10_2_short +; COMPRESS-NEXT: c.ldsp ra, 8(sp) # 8-byte Folded Reload +; COMPRESS-NEXT: c.addi sp, 16 +; COMPRESS-NEXT: c.jr ra call void @llvm.hwasan.check.memaccess.shortgranules(ptr %x1, ptr %x0, i32 2) ret ptr %x0 } @@ -50,3 +64,34 @@ declare void @llvm.hwasan.check.memaccess.shortgranules(ptr, ptr, i32) ; CHECK-NEXT: sd ra, 8(sp) ; CHECK-NEXT: li a1, 2 ; CHECK-NEXT: call __hwasan_tag_mismatch_v2 + +; COMPRESS: .section .text.hot,"axG",@progbits,__hwasan_check_x10_2_short,comdat +; COMPRESS-NEXT: .type __hwasan_check_x10_2_short,@function +; COMPRESS-NEXT: .weak __hwasan_check_x10_2_short +; COMPRESS-NEXT: .hidden __hwasan_check_x10_2_short +; COMPRESS-NEXT: __hwasan_check_x10_2_short: +; COMPRESS-NEXT: slli t1, a0, 8 +; COMPRESS-NEXT: srli t1, t1, 12 +; COMPRESS-NEXT: c.add t1, t0 +; COMPRESS-NEXT: lbu t1, 0(t1) +; COMPRESS-NEXT: srli t2, a0, 56 +; COMPRESS-NEXT: bne t2, t1, .Ltmp0 +; COMPRESS-NEXT: .Ltmp1: +; COMPRESS-NEXT: c.jr ra +; COMPRESS-NEXT: .Ltmp0: +; COMPRESS-NEXT: c.li t3, 16 +; COMPRESS-NEXT: bgeu t1, t3, .Ltmp2 +; COMPRESS-NEXT: andi t3, a0, 15 +; COMPRESS-NEXT: c.addi t3, 3 +; COMPRESS-NEXT: bge t3, t1, .Ltmp2 +; COMPRESS-NEXT: ori t1, a0, 15 +; COMPRESS-NEXT: lbu t1, 0(t1) +; COMPRESS-NEXT: beq t1, t2, .Ltmp1 +; COMPRESS-NEXT: .Ltmp2: +; COMPRESS-NEXT: c.addi16sp sp, -256 +; COMPRESS-NEXT: c.sdsp a0, 80(sp) +; COMPRESS-NEXT: c.sdsp a1, 88(sp) +; COMPRESS-NEXT: c.sdsp s0, 64(sp) +; COMPRESS-NEXT: c.sdsp ra, 8(sp) +; COMPRESS-NEXT: c.li a1, 2 +; COMPRESS-NEXT: call __hwasan_tag_mismatch_v2 From c77b10746160f985625603b1e9c837b44caa5c67 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 08:47:30 -0700 Subject: [PATCH 034/177] [lldb] Introduce an always-on system log category/channel (#108495) Add an "always on" log category and channel. Unlike other, existing log channels, it is not exposed to users. 
The channel is meant to be used sparsely and deliberately for logging high-value information to the system log. We have a similar concept in the downstream Swift fork and this has proven to be extremely valuable. This is especially true on macOS where system log messages are automatically captured as part of a sysdiagnose. --- lldb/include/lldb/Host/Host.h | 19 +++++++++++ lldb/include/lldb/Utility/Log.h | 11 +++--- lldb/source/API/SystemInitializerFull.cpp | 3 ++ lldb/source/Host/common/Host.cpp | 16 +++++++++ lldb/source/Host/common/HostInfoBase.cpp | 2 ++ lldb/source/Utility/Log.cpp | 34 ++++++++++++------- lldb/test/Shell/Host/TestSytemLogChannel.test | 3 ++ 7 files changed, 70 insertions(+), 18 deletions(-) create mode 100644 lldb/test/Shell/Host/TestSytemLogChannel.test diff --git a/lldb/include/lldb/Host/Host.h b/lldb/include/lldb/Host/Host.h index 9d0994978402f7..d8113a5fceeada 100644 --- a/lldb/include/lldb/Host/Host.h +++ b/lldb/include/lldb/Host/Host.h @@ -31,6 +31,25 @@ class ProcessInstanceInfo; class ProcessInstanceInfoMatch; typedef std::vector ProcessInstanceInfoList; +// System log category and channel. This log channel is always enabled and +// therefore is supposed to be used sparsely. Use this log channel to log +// critical information that is expected to be relevant to the majority of bug +// reports. 
+enum class SystemLog : Log::MaskType { + System = Log::ChannelFlag<0>, + LLVM_MARK_AS_BITMASK_ENUM(System) +}; + +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + +class LogChannelSystem { +public: + static void Initialize(); + static void Terminate(); +}; + +template <> Log::Channel &LogChannelFor(); + // Exit Type for inferior processes struct WaitStatus { enum Type : uint8_t { diff --git a/lldb/include/lldb/Utility/Log.h b/lldb/include/lldb/Utility/Log.h index 27707c17f9b824..ac6347153a1014 100644 --- a/lldb/include/lldb/Utility/Log.h +++ b/lldb/include/lldb/Utility/Log.h @@ -272,6 +272,12 @@ class Log final { void VAFormatf(llvm::StringRef file, llvm::StringRef function, const char *format, va_list args); + void Enable(const std::shared_ptr &handler_sp, + std::optional flags = std::nullopt, + uint32_t options = 0); + + void Disable(std::optional flags = std::nullopt); + private: Channel &m_channel; @@ -297,11 +303,6 @@ class Log final { return m_handler; } - void Enable(const std::shared_ptr &handler_sp, uint32_t options, - MaskType flags); - - void Disable(MaskType flags); - bool Dump(llvm::raw_ostream &stream); typedef llvm::StringMap ChannelMap; diff --git a/lldb/source/API/SystemInitializerFull.cpp b/lldb/source/API/SystemInitializerFull.cpp index 995d14f7c1fa1e..8a992a6889a91b 100644 --- a/lldb/source/API/SystemInitializerFull.cpp +++ b/lldb/source/API/SystemInitializerFull.cpp @@ -17,6 +17,7 @@ #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Target/ProcessTrace.h" #include "lldb/Utility/Timer.h" +#include "lldb/Version/Version.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetSelect.h" @@ -83,6 +84,8 @@ llvm::Error SystemInitializerFull::Initialize() { // Use the Debugger's LLDBAssert callback. 
SetLLDBAssertCallback(Debugger::AssertCallback); + LLDB_LOG(GetLog(SystemLog::System), "{0}", GetVersion()); + return llvm::Error::success(); } diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index f08adea6546ae1..abca6068d3604a 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -125,6 +125,22 @@ void Host::SystemLog(Severity severity, llvm::StringRef message) { #endif #endif +static constexpr Log::Category g_categories[] = { + {{"system"}, {"system log"}, SystemLog::System}}; + +static Log::Channel g_system_channel(g_categories, SystemLog::System); +static Log g_system_log(g_system_channel); + +template <> Log::Channel &lldb_private::LogChannelFor() { + return g_system_channel; +} + +void LogChannelSystem::Initialize() { + g_system_log.Enable(std::make_shared()); +} + +void LogChannelSystem::Terminate() { g_system_log.Disable(); } + #if !defined(__APPLE__) && !defined(_WIN32) static thread_result_t MonitorChildProcessThreadFunction(::pid_t pid, diff --git a/lldb/source/Host/common/HostInfoBase.cpp b/lldb/source/Host/common/HostInfoBase.cpp index 5c44c2f38b2879..89dfe4a9e9baa3 100644 --- a/lldb/source/Host/common/HostInfoBase.cpp +++ b/lldb/source/Host/common/HostInfoBase.cpp @@ -76,9 +76,11 @@ static HostInfoBase::SharedLibraryDirectoryHelper *g_shlib_dir_helper = nullptr; void HostInfoBase::Initialize(SharedLibraryDirectoryHelper *helper) { g_shlib_dir_helper = helper; g_fields = new HostInfoBaseFields(); + LogChannelSystem::Initialize(); } void HostInfoBase::Terminate() { + LogChannelSystem::Terminate(); g_shlib_dir_helper = nullptr; delete g_fields; g_fields = nullptr; diff --git a/lldb/source/Utility/Log.cpp b/lldb/source/Utility/Log.cpp index f6b1381f63ad1c..3798f406476370 100644 --- a/lldb/source/Utility/Log.cpp +++ b/lldb/source/Utility/Log.cpp @@ -93,22 +93,28 @@ Log::MaskType Log::GetFlags(llvm::raw_ostream &stream, } void Log::Enable(const std::shared_ptr &handler_sp, - uint32_t 
options, Log::MaskType flags) { + std::optional flags, uint32_t options) { llvm::sys::ScopedWriter lock(m_mutex); - MaskType mask = m_mask.fetch_or(flags, std::memory_order_relaxed); - if (mask | flags) { + if (!flags) + flags = m_channel.default_flags; + + MaskType mask = m_mask.fetch_or(*flags, std::memory_order_relaxed); + if (mask | *flags) { m_options.store(options, std::memory_order_relaxed); m_handler = handler_sp; m_channel.log_ptr.store(this, std::memory_order_relaxed); } } -void Log::Disable(Log::MaskType flags) { +void Log::Disable(std::optional flags) { llvm::sys::ScopedWriter lock(m_mutex); - MaskType mask = m_mask.fetch_and(~flags, std::memory_order_relaxed); - if (!(mask & ~flags)) { + if (!flags) + flags = std::numeric_limits::max(); + + MaskType mask = m_mask.fetch_and(~(*flags), std::memory_order_relaxed); + if (!(mask & ~(*flags))) { m_handler.reset(); m_channel.log_ptr.store(nullptr, std::memory_order_relaxed); } @@ -230,10 +236,11 @@ bool Log::EnableLogChannel(const std::shared_ptr &log_handler_sp, error_stream << llvm::formatv("Invalid log channel '{0}'.\n", channel); return false; } - MaskType flags = categories.empty() - ? iter->second.m_channel.default_flags - : GetFlags(error_stream, *iter, categories); - iter->second.Enable(log_handler_sp, log_options, flags); + + auto flags = categories.empty() ? std::optional{} + : GetFlags(error_stream, *iter, categories); + + iter->second.Enable(log_handler_sp, flags, log_options); return true; } @@ -245,9 +252,10 @@ bool Log::DisableLogChannel(llvm::StringRef channel, error_stream << llvm::formatv("Invalid log channel '{0}'.\n", channel); return false; } - MaskType flags = categories.empty() - ? std::numeric_limits::max() - : GetFlags(error_stream, *iter, categories); + + auto flags = categories.empty() ? 
std::optional{} + : GetFlags(error_stream, *iter, categories); + iter->second.Disable(flags); return true; } diff --git a/lldb/test/Shell/Host/TestSytemLogChannel.test b/lldb/test/Shell/Host/TestSytemLogChannel.test new file mode 100644 index 00000000000000..4de699f0e09a4a --- /dev/null +++ b/lldb/test/Shell/Host/TestSytemLogChannel.test @@ -0,0 +1,3 @@ +RUN: %lldb -o 'log list' -o 'log disable system' 2>&1 | FileCheck %s +CHECK-NOT: Logging categories for 'system' +CHECK: Invalid log channel 'system' From f5aec03f6dd2f92590ecec9e3419b38b11d8476e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 10 Oct 2024 15:50:35 +0000 Subject: [PATCH 035/177] [clang][analyzer][NFC] Fix strange bracket placement --- clang/lib/Analysis/ProgramPoint.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Analysis/ProgramPoint.cpp b/clang/lib/Analysis/ProgramPoint.cpp index 2a91749affd2a6..768345c8425f02 100644 --- a/clang/lib/Analysis/ProgramPoint.cpp +++ b/clang/lib/Analysis/ProgramPoint.cpp @@ -157,7 +157,7 @@ void ProgramPoint::printJson(llvm::raw_ostream &Out, const char *NL) const { LHS->printJson(Out, nullptr, PP, AddQuotes); } else { Out << "null"; - } + } Out << ", \"rhs\": "; if (const Stmt *RHS = C->getRHS()) { From 23309d7d9553af69b2912a159bc2e488acf69255 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 10 Oct 2024 16:53:36 +0100 Subject: [PATCH 036/177] [Dexter] Set up ComInterface module to be imported correctly (#111850) Fixes issue added by: https://github.com/llvm/llvm-project/pull/111833 Following the previous commit that changed how Dexter imports modules, the ComInterface module import became broken. This is because it had a different directory structure to other modules, where we want to import single file rather than a dir containing a __init__.py. For this case, an optional extra arg has been added to load_module allowing a filename to be specified, letting us import ComInterface.py directly and fixing the issue. 
--- .../dex/debugger/visualstudio/VisualStudio.py | 4 +++- .../debuginfo-tests/dexter/dex/utils/Imports.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index 7cb56ec0c25a76..a6752274efac20 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -24,7 +24,9 @@ def _load_com_module(): try: return load_module( - "ComInterface", os.path.join(os.path.dirname(__file__), "windows") + "ComInterface", + os.path.join(os.path.dirname(__file__), "windows"), + "ComInterface.py", ) except ImportError as e: raise LoadDebuggerException(e, sys.exc_info()) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py index ea052c21a18498..cd184f9d20ed8f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py @@ -1,12 +1,17 @@ -import importlib +import importlib.util import os import sys -def load_module(name, path): - spec = importlib.util.spec_from_file_location( - name, os.path.join(path, name, "__init__.py") +def load_module(name, path, mod_file="__init__.py"): + # The module is either defined by a directory, in which case we search for + # `path/name/__init__.py`, or it is a single file at `path/mod_file`. 
+ mod_path = ( + os.path.join(path, name, mod_file) + if mod_file == "__init__.py" + else os.path.join(path, mod_file) ) + spec = importlib.util.spec_from_file_location(name, mod_path) module = importlib.util.module_from_spec(spec) sys.modules[name] = module spec.loader.exec_module(module) From 03483737a7a2d72a257a5ab6ff01748ad9cf0f75 Mon Sep 17 00:00:00 2001 From: Md Asghar Ahmad Shahid Date: Thu, 10 Oct 2024 21:30:58 +0530 Subject: [PATCH 037/177] [mlir][linalg] Introduce transpose semantic to 'linalg.matmul' ops. (#104783) The main goal of this patch is to extend the semantic of 'linalg.matmul' named op to include per operand transpose semantic while also laying out a way to move ops definition from OpDSL to tablegen. Hence, it is implemented in tablegen. Transpose semantic is as follows. By default 'linalg.matmul' behavior will remain as is. Transpose semantics can be appiled on per input operand by specifying the optional permutation attributes (namely 'permutationA' for 1st input and 'permutationB' for 2nd input) for each operand explicitly as needed. By default, no transpose is mandated for any of the input operand. 
Example: ``` %val = linalg.matmul ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) permutationA = [1, 0] permutationB = [0, 1] ``` --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 10 + .../Linalg/IR/LinalgNamedStructuredOps.yaml | 72 ----- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 134 +++++++++ .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 17 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 263 +++++++++++++++++- .../Linalg/Transforms/TransposeMatmul.cpp | 7 + .../Linalg/Transforms/Vectorization.cpp | 5 + .../NVGPU/TransformOps/NVGPUTransformOps.cpp | 6 + .../linalg/opdsl/ops/core_named_ops.py | 17 -- .../Dialect/Linalg/generalize-named-ops.mlir | 111 ++++++++ mlir/test/Dialect/Linalg/invalid.mlir | 159 +++++++++++ mlir/test/Dialect/Linalg/named-ops.mlir | 243 ++++++++++++++++ mlir/test/python/dialects/linalg/ops.py | 75 ----- .../mlir-linalg-ods-yaml-gen.cpp | 6 +- 14 files changed, 943 insertions(+), 182 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index fbf3f19cde0e9b..e80dbb2afb9ef7 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -684,6 +684,16 @@ def LinalgStructuredInterface return; }] >, + InterfaceMethod< + /*desc=*/[{ + Return true if the user has supplied an explicit indexing maps for this op. + }], + /*retTy=*/"bool", + /*methodName=*/"hasUserDefinedMaps", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ return false; }] + >, //===------------------------------------------------------------------===// // Linalg generalization hooks. 
//===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 8cb698096ef5b7..97b90333e2b200 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -1065,78 +1065,6 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: rhs --- !LinalgOpConfig -metadata: !LinalgOpMetadata - name: matmul - cpp_class_name: MatmulOp - doc: |- - Performs a matrix multiplication of two 2D inputs. - - Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. - implements: - - LinalgContractionOpInterface -structured_op: !LinalgStructuredOpConfig - args: - - !LinalgOperandDefConfig - name: A - kind: input_tensor - type_var: T1 - shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)> - - !LinalgOperandDefConfig - name: B - kind: input_tensor - type_var: T2 - shape_map: affine_map<()[s0, s1, s2] -> (s1, s2)> - - !LinalgOperandDefConfig - name: C - kind: output_tensor - type_var: U - shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)> - - !LinalgOperandDefConfig - name: cast - kind: type_fn_attr - default_fn: cast_signed - indexing_maps: !LinalgIndexingMapsConfig - static_indexing_maps: - - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)> - - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)> - - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)> - iterator_types: - - parallel - - parallel - - reduction - assignments: - - !ScalarAssign - arg: C - value: !ScalarExpression - scalar_fn: - kind: binary - fn_name: add - operands: - - !ScalarExpression - scalar_arg: C - - !ScalarExpression - scalar_fn: - kind: binary - fn_name: mul - operands: - - !ScalarExpression - scalar_fn: - kind: type - attr_name: cast - type_var: U - operands: - - 
!ScalarExpression - scalar_arg: A - - !ScalarExpression - scalar_fn: - kind: type - attr_name: cast - type_var: U - operands: - - !ScalarExpression - scalar_arg: B ---- !LinalgOpConfig metadata: !LinalgOpMetadata name: quantized_matmul cpp_class_name: QuantizedMatmulOp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 31f29139247267..61d4fc9734c6de 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -535,6 +535,140 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [ let hasCanonicalizer = 1; } +//===----------------------------------------------------------------------===// +// Op definition for MatmulOp +//===----------------------------------------------------------------------===// + +def MatmulOp : LinalgStructuredBase_Op<"matmul", [ + AttrSizedOperandSegments, + LinalgContractionOpInterface]> { + + let summary = [{ + Performs a matrix multiplication of two 2D inputs without broadcast or transpose. + }]; + let description = [{ + Numeric casting is performed on the operands to the inner multiply, + promoting them to the same data type as the accumulator/output. + + Broadcast and Transpose semantics can be appiled by specifying the explicit attribute + 'indexing_maps' as shown below.This is a list attribute, so the list must include all + the maps if specified. 
+ + Example Transpose: + ``` + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>,memref<5x7xf32>) + outs(%arg2: memref<3x7xf32>) + ``` + + Example Broadcast: + ``` + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, // broadcast + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) + outs(%arg2: memref<3x7xf32>) + ``` + + Example Broadcast and transpose: + ``` + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose + affine_map<(d0, d1, d2) -> (d2)>, // broadcast + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) + }]; + + let arguments = (ins + Variadic:$inputs, + Variadic:$outputs, + DefaultValuedOptionalAttr:$indexing_maps, + DefaultValuedOptionalAttr:$cast + ); + let results = (outs Variadic:$result_tensors); + let regions = (region AnyRegion:$region); + + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder< + (ins "ValueRange":$inputs, "ValueRange":$outputs, + CArg<"ArrayRef", "{}">:$attributes), + [{ + buildStructuredOp($_builder, $_state, std::nullopt, inputs, outputs, + attributes, MatmulOp::getRegionBuilder()); + }]>, + OpBuilder< + (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, + "ValueRange":$outputs, + CArg<"ArrayRef", "{}">:$attributes), + [{ + buildStructuredOp($_builder, $_state, resultTensorTypes, + inputs, outputs, attributes, MatmulOp::getRegionBuilder()); + }]>, + OpBuilder< + (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, + CArg<"ArrayRef", "{}">:$attributes), + [{ + $_state.addOperands(operands); + $_state.addAttributes(attributes); + $_state.addTypes(resultTensorTypes); + (void)$_state.addRegion(); + }]>, + OpBuilder< + (ins "TypeRange":$resultTensorTypes, 
"ValueRange":$inputs, + "ValueRange":$outputs, + "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), + [{ + $_state.addAttribute("cast", cast); + buildStructuredOp($_builder, $_state, resultTensorTypes, inputs, outputs, + attributes, MatmulOp::getRegionBuilder()); + }]> + + ]; + let hasCustomAssemblyFormat = 1; + let hasFolder = 1; + let hasVerifier = 1; + + let extraClassDeclaration = structuredOpsBaseDecls # [{ + SmallVector getIteratorTypesArray(); + + /// Implements the block region builder. + static void regionBuilder(ImplicitLocOpBuilder &b, + Block &block, ArrayRef attrs); + + /// Returns a list of AffineMap with the typical matmul indexing charactristic. + SmallVector getDefaultIndexingMaps(); + + /// Returns true if the given broadcast map \p bcastMap is valid for this op. + bool isValidLhsRhsBroadcastMap(AffineMap bcastMap); + + static std::function)> + getRegionBuilder() { + return regionBuilder; + } + + ::mlir::MutableOperandRange getDpsInitsMutable() { + return getOutputsMutable(); + } + + // Generic methods. + static unsigned getNumRegionArgs(); + std::string getLibraryCallName(); + bool hasDynamicIndexingMaps(); + /// Check if the op has broadcast and/or transpose semantic. Returns true if the + /// user defined indexing maps are not equal to default map. + bool hasUserDefinedMaps(); + }]; +} + //===----------------------------------------------------------------------===// // Named Linalg ops, implemented as a declarative configurations of generic ops. 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 40795879c3026d..3b9194098fa783 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -15,13 +15,20 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/TypeUtilities.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" #include +#include using namespace mlir; using namespace mlir::linalg; @@ -1142,7 +1149,6 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) { LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { LinalgOp linalgOp = cast(op); - // Mixed tensor/buffer operands are not allowed. if (!linalgOp.hasPureTensorSemantics() && !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0) @@ -1162,6 +1168,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << ") to be equal to the number of input/output operands (" << linalgOp->getNumOperands() << ")"; + // Set this flag if this op has user defined maps. This is required to guard + // the below error condition which assume default indexing maps. 
for (OpOperand &opOperand : linalgOp->getOpOperands()) { AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand); @@ -1178,13 +1186,13 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << " dim(s) to match the number of loops"; int64_t rank = linalgOp.getRank(&opOperand); + if (indexingMap.getNumResults() != rank) return op->emitOpError("expected operand rank (") << rank << ") to match the result rank of indexing_map #" << opOperand.getOperandNumber() << " (" << indexingMap.getNumResults() << ")"; } - SmallVector redDims; linalgOp.getReductionDims(redDims); @@ -1194,9 +1202,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { // Check if given shapes match to inferred shapes. SmallVector endLoopRangeValues = linalgOp.getStaticLoopRanges(); SmallVector startLoopRangeValues(endLoopRangeValues.size(), 0); - - // Verify only static cases since we can't get exact dimension sizes and loop - // ranges for dynamic cases in this stage. + // Verify only static cases since we can't get exact dimension sizes and + // loop ranges for dynamic cases in this stage. 
if (llvm::none_of(endLoopRangeValues, ShapedType::isDynamic)) { for (int64_t &range : endLoopRangeValues) range -= 1; diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 730c478c2883ef..4f350ea236da84 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -27,6 +27,7 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" @@ -37,12 +38,17 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include #include using namespace mlir; @@ -149,15 +155,36 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, // iterator_types is an auto-generated method. } +/// Helper to create a typical indexing map for MatmulOp. Returns a list of +/// AffineMap. +static SmallVector +getDefaultIndexingMapsForMatmul(MLIRContext *context) { + AffineExpr d0, d1, d2; + SmallVector indexingMaps; + bindDims(context, d0, d1, d2); + indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context)); + indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context)); + indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context)); + return indexingMaps; +} + +/// Wrapper to return the typical indexing map array attribute for MatmulOp. 
+static SmallVector getDefaultIndexingMapAttr(MLIRContext *context) { + return llvm::map_to_vector( + getDefaultIndexingMapsForMatmul(context), + [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); +} + /// Creates a structured operation given `inputs`, `outputs`, and `attributes`. /// The result types are derived automatically if `resultTensorTypes` is none. /// The body of the operation is filled using `regionBuilder`. All ods-gen /// created structured operations use the method to implement their builders. -static void buildStructuredOp(OpBuilder &b, OperationState &state, - std::optional resultTensorTypes, - ValueRange inputs, ValueRange outputs, - ArrayRef attributes, - RegionBuilderFn regionBuilder) { +static void buildStructuredOp( + OpBuilder &b, OperationState &state, + std::optional resultTensorTypes, ValueRange inputs, + ValueRange outputs, ArrayRef attributes, + RegionBuilderFn regionBuilder, + std::optional> indexingMaps = std::nullopt) { // Derive the result types if needed. SmallVector derivedResultTypes = resultTensorTypes.value_or(TypeRange()); @@ -168,6 +195,20 @@ static void buildStructuredOp(OpBuilder &b, OperationState &state, state.addOperands(inputs); state.addOperands(outputs); state.addTypes(derivedResultTypes); + + // Initialize indexingMaps, for MatmulOp. 
+ SmallVector indexingMapsAttrVal; + if (indexingMaps.has_value()) { + for (mlir::AffineMap map : *indexingMaps) { + // Convert each AffineMap to an AffineMapAttr + indexingMapsAttrVal.push_back(AffineMapAttr::get(map)); + } + state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); + } else { + indexingMapsAttrVal = getDefaultIndexingMapAttr(b.getContext()); + state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); + } + state.addAttributes(attributes); state.addAttribute( "operandSegmentSizes", @@ -299,11 +340,48 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser, OperationState &result, unsigned numRegionArgs, RegionBuilderFn regionBuilder) { + + SmallVector indexingMapsAttr; + Attribute mapAttr; + if (succeeded(parser.parseOptionalKeyword("indexing_maps"))) { + if (parser.parseEqual()) + return failure(); + + if (parser.parseLSquare()) + return failure(); + + do { + if (parser.parseAttribute(mapAttr)) + return failure(); + if (!isa(mapAttr)) { + return parser.emitError(parser.getCurrentLocation(), + "expected affine map attribute"); + } + indexingMapsAttr.push_back(mapAttr); + + if (parser.parseOptionalComma()) + break; + } while (true); + + if (parser.parseRSquare()) + return failure(); + } + // Initialize indexingMaps, if not supplied explicitly. + if (indexingMapsAttr.empty()) { + indexingMapsAttr = getDefaultIndexingMapAttr(result.getContext()); + } + result.addAttribute("indexing_maps", + parser.getBuilder().getArrayAttr(indexingMapsAttr)); + // TODO: Enable when ods-gen supports captures. SmallVector inputTypes, outputTypes; if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes)) return failure(); + // Parse optional attributes. + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + // TODO: consider merging results parsing into region parsing. // Need to wait for declarative assembly resolution to decide. 
SmallVector outputTensorsTypes; @@ -329,13 +407,9 @@ static void printNamedStructuredOpResults(OpAsmPrinter &p, } static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op, - ValueRange inputs, ValueRange outputs) { - p.printOptionalAttrDict( - op->getAttrs(), - /*elidedAttrs=*/{"operandSegmentSizes", - // See generated code in - // LinalgNamedStructuredOps.yamlgen.cpp.inc - "linalg.memoized_indexing_maps"}); + ValueRange inputs, ValueRange outputs, + ArrayRef elidedAttrs = {}) { + p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); // Printing is shared with generic ops, except for the region and // attributes. @@ -3382,3 +3456,168 @@ Operation *LinalgDialect::materializeConstant(OpBuilder &builder, Location loc) { return arith::ConstantOp::materialize(builder, value, type, loc); } + +/// Returns true if the result AffineExpr of the \p explicitMap is same as \p +/// defaultMap. +static bool isValidResultDimExprs(AffineMap explictMap, AffineMap defaultMap) { + auto explicitRange = explictMap.getResults(); + auto defaultRange = defaultMap.getResults(); + DenseSet explicitSet(explicitRange.begin(), explicitRange.end()); + DenseSet defaultSet(defaultRange.begin(), defaultRange.end()); + llvm::set_union(explicitSet, defaultSet); + return explicitSet == defaultSet; +} + +/// Returns true if the \p explictMap is broadcasted with respect to the +/// \p defaultMap. +static bool isBroadcasted(AffineMap explictMap, AffineMap defaultMap) { + return explictMap.getNumResults() < defaultMap.getNumResults(); +} + +/// Verifies the broadcast and transpose semantic sepecified by the explicit +/// indexing map for the MatmulOp \p op for each operand specified by \p +/// opIndex. 
+static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp, + unsigned opIndex) { + SmallVector opIndexingMaps = matmulOp.getIndexingMapsArray(); + SmallVector defaultIndexingMaps = + matmulOp.getDefaultIndexingMaps(); + + auto opIndexingMap = opIndexingMaps[opIndex]; + auto defaultIndexingMap = defaultIndexingMaps[opIndex]; + // Check general validity of indexing map results. + if (!isValidResultDimExprs(opIndexingMap, defaultIndexingMap)) + return matmulOp->emitOpError() + << "Unexpected dim expression in map result."; + + // Check if the requested broadcast is valid. + if (isBroadcasted(opIndexingMap, defaultIndexingMap)) { + if (!matmulOp.isValidLhsRhsBroadcastMap(opIndexingMap)) { + return matmulOp->emitOpError() + << "Invalid broadcast requested, should be (d2)."; + } + return success(); + } + return success(); +} + +namespace mlir { +namespace linalg { +//===----------------------------------------------------------------------===// +// MatMulOp +//===----------------------------------------------------------------------===// +SmallVector MatmulOp::getIteratorTypesArray() { + return SmallVector{utils::IteratorType::parallel, + utils::IteratorType::parallel, + utils::IteratorType::reduction}; +} + +unsigned MatmulOp::getNumRegionArgs() { return 3; } + +std::string MatmulOp::getLibraryCallName() { + return generateLibraryCallName(getOperation()); +} + +bool MatmulOp::hasDynamicIndexingMaps() { return true; } + +/// Check if the op has broadcast and/or transpose semantic. Returns true if the +/// user defined indexing maps are not equal to default map. +bool MatmulOp::hasUserDefinedMaps() { + SmallVector defaultMaps = getDefaultIndexingMaps(); + SmallVector explicitMaps = getIndexingMapsArray(); + return defaultMaps != explicitMaps; +} + +/// Implements the block region builder for the MatmulOp. This is called by +/// 'fillStructuredOpRegion'. 
+void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, + ArrayRef attrs) { + assert(3 > 0 && block.getNumArguments() == 3 && + "MatmulOp regionBuilder expects 3 (>=0) args"); + RegionBuilderHelper helper(b, block); + SmallVector yields; + + TypeFn castVal = TypeFn::cast_signed; + auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { + return attr.getName() == "cast"; + }); + if (castIter != attrs.end()) { + if (auto attr = llvm::dyn_cast(castIter->getValue())) + castVal = attr.getValue(); + } + + Value value1 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), + block.getArgument(0)); + Value value2 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), + block.getArgument(1)); + Value value3 = helper.buildBinaryFn(BinaryFn::mul, value1, value2); + Value value4 = + helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), value3); + yields.push_back(value4); + helper.yieldOutputs(yields); +} + +/// Returns a list of AffineMap with the typical matmul indexing charactristic. +SmallVector MatmulOp::getDefaultIndexingMaps() { + MLIRContext *context = this->getContext(); + return getDefaultIndexingMapsForMatmul(context); +} + +/// Returns true if the given broadcast map \p bcastMap is valid for this op. +bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) { + assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr."); + AffineExpr exp = bcastMap.getResult(0); + // Invalid map if the common dimension of matmul not found. 
+ return exp.isFunctionOfDim(bcastMap.getNumDims() - 1); +} + +ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) { + return parseNamedStructuredOp(parser, result, MatmulOp::getNumRegionArgs(), + MatmulOp::getRegionBuilder()); +} +void MatmulOp::print(OpAsmPrinter &p) { + SmallVector elidedAttrs = { + "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; + printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); + + SmallVector indexingMaps = + getDefaultIndexingMapAttr(getContext()); + if (!llvm::equal(getIndexingMaps(), indexingMaps)) { + p << " indexing_maps = ["; + llvm::interleaveComma(getIndexingMaps(), p, + [&](Attribute attr) { p.printAttribute(attr); }); + p << "]"; + } +} + +/// Verify the user defined indexing maps. +LogicalResult MatmulOp::verify() { + // Verification of pure matmul is handled by verifyStructuredOpInterface(). + if (!hasUserDefinedMaps()) + return success(); + + for (unsigned opIndex = 0; opIndex < 2; opIndex++) { + if (failed(verifyExtendedMatmulSemantic(*this, opIndex))) + return failure(); + } + return success(); +} + +LogicalResult MatmulOp::fold(FoldAdaptor, SmallVectorImpl &) { + return memref::foldMemRefCast(*this); +} +void MatmulOp::getEffects( + SmallVectorImpl> + &effects) { + if (hasPureTensorSemantics()) + return; + getGenericEffectsImpl(effects, cast(getOperation())); +} + +Speculation::Speculatability MatmulOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + +} // namespace linalg +} // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp index aa0052ce47fa7b..6b934f7e8157d4 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp @@ -31,6 +31,13 @@ using namespace mlir::linalg; FailureOr mlir::linalg::transposeMatmul(RewriterBase &rewriter, 
linalg::MatmulOp matmulOp, bool transposeLHS) { + // Check to not let go the matmul with extended semantic, through this + // transform. + if (matmulOp.hasUserDefinedMaps()) { + return rewriter.notifyMatchFailure( + matmulOp, "only matmul ops with non-extended semantics are supported"); + } + if (!bufferization::hasTensorSemantics(matmulOp)) return rewriter.notifyMatchFailure( matmulOp, "only matmul ops with tensors are supported"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 09c6b2683b4388..e3f010d9cfb20b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2071,6 +2071,11 @@ vectorizeScalableVectorPrecondition(Operation *op, return failure(); } + // Check to not let go the matmul with extended semantic, through this + // transform. + if (linalgOp.hasUserDefinedMaps()) + return failure(); + // Cond 4: Only the following ops are supported in the // presence of scalable vectors return success(isElementwise(linalgOp) || isa(op) || diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 0c2275bbc4b224..3c508ed6e324b2 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -821,6 +821,12 @@ DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne( bool fail = true; // TODO: more robust detection of matmulOp, with transposes etc. if (isa_and_nonnull(linalgOp.getOperation())) { + // Check to not let go the matmul with extended semantic, through this + // transform. + if (linalgOp.hasUserDefinedMaps()) { + return emitSilenceableError() + << "only matmul ops with non-extended semantics are supported"; + } Location loc = linalgOp.getLoc(); // TODO: more robust computation of laneId, for now assume a single warp. 
Value laneId = rewriter.create( diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index e4a6ec7487bb2f..d5e79b4d3cb6dd 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -383,23 +383,6 @@ def select( O[None] = TernaryFn.select(cond[None], lhs[None], rhs[None]) -@linalg_structured_op -def matmul( - A=TensorDef(T1, S.M, S.K), - B=TensorDef(T2, S.K, S.N), - C=TensorDef(U, S.M, S.N, output=True), - cast=TypeFnAttrDef(default=TypeFn.cast_signed), -): - """Performs a matrix multiplication of two 2D inputs. - - Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. - """ - domain(D.m, D.n, D.k) - implements(ContractionOpInterface) - C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n]) - - @linalg_structured_op def quantized_matmul( A=TensorDef(T1, S.M, S.K), diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index 1e8f1435ca0fa5..aba26c35931fd3 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -29,6 +29,34 @@ func.func @generalize_matmul_buffer(%A : memref<16x8xf32>, %B: memref<8x32xf32>, // ----- +func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func.func @matmul_bcast_a( 
+// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) { +// CHECK: ^bb0(%[[VAL_3:.*]]: f32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: f32): +// CHECK: %[[VAL_6:.*]] = arith.mulf %[[VAL_3]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_7:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32 +// CHECK: linalg.yield %[[VAL_7]] : f32 +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>) outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> @@ -891,3 +919,86 @@ func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor, tensor, tensor> } + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_a_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { + +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} +// CHECK: arith.mulf +// CHECK: arith.addf + +func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) + outs(%arg2: 
memref<3x7xf32>) + + return +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func.func @matmul_transpose_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { + +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} +// CHECK: arith.mulf +// CHECK: arith.addf + +func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { + +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} +// CHECK: arith.mulf +// CHECK: arith.addf + +func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// 
----- + diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index c481a723c5623c..b2869893b8042d 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -361,6 +361,165 @@ func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, // ----- +func.func @invalid_indexing_maps_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) { + // expected-error @+1 {{expected attribute value}} + linalg.matmul indexing_maps = [ + , + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>) + outs(%arg2 :memref<2x4xf32>) + return +} + +// ----- + +func.func @invalid_matmul_dim_a(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { + // expected-error @+1 {{Unexpected dim expression in map result}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) + return +} + +// ----- + +func.func @invalid_matmul_dim_b(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { + // expected-error @+1 {{Unexpected dim expression in map result}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) + return +} + +// ----- + +func.func @invalid_transpose_a_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { + // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 1}} + %0 = linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + 
affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) + outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> + return %0: tensor<4x64xf32> +} + +// ----- + +func.func @invalid_transpose_b_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { + // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #1 to be 1, but found 64}} + %0 = linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) + outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> + return %0: tensor<4x64xf32> +} + +// ----- + +func.func @invalid_bcast_a(%arg0: memref<3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_bcast_a_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #0 (1)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, 
d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_bcast_b_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #1 (1)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 5, but found 7}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_matmul_bcast_b_transpose_a_wrong_dim(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op Unexpected dim expression in map result.}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_indexing_maps_placement_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) { + // expected-error @+2 {{custom op 'indexing_maps' is unknown (tried 'func.indexing_maps' as well)}} + linalg.matmul ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) outs(%init : tensor<4x64xf32>) + indexing_maps = [ + affine_map<(d0, d1, d2) -> 
(d0, d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + return +} + +// ----- + func.func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2x1xf32>, %output: memref<1x2x3x1xf32>) { // expected-error @+1 {{inferred input/output operand #0 has shape's dimension #1 to be greater than or equal to 4, but found 3}} linalg.conv_2d_nhwc_hwcf diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 02ecbed232c8b5..65c18de8424771 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1201,6 +1201,249 @@ func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %a // ----- +// CHECK-LABEL: func @matmul_transpose_a_explicit +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) +func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// ----- + +func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: 
memref<3x5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_a +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : 
memref<5xf32>, memref<5x7xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_a_dim1 +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_b +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: 
#[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_bcast_a_b( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_b_dim1 +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @dynamic_matmul_bcast_a(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @dynamic_matmul_bcast_a( +// CHECK-SAME: %[[VAL_0:.*]]: memref, +// CHECK-SAME: %[[VAL_1:.*]]: memref, +// CHECK-SAME: %[[VAL_2:.*]]: memref) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) outs(%[[VAL_2]] : memref) indexing_maps = [#[[$ATTR_0]], 
#[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_bcast_a_transpose_b( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_bcast_b_transpose_a( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// 
CHECK: return +// CHECK: } + +// ----- + // CHECK-LABEL: func @matmul_transpose_b // CHECK: linalg.matmul_transpose_b // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>) diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 3bfbcf7d7f7c81..72045a07b2da80 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -84,81 +84,6 @@ def named_form(lhs, rhs): print(module) - -# CHECK-LABEL: TEST: testNamedStructuredOpGenericForm -@run -def testNamedStructuredOpGenericForm(): - with Context() as ctx, Location.unknown(): - module = Module.create() - f32 = F32Type.get() - with InsertionPoint(module.body): - - @func.FuncOp.from_py_func( - RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) - ) - def named_form(lhs, rhs): - init_result = tensor.empty([4, 8], f32) - # CHECK: "linalg.matmul"(%{{.*}}) - # CHECK-SAME: cast = #linalg.type_fn - # CHECK-SAME: operandSegmentSizes = array - # CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32): - # CHECK-NEXT: arith.mulf{{.*}} (f32, f32) -> f32 - # CHECK-NEXT: arith.addf{{.*}} (f32, f32) -> f32 - # CHECK-NEXT: linalg.yield{{.*}} (f32) -> () - # CHECK-NEXT: (tensor<4x16xf32>, tensor<16x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> - return linalg.matmul(lhs, rhs, outs=[init_result]) - - module.operation.print(print_generic_op_form=True) - - -# CHECK-LABEL: TEST: testNamedStructuredAsGenericOp -@run -def testNamedStructuredAsGenericOp(): - with Context() as ctx, Location.unknown(): - module = Module.create() - f32 = F32Type.get() - with InsertionPoint(module.body): - - @func.FuncOp.from_py_func( - RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) - ) - def generic_form(lhs, rhs): - init_result = tensor.EmptyOp([4, 8], f32) - # CHECK: linalg.generic - return linalg.matmul( - lhs, rhs, outs=[init_result.result], emit_generic=True - ) - - print(module) - - -# CHECK-LABEL: TEST: 
testOpResultFromOtherOp -@run -def testOpResultFromOtherOp(): - with Context(), Location.unknown(): - module = Module.create() - f32 = F32Type.get() - with InsertionPoint(module.body): - - @func.FuncOp.from_py_func( - RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) - ) - def pass_an_op_directly(arg0, arg1): - one = arith.ConstantOp(F32Type.get(), 1.0) - # CHECK: %[[LHS:.*]] = linalg.fill - lhs = linalg.fill(one, outs=[arg0]) - # CHECK: %[[RHS:.*]] = linalg.fill - rhs = linalg.fill(one, outs=[arg1]) - # CHECK: %[[INIT:.*]] = tensor.empty - init = tensor.EmptyOp([4, 8], f32) - # CHECK: linalg.matmul - # CHECK: ins(%[[LHS]], %[[RHS]] - # CHECK: outs(%[[INIT]] - return linalg.matmul(lhs, rhs, outs=init) - - print(module) - - # CHECK-LABEL: TEST: testIdentityRegionOps @run def testIdentityRegionOps(): diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index aa5a52a21f1251..f820cb7ee8c3c4 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -681,7 +681,11 @@ ParseResult {0}::parse(OpAsmParser &parser, OperationState &result) {{ {0}::getNumRegionArgs(), {0}::getRegionBuilder()); } void {0}::print(OpAsmPrinter &p) {{ - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs()); + SmallVector elidedAttrs = {{"operandSegmentSizes", + "linalg.memoized_indexing_maps", + "indexing_maps"}; + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); } )FMT"; From cb5fbd2f60a5a588bfa4668ea8269c3568cbff6e Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 10 Oct 2024 09:01:50 -0700 Subject: [PATCH 038/177] [CodeLayout] Do not verify after assigning blocks (#111754) Rather than invariantly running `F->verify()` when asserts are enabled, run machine IR verification in LIT tests only. 
Swap `CHECK-PERF` and `CHECK-SIZE` in `code_placement_ext_tsp_large.ll`. Remove `={0,1,true,false}` from flags in tests. --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 7 +--- .../CodeGen/X86/code_placement_ext_tsp.ll | 2 +- .../X86/code_placement_ext_tsp_large.ll | 8 ++--- .../X86/code_placement_ext_tsp_size.ll | 34 +++++++++---------- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index c42e63202c3b5a..dd5220b4599f95 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3572,7 +3572,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (UseExtTspForPerf || UseExtTspForSize) { assert( !(UseExtTspForPerf && UseExtTspForSize) && - "UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly"); + "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously"); applyExtTsp(/*OptForSize=*/UseExtTspForSize); createCFGChainExtTsp(); } @@ -3745,11 +3745,6 @@ void MachineBlockPlacement::assignBlockOrder( continue; MBB.updateTerminator(FTMBB); } - -#ifndef NDEBUG - // Make sure we correctly constructed all branches. 
- F->verify(this, "After optimized block reordering", &errs()); -#endif } void MachineBlockPlacement::createCFGChainExtTsp() { diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll index be0b9820e14541..37e3245467c869 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll @@ -1,5 +1,5 @@ ;; See also llvm/unittests/Transforms/Utils/CodeLayoutTest.cpp -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 < %s | FileCheck %s +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement < %s | FileCheck %s define void @func1a() { ; Test that the algorithm positions the most likely successor first diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll index ac172d32c6d8b6..24c52f1e88656e 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll @@ -1,8 +1,8 @@ ; REQUIRES: asserts -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=128 -debug-only=block-placement < %s 2>&1 | FileCheck %s -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=1 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK2 -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=0 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK3 -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-block-placement-max-blocks=8 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK4 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement -ext-tsp-chain-split-threshold=128 -debug-only=block-placement < %s 2>&1 | FileCheck %s 
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement -ext-tsp-chain-split-threshold=1 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK2 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK3 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement -ext-tsp-block-placement-max-blocks=8 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK4 @yydebug = dso_local global i32 0, align 4 diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll index 59eaf2586f1737..e7a4d6d8fd23a5 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll @@ -1,5 +1,5 @@ -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s -check-prefix=CHECK-PERF -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK-SIZE +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -apply-ext-tsp-for-size < %s | FileCheck %s -check-prefix=CHECK-SIZE +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK-PERF define void @func1() minsize { ; @@ -19,15 +19,15 @@ define void @func1() minsize { ; | b2 | <+ ; +-----+ ; -; CHECK-PERF-LABEL: func1: -; CHECK-PERF: %b0 -; CHECK-PERF: %b1 -; CHECK-PERF: %b2 -; ; CHECK-SIZE-LABEL: func1: ; CHECK-SIZE: %b0 -; CHECK-SIZE: %b2 ; CHECK-SIZE: %b1 +; CHECK-SIZE: %b2 +; +; CHECK-PERF-LABEL: func1: +; CHECK-PERF: %b0 +; CHECK-PERF: %b2 +; CHECK-PERF: %b1 b0: %call = call zeroext i1 @a() @@ -75,21 +75,21 @@ define void @func_loop() minsize !prof !9 { ; | end | ; +--------+ ; -; CHECK-PERF-LABEL: func_loop: -; CHECK-PERF: %entry -; CHECK-PERF: %header -; CHECK-PERF: 
%if.then -; CHECK-PERF: %if.else -; CHECK-PERF: %if.end -; CHECK-PERF: %end -; ; CHECK-SIZE-LABEL: func_loop: ; CHECK-SIZE: %entry ; CHECK-SIZE: %header +; CHECK-SIZE: %if.then ; CHECK-SIZE: %if.else ; CHECK-SIZE: %if.end -; CHECK-SIZE: %if.then ; CHECK-SIZE: %end +; +; CHECK-PERF-LABEL: func_loop: +; CHECK-PERF: %entry +; CHECK-PERF: %header +; CHECK-PERF: %if.else +; CHECK-PERF: %if.end +; CHECK-PERF: %if.then +; CHECK-PERF: %end entry: br label %header From 25d9688c43d37c0c918e9b8ab2f67be35b0fb75f Mon Sep 17 00:00:00 2001 From: yronglin Date: Fri, 11 Oct 2024 00:04:02 +0800 Subject: [PATCH 039/177] [Clang] Extend lifetime of temporaries in mem-default-init for P2718R0 (#86960) Depends on [CWG1815](https://github.com/llvm/llvm-project/pull/108039). Fixes https://github.com/llvm/llvm-project/issues/85613. In [[Clang] Implement P2718R0 "Lifetime extension in range-based for loops"](https://github.com/llvm/llvm-project/pull/76361), we've not implement the lifetime extensions for the temporaries which in `CXXDefaultInitExpr`. As the confirmation in https://github.com/llvm/llvm-project/issues/85613, we should extend lifetime for that. To avoid modifying current CodeGen rules, in a lifetime extension context, the cleanup of `CXXDefaultInitExpr` was ignored. --------- Signed-off-by: yronglin --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/SemaExpr.cpp | 2 + clang/lib/Sema/SemaInit.cpp | 2 + .../test/AST/ast-dump-for-range-lifetime.cpp | 59 +++++++++ clang/test/CXX/special/class.temporary/p6.cpp | 122 +++++++++++++++++- clang/www/cxx_status.html | 9 +- 6 files changed, 188 insertions(+), 9 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c0019cfe4658d7..e48835d4738007 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -171,6 +171,9 @@ C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Removed the restriction to literal types in constexpr functions in C++23 mode. 
+- Extend lifetime of temporaries in mem-default-init for P2718R0. Clang now fully + supported `P2718R0 Lifetime extension in range-based for loops `_. + C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e2141e03ca4230..4e37385710af5e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5649,6 +5649,8 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { runWithSufficientStackSpace(Loc, [&] { MarkDeclarationsReferencedInExpr(E, /*SkipLocalVariables=*/false); }); + if (isInLifetimeExtendingContext()) + DiscardCleanupsInEvaluationContext(); // C++11 [class.base.init]p7: // The initialization of each base and member constitutes a // full-expression. diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index edd1fe40fdf278..5d6a586fe5a2cf 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -763,6 +763,8 @@ void InitListChecker::FillInEmptyInitForField(unsigned Init, FieldDecl *Field, SemaRef.currentEvaluationContext().DelayedDefaultInitializationContext = SemaRef.parentEvaluationContext() .DelayedDefaultInitializationContext; + SemaRef.currentEvaluationContext().InLifetimeExtendingContext = + SemaRef.parentEvaluationContext().InLifetimeExtendingContext; DIE = SemaRef.BuildCXXDefaultInitExpr(Loc, Field); } if (DIE.isInvalid()) { diff --git a/clang/test/AST/ast-dump-for-range-lifetime.cpp b/clang/test/AST/ast-dump-for-range-lifetime.cpp index 0e92b6990ed504..ee046be19ab632 100644 --- a/clang/test/AST/ast-dump-for-range-lifetime.cpp +++ b/clang/test/AST/ast-dump-for-range-lifetime.cpp @@ -449,4 +449,63 @@ void test13() { for (auto e : dg().r().g().r().g().r().g()) bar(e); } + +extern "C" void exit(int); + +struct A14 { + int arr[1]; + ~A14() noexcept(false) { throw 42; } +}; + +struct B14 { + int x; + const A14 &a = A14{{0}}; + const int *begin() { return a.arr; } + const int *end() { return &a.arr[1]; } 
+}; + +void test14() { + // The ExprWithCleanups in CXXDefaultInitExpr will be ignored. + + // CHECK: FunctionDecl {{.*}} test14 'void ()' + // CHECK: -CXXForRangeStmt {{.*}} + // CHECK-NEXT: |-<<>> + // CHECK-NEXT: |-DeclStmt {{.*}} + // CHECK-NEXT: | `-VarDecl {{.*}} implicit used __range1 'const int (&)[1]' cinit + // CHECK-NEXT: | `-ExprWithCleanups {{.*}} 'const int[1]' lvalue + // CHECK-NEXT: | `-MemberExpr {{.*}} 'const int[1]' lvalue .arr {{.*}} + // CHECK-NEXT: | `-MemberExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue .a {{.*}} + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'B14':'P2718R0::B14' xvalue extended by Var {{.*}} '__range1' 'const int (&)[1]' + // CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'B14':'P2718R0::B14' functional cast to B14 + // CHECK-NEXT: | `-InitListExpr {{.*}} 'B14':'P2718R0::B14' + // CHECK-NEXT: | |-IntegerLiteral {{.*}} 'int' 0 + // CHECK-NEXT: | `-CXXDefaultInitExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue has rewritten init + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue extended by Var {{.*}} '__range1' 'const int (&)[1]' + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'const A14':'const P2718R0::A14' + // CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'A14':'P2718R0::A14' functional cast to A14 + // CHECK-NEXT: | `-CXXBindTemporaryExpr {{.*}} 'A14':'P2718R0::A14' (CXXTemporary {{.*}}) + // CHECK-NEXT: | `-InitListExpr {{.*}} 'A14':'P2718R0::A14' + // CHECK-NEXT: | `-InitListExpr {{.*}} 'int[1]' + // CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 0 + for (auto &&x : B14{0}.a.arr) { exit(0); } + + // CHECK: -CXXForRangeStmt {{.*}} + // CHECK-NEXT: |-<<>> + // CHECK-NEXT: |-DeclStmt {{.*}} + // CHECK-NEXT: | `-VarDecl {{.*}} col:19 implicit used __range1 'B14 &&' cinit + // CHECK-NEXT: | `-ExprWithCleanups {{.*}} 'B14':'P2718R0::B14' xvalue + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'B14':'P2718R0::B14' xvalue extended by Var {{.*}} '__range1' 'B14 &&' + // 
CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'B14':'P2718R0::B14' functional cast to B14 + // CHECK-NEXT: | `-InitListExpr {{.*}} 'B14':'P2718R0::B14' + // CHECK-NEXT: | |-IntegerLiteral {{.*}} 'int' 0 + // CHECK-NEXT: | `-CXXDefaultInitExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue has rewritten init + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue extended by Var {{.*}} '__range1' 'B14 &&' + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'const A14':'const P2718R0::A14' + // CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'A14':'P2718R0::A14' functional cast to A14 + // CHECK-NEXT: | `-CXXBindTemporaryExpr {{.*}} 'A14':'P2718R0::A14' (CXXTemporary {{.*}}) + // CHECK-NEXT: | `-InitListExpr {{.*}} 'A14':'P2718R0::A14' + // CHECK-NEXT: | `-InitListExpr {{.*}} 'int[1]' + // CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 0 + for (auto &&x : B14{0}) { exit(0); } +} } // namespace P2718R0 diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp index a6d2adfd1fd2c5..2b1b531b7172ca 100644 --- a/clang/test/CXX/special/class.temporary/p6.cpp +++ b/clang/test/CXX/special/class.temporary/p6.cpp @@ -463,6 +463,80 @@ template void default_arg_dependent_context2(); template void default_arg_dependent_context3(); } // namespace default_arg +namespace default_init { +template +struct DepA { + T arr[1]; + ~DepA() {} +}; + +template +struct DepB { + int x; + const DepA &a = DepA{{0}}; + ~DepB() {} + const int *begin() { return a.arr; } + const int *end() { return &a.arr[1]; } +}; + +template +void default_init1_dependent() { + // CHECK-CXX23: void @_ZN7P2718R012default_init23default_init1_dependentINS0_4DepBIiEEEEvv() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepBIiED1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepAIiED1Ev( + for (auto &&x : T{0}) {} +} + +template +void default_init2_dependent() { + // CHECK-CXX23: 
void @_ZN7P2718R012default_init23default_init2_dependentINS0_4DepBIiEEEEvv() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepBIiED1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepAIiED1Ev( + for (auto &&x : T{0}.a.arr) {} +} + +template void default_init1_dependent>(); +template void default_init2_dependent>(); +} // namespace default_init + +// -- Examples from https://wg21.link/p2718r0 +extern void block_scope_begin_function(); +extern void block_scope_end_function(); +namespace std_examples { +using T = std::list; +const T& f1(const T& t) { return t; } +const T& f2(T t) { return t; } +T g(); +void foo() { + // CHECK-CXX23: define {{.*}} void @_ZN7P2718R012std_examples3fooEv() + // CHECK-CXX23: call void @_ZN7P2718R026block_scope_begin_functionEv + block_scope_begin_function(); + { + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012std_examples1gEv + // CHECK-CXX23-NEXT: call {{.*}} @_ZN7P2718R012std_examples2f1ERKSt4listIiE + // CHECK-CXX23: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZNSt4listIiED1Ev + for (auto e : f1(g())) {} // OK, lifetime of return value of g() extended + } + // CHECK-CXX23: call void @_ZN7P2718R024block_scope_end_functionEv + block_scope_end_function(); + + // The lifetime of temporary returned by g() in this case will not be extended. 
+ // CHECK-CXX23: call void @_ZN7P2718R026block_scope_begin_functionEv + block_scope_begin_function(); + { + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012std_examples1gEv + // CHECK-CXX23-NEXT: call {{.*}} @_ZN7P2718R012std_examples2f2ESt4listIiE + // CHECK-CXX23-NEXT: call void @_ZNSt4listIiED1Ev + for (auto e : f2(g())) {} // undefined behavior + } + // CHECK-CXX23: call void @_ZN7P2718R024block_scope_end_functionEv + block_scope_end_function(); +} +} // namespace std_examples + namespace basic { using T = std::list; const T& f1(const T& t) { return t; } @@ -579,5 +653,51 @@ void default_arg3() { for (auto e : C(0, C(0, C(0, C())))) {} } } // namespace default_arg -} // namespace P2718R0 +namespace default_init { +struct X { + int x; + ~X() {} +}; + +struct Y { + int y; + const X &x = X{1}; + ~Y() {} +}; + +struct A { + int arr[1]; + const Y &y = Y{1}; + ~A() {} +}; + +struct B { + int x; + const A &a = A{{0}}; + ~B() {} + const int *begin() { return a.arr; } + const int *end() { return &a.arr[1]; } +}; + +void default_init1() { + // CHECK-CXX23: void @_ZN7P2718R012default_init13default_init1Ev() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1BD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1AD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1YD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1XD1Ev( + for (auto &&x : B{0}) {} +} + +void default_init2() { + // CHECK-CXX23: void @_ZN7P2718R012default_init13default_init2Ev() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1BD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1AD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1YD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1XD1Ev( + for (auto &&x : B{0}.a.arr) {} +} +} // namespace default_init +} // namespace P2718R0 diff --git a/clang/www/cxx_status.html 
b/clang/www/cxx_status.html index 3f6a46c08c8514..d59cbbbbec1b5b 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -475,14 +475,7 @@

C++23 implementation status

Lifetime extension in range-based for loops
P2718R0 - -
- Clang 19 (Partial) - The lifetime extension of temporaries bound to member references - by default member initializers in aggregate initialization was - not supported now. -
- + Clang 20 From 2190ffa0f7e874d04fd0f750142135faa5df5d6b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 11:07:14 -0500 Subject: [PATCH 040/177] [libc] Fix missing namespace declarations --- libc/src/stdio/asprintf.h | 4 ++-- libc/src/stdio/vasprintf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/src/stdio/asprintf.h b/libc/src/stdio/asprintf.h index 222dfdee9d4fd7..168721c4f98b98 100644 --- a/libc/src/stdio/asprintf.h +++ b/libc/src/stdio/asprintf.h @@ -11,10 +11,10 @@ #include "src/__support/macros/config.h" -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { int asprintf(char **__restrict s, const char *__restrict format, ...); -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STDIO_ASPRINTF_H diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h index 8b286fe69bf203..b914c2f9ae0789 100644 --- a/libc/src/stdio/vasprintf.h +++ b/libc/src/stdio/vasprintf.h @@ -11,11 +11,11 @@ #include -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { int vasprintf(char **__restrict s, const char *__restrict format, va_list vlist); -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STDIO_VASPRINTF_H From 73e74e496ec32a13a5ae71df71364065f7be3cca Mon Sep 17 00:00:00 2001 From: Eric Astor Date: Thu, 10 Oct 2024 12:21:34 -0400 Subject: [PATCH 041/177] [clang][frontend] Support applying the annotate attribute to statements (#111841) By allowing AnnotateAttr to be applied to statements, users can place arbitrary information in the AST for later use. For example, this can be used for HW-targeted language extensions that involve specialized loop annotations. 
--- clang/include/clang/AST/Attr.h | 17 +++++++++ clang/include/clang/Basic/Attr.td | 7 +++- clang/include/clang/Sema/Sema.h | 7 ++-- clang/lib/Sema/Sema.cpp | 28 ++++++++++++++ clang/lib/Sema/SemaDeclAttr.cpp | 25 ++---------- clang/lib/Sema/SemaStmtAttr.cpp | 2 + clang/lib/Sema/SemaTemplateInstantiate.cpp | 13 +++++++ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 5 ++- clang/test/AST/attr-print-emit.cpp | 3 ++ clang/test/Sema/annotate.c | 3 ++ clang/test/SemaTemplate/attributes.cpp | 38 +++++++++++++++++++ clang/utils/TableGen/ClangAttrEmitter.cpp | 30 ++++++++------- 12 files changed, 137 insertions(+), 41 deletions(-) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index ac44e9fdd7c4e9..725498e132fc28 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -197,6 +197,23 @@ class InheritableParamAttr : public InheritableAttr { } }; +class InheritableParamOrStmtAttr : public InheritableParamAttr { +protected: + InheritableParamOrStmtAttr(ASTContext &Context, + const AttributeCommonInfo &CommonInfo, + attr::Kind AK, bool IsLateParsed, + bool InheritEvenIfAlreadyPresent) + : InheritableParamAttr(Context, CommonInfo, AK, IsLateParsed, + InheritEvenIfAlreadyPresent) {} + +public: + // Implement isa/cast/dyncast/etc. + static bool classof(const Attr *A) { + return A->getKind() >= attr::FirstInheritableParamOrStmtAttr && + A->getKind() <= attr::LastInheritableParamOrStmtAttr; + } +}; + class HLSLAnnotationAttr : public InheritableAttr { protected: HLSLAnnotationAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index fbcbf0ed416416..ec3d6e0079f630 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -759,6 +759,11 @@ class TargetSpecificAttr { /// redeclarations, even when it's written on a parameter. 
class InheritableParamAttr : InheritableAttr; +/// A attribute that is either a declaration attribute or a statement attribute, +/// and if used as a declaration attribute, is inherited by later +/// redeclarations, even when it's written on a parameter. +class InheritableParamOrStmtAttr : InheritableParamAttr; + /// An attribute which changes the ABI rules for a specific parameter. class ParameterABIAttr : InheritableParamAttr { let Subjects = SubjectList<[ParmVar]>; @@ -928,7 +933,7 @@ def AnalyzerNoReturn : InheritableAttr { let Documentation = [Undocumented]; } -def Annotate : InheritableParamAttr { +def Annotate : InheritableParamOrStmtAttr { let Spellings = [Clang<"annotate">]; let Args = [StringArgument<"Annotation">, VariadicExprArgument<"Args">]; // Ensure that the annotate attribute can be used with diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index ef010fafb1573e..f8118ca64ad3f2 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4528,9 +4528,10 @@ class Sema final : public SemaBase { /// declaration. void AddAlignValueAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E); - /// AddAnnotationAttr - Adds an annotation Annot with Args arguments to D. - void AddAnnotationAttr(Decl *D, const AttributeCommonInfo &CI, - StringRef Annot, MutableArrayRef Args); + /// CreateAnnotationAttr - Creates an annotation Annot with Args arguments. 
+ Attr *CreateAnnotationAttr(const AttributeCommonInfo &CI, StringRef Annot, + MutableArrayRef Args); + Attr *CreateAnnotationAttr(const ParsedAttr &AL); bool checkMSInheritanceAttrOnDefinition(CXXRecordDecl *RD, SourceRange Range, bool BestCase, diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index f05760428458b1..9f91ee9a39f2f9 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2777,3 +2777,31 @@ bool Sema::isDeclaratorFunctionLike(Declarator &D) { }); return Result; } + +Attr *Sema::CreateAnnotationAttr(const AttributeCommonInfo &CI, StringRef Annot, + MutableArrayRef Args) { + + auto *A = AnnotateAttr::Create(Context, Annot, Args.data(), Args.size(), CI); + if (!ConstantFoldAttrArgs( + CI, MutableArrayRef(A->args_begin(), A->args_end()))) { + return nullptr; + } + return A; +} + +Attr *Sema::CreateAnnotationAttr(const ParsedAttr &AL) { + // Make sure that there is a string literal as the annotation's first + // argument. + StringRef Str; + if (!checkStringLiteralArgumentAttr(AL, 0, Str)) + return nullptr; + + llvm::SmallVector Args; + Args.reserve(AL.getNumArgs() - 1); + for (unsigned Idx = 1; Idx < AL.getNumArgs(); Idx++) { + assert(!AL.isArgIdent(Idx)); + Args.push_back(AL.getArgAsExpr(Idx)); + } + + return CreateAnnotationAttr(AL, Str, Args); +} diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index e2174ba926f17f..6759aae37afac1 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3958,30 +3958,11 @@ static void handleTransparentUnionAttr(Sema &S, Decl *D, const ParsedAttr &AL) { RD->addAttr(::new (S.Context) TransparentUnionAttr(S.Context, AL)); } -void Sema::AddAnnotationAttr(Decl *D, const AttributeCommonInfo &CI, - StringRef Str, MutableArrayRef Args) { - auto *Attr = AnnotateAttr::Create(Context, Str, Args.data(), Args.size(), CI); - if (ConstantFoldAttrArgs( - CI, MutableArrayRef(Attr->args_begin(), Attr->args_end()))) { - D->addAttr(Attr); - } -} - 
static void handleAnnotateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - // Make sure that there is a string literal as the annotation's first - // argument. - StringRef Str; - if (!S.checkStringLiteralArgumentAttr(AL, 0, Str)) - return; - - llvm::SmallVector Args; - Args.reserve(AL.getNumArgs() - 1); - for (unsigned Idx = 1; Idx < AL.getNumArgs(); Idx++) { - assert(!AL.isArgIdent(Idx)); - Args.push_back(AL.getArgAsExpr(Idx)); + auto *Attr = S.CreateAnnotationAttr(AL); + if (Attr) { + D->addAttr(Attr); } - - S.AddAnnotationAttr(D, AL, Str, Args); } static void handleAlignValueAttr(Sema &S, Decl *D, const ParsedAttr &AL) { diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index b9b3b4063bc383..d81c6de3428dc7 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -679,6 +679,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, return handleMSConstexprAttr(S, St, A, Range); case ParsedAttr::AT_NoConvergent: return handleNoConvergentAttr(S, St, A, Range); + case ParsedAttr::AT_Annotate: + return S.CreateAnnotationAttr(A); default: // N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a // declaration attribute is not written on a statement, but this code is diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 74252bd7513cd7..2f60c0beb22e73 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1552,6 +1552,7 @@ namespace { NamedDecl *FirstQualifierInScope = nullptr, bool AllowInjectedClassName = false); + const AnnotateAttr *TransformAnnotateAttr(const AnnotateAttr *AA); const CXXAssumeAttr *TransformCXXAssumeAttr(const CXXAssumeAttr *AA); const LoopHintAttr *TransformLoopHintAttr(const LoopHintAttr *LH); const NoInlineAttr *TransformStmtNoInlineAttr(const Stmt *OrigS, @@ -2182,6 +2183,18 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, Arg, 
PackIndex); } +const AnnotateAttr * +TemplateInstantiator::TransformAnnotateAttr(const AnnotateAttr *AA) { + SmallVector Args; + for (Expr *Arg : AA->args()) { + ExprResult Res = getDerived().TransformExpr(Arg); + if (Res.isUsable()) + Args.push_back(Res.get()); + } + return AnnotateAttr::CreateImplicit(getSema().Context, AA->getAnnotation(), + Args.data(), Args.size(), AA->getRange()); +} + const CXXAssumeAttr * TemplateInstantiator::TransformCXXAssumeAttr(const CXXAssumeAttr *AA) { ExprResult Res = getDerived().TransformExpr(AA->getAssumption()); diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 34558e1a005d5a..6b1af35f5c80a8 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -230,7 +230,10 @@ static void instantiateDependentAnnotationAttr( ActualArgs.insert(ActualArgs.begin(), Args.begin() + 1, Args.end()); std::swap(Args, ActualArgs); } - S.AddAnnotationAttr(New, *Attr, Str, Args); + auto *AA = S.CreateAnnotationAttr(*Attr, Str, Args); + if (AA) { + New->addAttr(AA); + } } static Expr *instantiateDependentFunctionAttrCondition( diff --git a/clang/test/AST/attr-print-emit.cpp b/clang/test/AST/attr-print-emit.cpp index d8e62ed5f6cd11..a9bca6778d0f1a 100644 --- a/clang/test/AST/attr-print-emit.cpp +++ b/clang/test/AST/attr-print-emit.cpp @@ -78,6 +78,9 @@ class C { ANNOTATE_ATTR int annotated_attr ANNOTATE_ATTR = 0; // CHECK: __attribute__((annotate("Annotated"))) int annotated_attr __attribute__((annotate("Annotated"))) = 0; +void increment() { [[clang::annotate("Annotated")]] annotated_attr++; } +// CHECK: {{\[\[}}clang::annotate("Annotated")]] annotated_attr++; + // FIXME: We do not print the attribute as written after the type specifier. 
int ANNOTATE_ATTR annotated_attr_fixme = 0; // CHECK: __attribute__((annotate("Annotated"))) int annotated_attr_fixme = 0; diff --git a/clang/test/Sema/annotate.c b/clang/test/Sema/annotate.c index b4551a102e6174..f2ef08d6378975 100644 --- a/clang/test/Sema/annotate.c +++ b/clang/test/Sema/annotate.c @@ -3,10 +3,12 @@ void __attribute__((annotate("foo"))) foo(float *a) { __attribute__((annotate("bar"))) int x; [[clang::annotate("bar")]] int x2; + [[clang::annotate("bar")]] x2 += 1; __attribute__((annotate(1))) int y; // expected-error {{expected string literal as argument of 'annotate' attribute}} [[clang::annotate(1)]] int y2; // expected-error {{expected string literal as argument of 'annotate' attribute}} __attribute__((annotate("bar", 1))) int z; [[clang::annotate("bar", 1)]] int z2; + [[clang::annotate("bar", 1)]] z2 += 1; int u = __builtin_annotation(z, (char*) 0); // expected-error {{second argument to __builtin_annotation must be a non-wide string constant}} int v = __builtin_annotation(z, (char*) L"bar"); // expected-error {{second argument to __builtin_annotation must be a non-wide string constant}} @@ -15,4 +17,5 @@ void __attribute__((annotate("foo"))) foo(float *a) { __attribute__((annotate())) int c; // expected-error {{'annotate' attribute takes at least 1 argument}} [[clang::annotate()]] int c2; // expected-error {{'annotate' attribute takes at least 1 argument}} + [[clang::annotate()]] c2 += 1; // expected-error {{'annotate' attribute takes at least 1 argument}} } diff --git a/clang/test/SemaTemplate/attributes.cpp b/clang/test/SemaTemplate/attributes.cpp index f6c9f13f0842d2..dea19d09745ca2 100644 --- a/clang/test/SemaTemplate/attributes.cpp +++ b/clang/test/SemaTemplate/attributes.cpp @@ -65,6 +65,17 @@ namespace attribute_annotate { template [[clang::annotate("ANNOTATE_FOO"), clang::annotate("ANNOTATE_BAR")]] void HasAnnotations(); void UseAnnotations() { HasAnnotations(); } +// CHECK: FunctionTemplateDecl {{.*}} HasStmtAnnotations +// CHECK: 
AnnotateAttr {{.*}} "ANNOTATE_BAZ" +// CHECK: FunctionDecl {{.*}} HasStmtAnnotations +// CHECK: TemplateArgument type 'int' +// CHECK: AnnotateAttr {{.*}} "ANNOTATE_BAZ" +template void HasStmtAnnotations() { + int x = 0; + [[clang::annotate("ANNOTATE_BAZ")]] x++; +} +void UseStmtAnnotations() { HasStmtAnnotations(); } + // CHECK: FunctionTemplateDecl {{.*}} HasPackAnnotations // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is // CHECK-NEXT: FunctionDecl {{.*}} HasPackAnnotations 'void ()' @@ -95,6 +106,33 @@ void UseAnnotations() { HasAnnotations(); } template [[clang::annotate("ANNOTATE_BAZ", Is...)]] void HasPackAnnotations(); void UsePackAnnotations() { HasPackAnnotations<1, 2, 3>(); } +// CHECK: FunctionTemplateDecl {{.*}} HasStmtPackAnnotations +// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is +// CHECK-NEXT: FunctionDecl {{.*}} HasStmtPackAnnotations 'void ()' +// CHECK: AttributedStmt {{.*}} +// CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_QUUX" +// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' +// CHECK: FunctionDecl {{.*}} used HasStmtPackAnnotations 'void ()' +// CHECK-NEXT: TemplateArgument{{.*}} pack +// CHECK-NEXT: TemplateArgument{{.*}} integral '1' +// CHECK-NEXT: TemplateArgument{{.*}} integral '2' +// CHECK-NEXT: TemplateArgument{{.*}} integral '3' +// CHECK: AttributedStmt {{.*}} +// CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_QUUX" +// CHECK-NEXT: PackExpansionExpr {{.*}} +// CHECK-NEXT: SubstNonTypeTemplateParmPackExpr {{.*}} +// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... 
Is +// CHECK-NEXT: TemplateArgument pack '<1, 2, 3>' +// CHECK-NEXT: TemplateArgument integral '1' +// CHECK-NEXT: TemplateArgument integral '2' +// CHECK-NEXT: TemplateArgument integral '3' +template void HasStmtPackAnnotations() { + int x = 0; + [[clang::annotate("ANNOTATE_QUUX", Is...)]] x++; +} +void UseStmtPackAnnotations() { HasStmtPackAnnotations<1, 2, 3>(); } + template [[clang::annotate(Is...)]] void HasOnlyPackAnnotation() {} // expected-error {{expected string literal as argument of 'annotate' attribute}} void UseOnlyPackAnnotations() { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 28b7ec8f822cf8..4890d249c6d8f7 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3282,16 +3282,16 @@ namespace { } // end anonymous namespace static const AttrClassDescriptor AttrClassDescriptors[] = { - { "ATTR", "Attr" }, - { "TYPE_ATTR", "TypeAttr" }, - { "STMT_ATTR", "StmtAttr" }, - { "DECL_OR_STMT_ATTR", "DeclOrStmtAttr" }, - { "INHERITABLE_ATTR", "InheritableAttr" }, - { "DECL_OR_TYPE_ATTR", "DeclOrTypeAttr" }, - { "INHERITABLE_PARAM_ATTR", "InheritableParamAttr" }, - { "PARAMETER_ABI_ATTR", "ParameterABIAttr" }, - { "HLSL_ANNOTATION_ATTR", "HLSLAnnotationAttr"} -}; + {"ATTR", "Attr"}, + {"TYPE_ATTR", "TypeAttr"}, + {"STMT_ATTR", "StmtAttr"}, + {"DECL_OR_STMT_ATTR", "DeclOrStmtAttr"}, + {"INHERITABLE_ATTR", "InheritableAttr"}, + {"DECL_OR_TYPE_ATTR", "DeclOrTypeAttr"}, + {"INHERITABLE_PARAM_ATTR", "InheritableParamAttr"}, + {"INHERITABLE_PARAM_OR_STMT_ATTR", "InheritableParamOrStmtAttr"}, + {"PARAMETER_ABI_ATTR", "ParameterABIAttr"}, + {"HLSL_ANNOTATION_ATTR", "HLSLAnnotationAttr"}}; static void emitDefaultDefine(raw_ostream &OS, StringRef name, const char *superName) { @@ -4319,10 +4319,12 @@ static void GenerateMutualExclusionsChecks(const Record &Attr, // This means the attribute is either a statement attribute, a decl // attribute, or both; find out 
which. - bool CurAttrIsStmtAttr = - Attr.isSubClassOf("StmtAttr") || Attr.isSubClassOf("DeclOrStmtAttr"); - bool CurAttrIsDeclAttr = - !CurAttrIsStmtAttr || Attr.isSubClassOf("DeclOrStmtAttr"); + bool CurAttrIsStmtAttr = Attr.isSubClassOf("StmtAttr") || + Attr.isSubClassOf("DeclOrStmtAttr") || + Attr.isSubClassOf("InheritableParamOrStmtAttr"); + bool CurAttrIsDeclAttr = !CurAttrIsStmtAttr || + Attr.isSubClassOf("DeclOrStmtAttr") || + Attr.isSubClassOf("InheritableParamOrStmtAttr"); std::vector DeclAttrs, StmtAttrs; From c04b640a919de50342fca9e0afcbf4b710c7ea2f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 11:21:58 -0500 Subject: [PATCH 042/177] [libc] Add missing config include --- libc/src/stdio/vasprintf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h index b914c2f9ae0789..7a98568edbc071 100644 --- a/libc/src/stdio/vasprintf.h +++ b/libc/src/stdio/vasprintf.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDIO_VASPRINTF_H #define LLVM_LIBC_SRC_STDIO_VASPRINTF_H +#include "src/__support/macros/config.h" #include namespace LIBC_NAMESPACE_DECL { From cc9e7cb99b63559c5baf7e380287e5658c412370 Mon Sep 17 00:00:00 2001 From: TatWai Chong <78814694+tatwaichong@users.noreply.github.com> Date: Thu, 10 Oct 2024 09:54:34 -0700 Subject: [PATCH 043/177] [mlir][tosa] Change the type of profile option to ListOption (#111214) In tosa valiation pass, change the type of profile option to ListOption. Now TOSA profiles is turned from hierarchical to composable. Each profile is an independent set, i.e. an target can implement multiple profiles. Set the profile option to none by default, and limit to profiles if requested. The profiles can be specified via command line, e.g. $ mlir-opt ... --tosa-validate="profile=bi,mi" which tells the valiation pass that BI and MI are enabled. 
Change-Id: I1fb8d0c1b27eccd768349b6eb4234093313efb57 --- .../mlir/Conversion/TosaToLinalg/TosaToLinalg.h | 4 ++-- .../mlir/Dialect/Tosa/Transforms/Passes.td | 17 +++-------------- .../TosaToLinalg/TosaToLinalgPass.cpp | 2 +- .../Dialect/Tosa/Transforms/TosaValidation.cpp | 16 +++++++++++++++- mlir/test/Dialect/Tosa/invalid.mlir | 8 +++++++- mlir/test/Dialect/Tosa/level_check.mlir | 6 +++++- 6 files changed, 33 insertions(+), 20 deletions(-) diff --git a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h index 192583f347b8a4..1822016fc88fe6 100644 --- a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h +++ b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h @@ -39,8 +39,8 @@ void addTosaToLinalgPasses( TosaToLinalgNamedOptions(), // Note: Default to 'none' level unless otherwise specified. std::optional validationOptions = - tosa::TosaValidationOptions{tosa::TosaProfileEnum::Undefined, false, - tosa::TosaLevelEnum::None}); + tosa::TosaValidationOptions{ + {"none"}, false, tosa::TosaLevelEnum::None}); /// Populates TOSA to linalg pipelines /// Currently, this includes only the "tosa-to-linalg-pipeline". 
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td index c0352fa88fe08d..dac67633769c76 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td @@ -76,7 +76,7 @@ def TosaProfileType : I32EnumAttr<"TosaProfileEnum", "Tosa profile", I32EnumAttrCase<"BaseInference", 0, "bi">, I32EnumAttrCase<"MainInference", 1, "mi">, I32EnumAttrCase<"MainTraining", 2, "mt">, - I32EnumAttrCase<"Undefined", 3> + I32EnumAttrCase<"Undefined", 3, "none"> ]>{ let cppNamespace = "mlir::tosa"; } @@ -97,19 +97,8 @@ def TosaValidation : Pass<"tosa-validate", "mlir::ModuleOp"> { }]; let options = [ - Option<"profile", "profile", "mlir::tosa::TosaProfileEnum", - /*default=*/"mlir::tosa::TosaProfileEnum::Undefined", - "Validate if operations match for the given profile", - [{::llvm::cl::values( - clEnumValN(mlir::tosa::TosaProfileEnum::BaseInference, "bi", - "Use Base Inference profile."), - clEnumValN(mlir::tosa::TosaProfileEnum::MainInference, "mi", - "Use Main Inference profile."), - clEnumValN(mlir::tosa::TosaProfileEnum::MainTraining, "mt", - "Use Main Training profile."), - clEnumValN(mlir::tosa::TosaProfileEnum::Undefined, "undefined", - "Do not define a profile.") - )}]>, + ListOption<"profile", "profile", "std::string", + "Validate if operations match for the given profile set">, Option<"StrictOperationSpecAlignment", "strict-op-spec-alignment", "bool", /*default=*/"false", "Verify if the properties of certain operations align the spec requirement">, diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp index 44036d7c31a912..06a7262c467421 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp @@ -115,7 +115,7 @@ void mlir::tosa::registerTosaToLinalgPipelines() { TosaToLinalgOptions tosaToLinalgOptions; 
TosaToLinalgNamedOptions tosaToLinalgNamedOptions; TosaValidationOptions validationOptions; - validationOptions.profile = tosa::TosaProfileEnum::BaseInference; + validationOptions.profile = {"none"}; validationOptions.StrictOperationSpecAlignment = true; validationOptions.level = tosa::TosaLevelEnum::EightK; tosa::addTosaToLinalgPasses(pm, tosaToLinalgOptions, diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index b78c372af77e64..e390a613b58077 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -405,14 +405,28 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { if (level == TosaLevelEnum::EightK) { tosaLevel = TOSA_LEVEL_EIGHTK; } + + if (!profile.empty()) { + for (std::string &prof : profile) { + auto profSymbol = symbolizeTosaProfileEnum(prof); + if (profSymbol) { + enabled_profiles.push_back(profSymbol.value()); + } + } + } } bool CheckVariable(Operation *op); bool CheckVariableReadOrWrite(Operation *op); bool isValidElementType(Type type); + bool isEnabledProfile(TosaProfileEnum prof) { + return std::find(enabled_profiles.begin(), enabled_profiles.end(), prof) != + std::end(enabled_profiles); + } SmallVector> constCheckers; + SmallVector enabled_profiles; TosaLevel tosaLevel; DenseMap variablesMap; }; @@ -507,7 +521,7 @@ LogicalResult TosaValidation::applyVariableCheck(Operation *op) { bool TosaValidation::isValidElementType(Type type) { if (isa(type)) { - if (profile == TosaProfileEnum::BaseInference) + if (!isEnabledProfile(TosaProfileEnum::MainInference)) return false; return type.isF32() || type.isF16() || type.isBF16(); } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index e5c5b9b3663903..b9298b66643538 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -1,4 +1,10 @@ -// RUN: mlir-opt %s -split-input-file 
-verify-diagnostics --tosa-validate=strict-op-spec-alignment +//-------------------------------------------------------------------------------------------------- +// Test expected errors in terms of the shape and type of tensor, and the argument type of +// operation. Excludes the profile compliance checking since it is performed earlier in the +// validation flow. +//-------------------------------------------------------------------------------------------------- + +// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=bi,mi,mt strict-op-spec-alignment" func.func @test_const() -> tensor<1xf32> { diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index 9b652f2d0bd142..e851019362958f 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -1,4 +1,8 @@ -// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate +//-------------------------------------------------------------------------------------------------- +// Enable all supported profiles to focus the verification of expected level errors. +//-------------------------------------------------------------------------------------------------- + +// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=bi,mi,mt" func.func @test_argmax(%arg0: tensor<1x1x1x1x29x29x4xf32>) -> tensor<1x1x1x1x29x4xi32> { From f2c5aa920054fa60372a161520e6ea8e8d23880d Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 09:53:45 -0700 Subject: [PATCH 044/177] [lldb] Fix a variety of LLDB_LOG format strings LLVM now triggers an assertion when the format string and arguments don't match. Fix a variety of incorrect format strings I discovered when enabling logging with a debug build.
--- .../ExpressionParser/Clang/ClangExpressionDeclMap.cpp | 4 ++-- .../ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp | 2 +- lldb/source/Target/ScriptedThreadPlan.cpp | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp index f994d025043352..5edaa9e4e053cc 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp @@ -934,7 +934,7 @@ void ClangExpressionDeclMap::LookUpLldbObjCClass(NameSearchContext &context) { QualType(interface_type, 0).getAsOpaquePtr(), function_decl_ctx.GetTypeSystem()->weak_from_this()); - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_objc_class: {1}", + LLDB_LOG(log, " FEVD Adding type for $__lldb_objc_class: {0}", ClangUtil::ToString(interface_type)); AddOneType(context, class_user_type); @@ -974,7 +974,7 @@ void ClangExpressionDeclMap::LookUpLldbObjCClass(NameSearchContext &context) { if (!self_clang_type) return; - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_objc_class: {1}", + LLDB_LOG(log, " FEVD Adding type for $__lldb_objc_class: {0}", ClangUtil::ToString(self_type->GetFullCompilerType())); TypeFromUser class_user_type(self_clang_type); diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp index f3a008ff1e8932..96a259b811b5e7 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp @@ -605,7 +605,7 @@ uint32_t AppleObjCDeclVendor::FindDecls(ConstString name, bool append, if (log) { clang::QualType new_iface_type = ast_ctx.getObjCInterfaceType(iface_decl); - LLDB_LOG(log, "AOCTV::FT Created {1} (isa 0x{2:x})", + LLDB_LOG(log, 
"AOCTV::FT Created {0} (isa 0x{1:x})", new_iface_type.getAsString(), (uint64_t)isa); } diff --git a/lldb/source/Target/ScriptedThreadPlan.cpp b/lldb/source/Target/ScriptedThreadPlan.cpp index a8432f12258ee4..c4bdc8d080e350 100644 --- a/lldb/source/Target/ScriptedThreadPlan.cpp +++ b/lldb/source/Target/ScriptedThreadPlan.cpp @@ -184,8 +184,9 @@ void ScriptedThreadPlan::GetDescription(Stream *s, lldb::StreamSP stream = std::make_shared(); llvm::Error err = m_interface->GetStopDescription(stream); if (err) { - LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), std::move(err), - "Can't call ScriptedThreadPlan::GetStopDescription."); + LLDB_LOG_ERROR( + GetLog(LLDBLog::Thread), std::move(err), + "Can't call ScriptedThreadPlan::GetStopDescription: {0}"); s->Printf("Scripted thread plan implemented by class %s.", m_class_name.c_str()); } else From 0fc3e4093ca5d226df37206626bfac3e4853b0db Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Thu, 10 Oct 2024 10:00:42 -0700 Subject: [PATCH 045/177] [alpha.webkit.UncountedCallArgsChecker] Skip std::forward in tryToFindPtrOrigin. (#111222) Ignore std::forward when it appears while looking for the pointer origin. 
--- .../StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp | 5 +++++ .../Checkers/WebKit/uncounted-obj-arg.cpp | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index 394cb26f03cf99..b7b2f8a16f07b3 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -101,6 +101,11 @@ bool tryToFindPtrOrigin( if (isSingleton(callee)) return callback(E, true); + if (callee->isInStdNamespace() && safeGetName(callee) == "forward") { + E = call->getArg(0); + continue; + } + if (isPtrConversion(callee)) { E = call->getArg(0); continue; diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index 97efb354f0371d..b6ab369f69a87d 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -588,6 +588,8 @@ class UnrelatedClass { getFieldTrivial().nonTrivial23(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } + + void setField(RefCounted*); }; class UnrelatedClass2 { @@ -598,11 +600,24 @@ class UnrelatedClass2 { RefCounted &getFieldTrivialRecursively() { return getFieldTrivial().getFieldTrivial(); } RefCounted *getFieldTrivialTernary() { return Field ? Field->getFieldTernary() : nullptr; } + template + void callSetField(T&& item, AdditionalArgs&&... args) + { + item.setField(std::forward(args)...); + } + + template + void callSetField2(T&& item, AdditionalArgs&&... 
args) + { + item.setField(std::move(args)...); + } + void test() { getFieldTrivialRecursively().trivial1(); // no-warning getFieldTrivialTernary()->trivial2(); // no-warning getFieldTrivialRecursively().someFunction(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + callSetField(getFieldTrivial(), refCountedObj()); // no-warning } }; From 820bab8fb581f2fcd1a96b495f4762b02195d86a Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Thu, 10 Oct 2024 10:01:35 -0700 Subject: [PATCH 046/177] [alpha.webkit.UncountedCallArgsChecker] Add the support for trivial CXXInheritedCtorInitExpr. (#111198) --- .../Checkers/WebKit/PtrTypesSemantics.cpp | 4 ++++ .../Checkers/WebKit/uncounted-obj-arg.cpp | 21 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 4d145be808f6d8..317642c5b9ca20 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -508,6 +508,10 @@ class TrivialFunctionAnalysisVisitor return IsFunctionTrivial(CE->getConstructor()); } + bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E) { + return IsFunctionTrivial(E->getConstructor()); + } + bool VisitCXXNewExpr(const CXXNewExpr *NE) { return VisitChildren(NE); } bool VisitImplicitCastExpr(const ImplicitCastExpr *ICE) { diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index b6ab369f69a87d..1a42de90105a55 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -224,6 +224,20 @@ class ObjectWithMutatingDestructor { Number n; }; +class BaseType { +public: + BaseType() : n(0) { } + BaseType(int v) : n(v) { } + BaseType(const char*); +private: + Number n; +}; + +class 
SomeType : public BaseType { +public: + using BaseType::BaseType; +}; + class RefCounted { public: void ref() const; @@ -336,6 +350,8 @@ class RefCounted { unsigned trivial60() { return ObjectWithNonTrivialDestructor { 5 }.value(); } unsigned trivial61() { return DerivedNumber('7').value(); } void trivial62() { WTFReportBacktrace(); } + SomeType trivial63() { return SomeType(0); } + SomeType trivial64() { return SomeType(); } static RefCounted& singleton() { static RefCounted s_RefCounted; @@ -425,6 +441,7 @@ class RefCounted { unsigned nonTrivial21() { return Number("123").value(); } unsigned nonTrivial22() { return ComplexNumber(123, "456").real().value(); } unsigned nonTrivial23() { return DerivedNumber("123").value(); } + SomeType nonTrivial24() { return SomeType("123"); } static unsigned s_v; unsigned v { 0 }; @@ -515,6 +532,8 @@ class UnrelatedClass { getFieldTrivial().trivial60(); // no-warning getFieldTrivial().trivial61(); // no-warning getFieldTrivial().trivial62(); // no-warning + getFieldTrivial().trivial63(); // no-warning + getFieldTrivial().trivial64(); // no-warning RefCounted::singleton().trivial18(); // no-warning RefCounted::singleton().someFunction(); // no-warning @@ -587,6 +606,8 @@ class UnrelatedClass { // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} getFieldTrivial().nonTrivial23(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + getFieldTrivial().nonTrivial24(); + // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } void setField(RefCounted*); From 39a91413c3f79181b4a45447bdb08d04d3efc975 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Thu, 10 Oct 2024 10:02:07 -0700 Subject: [PATCH 047/177] isUncountedPtr should take QualType as an argument. (#110213) Make isUncountedPtr take QualType as an argument instead of Type*. This simplifies some code. 
--- .../Checkers/WebKit/PtrTypesSemantics.cpp | 16 ++++------------ .../Checkers/WebKit/PtrTypesSemantics.h | 2 +- .../Checkers/WebKit/UncountedCallArgsChecker.cpp | 6 +----- .../WebKit/UncountedLambdaCapturesChecker.cpp | 10 +++++----- .../WebKit/UncountedLocalVarsChecker.cpp | 6 +----- 5 files changed, 12 insertions(+), 28 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 317642c5b9ca20..2298fe39850de5 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -177,14 +177,10 @@ std::optional isUncounted(const CXXRecordDecl* Class) return (*IsRefCountable); } -std::optional isUncountedPtr(const Type* T) -{ - assert(T); - +std::optional isUncountedPtr(const QualType T) { if (T->isPointerType() || T->isReferenceType()) { - if (auto *CXXRD = T->getPointeeCXXRecordDecl()) { + if (auto *CXXRD = T->getPointeeCXXRecordDecl()) return isUncounted(CXXRD); - } } return false; } @@ -208,12 +204,8 @@ std::optional isGetterOfRefCounted(const CXXMethodDecl* M) // Ref -> T conversion // FIXME: Currently allowing any Ref -> whatever cast. 
if (isRefType(className)) { - if (auto *maybeRefToRawOperator = dyn_cast(M)) { - if (auto *targetConversionType = - maybeRefToRawOperator->getConversionType().getTypePtrOrNull()) { - return isUncountedPtr(targetConversionType); - } - } + if (auto *maybeRefToRawOperator = dyn_cast(M)) + return isUncountedPtr(maybeRefToRawOperator->getConversionType()); } } return false; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index 3528c52a7d659d..8e6aadf63b6d67 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -61,7 +61,7 @@ std::optional isUncounted(const clang::CXXRecordDecl* Class); /// \returns true if \p T is either a raw pointer or reference to an uncounted /// class, false if not, std::nullopt if inconclusive. -std::optional isUncountedPtr(const clang::Type* T); +std::optional isUncountedPtr(const clang::QualType T); /// \returns true if Name is a RefPtr, Ref, or its variant, false if not. bool isRefType(const std::string &Name); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp index 0ed93ab26bf5ca..cea3503fa2c314 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp @@ -115,12 +115,8 @@ class UncountedCallArgsChecker // continue; QualType ArgType = (*P)->getType().getCanonicalType(); - const auto *TypePtr = ArgType.getTypePtrOrNull(); - if (!TypePtr) - continue; // FIXME? Should we bail? 
- // FIXME: more complex types (arrays, references to raw pointers, etc) - std::optional IsUncounted = isUncountedPtr(TypePtr); + std::optional IsUncounted = isUncountedPtr(ArgType); if (!IsUncounted || !(*IsUncounted)) continue; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp index a226a01ec0a579..998bd4ccee07db 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp @@ -59,11 +59,11 @@ class UncountedLambdaCapturesChecker for (const LambdaCapture &C : L->captures()) { if (C.capturesVariable()) { ValueDecl *CapturedVar = C.getCapturedVar(); - if (auto *CapturedVarType = CapturedVar->getType().getTypePtrOrNull()) { - std::optional IsUncountedPtr = isUncountedPtr(CapturedVarType); - if (IsUncountedPtr && *IsUncountedPtr) { - reportBug(C, CapturedVar, CapturedVarType); - } + QualType CapturedVarQualType = CapturedVar->getType(); + if (auto *CapturedVarType = CapturedVarQualType.getTypePtrOrNull()) { + auto IsUncountedPtr = isUncountedPtr(CapturedVarQualType); + if (IsUncountedPtr && *IsUncountedPtr) + reportBug(C, CapturedVar, CapturedVarType); } } } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp index 9d0a3bb5da7325..81d21100de878d 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp @@ -199,11 +199,7 @@ class UncountedLocalVarsChecker if (shouldSkipVarDecl(V)) return; - const auto *ArgType = V->getType().getTypePtr(); - if (!ArgType) - return; - - std::optional IsUncountedPtr = isUncountedPtr(ArgType); + std::optional IsUncountedPtr = isUncountedPtr(V->getType()); if (IsUncountedPtr && *IsUncountedPtr) { if 
(tryToFindPtrOrigin( Value, /*StopAtFirstRefCountedObj=*/false, From 36c34ec967c28c77406fe85ef3237a167a243763 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Thu, 10 Oct 2024 18:07:06 +0100 Subject: [PATCH 048/177] [mlir][debug] Support DICommonBlock. (#111706) A COMMON block is a named area of memory that holds a collection of variables. Fortran subprograms may map the COMMON block memory area to a list of variables. A common block is represented in LLVM debug by DICommonBlock. This PR adds support for this in MLIR. The changes are mostly mechanical apart from small change to access the DICompileUnit when the scope of the variable is DICommonBlock. --------- Co-authored-by: Tobias Gysi --- .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 16 ++++++++++ mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp | 20 +++++++------ mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 11 +++---- mlir/lib/Target/LLVMIR/DebugImporter.cpp | 9 ++++++ mlir/lib/Target/LLVMIR/DebugImporter.h | 1 + mlir/lib/Target/LLVMIR/DebugTranslation.cpp | 20 +++++++++---- mlir/lib/Target/LLVMIR/DebugTranslation.h | 1 + mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 28 ++++++++++------- mlir/test/Dialect/LLVMIR/debuginfo.mlir | 8 +++++ mlir/test/Target/LLVMIR/Import/debug-info.ll | 24 +++++++++++++++ mlir/test/Target/LLVMIR/llvmir-debug.mlir | 30 +++++++++++++++++++ 11 files changed, 138 insertions(+), 30 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index c298c8277eb0c3..0d904f13037c61 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -701,6 +701,22 @@ def LLVM_DISubrangeAttr : LLVM_Attr<"DISubrange", "di_subrange", /*traits=*/[], let assemblyFormat = "`<` struct(params) `>`"; } +//===----------------------------------------------------------------------===// +// DICommonBlockAttr +//===----------------------------------------------------------------------===// + +def 
LLVM_DICommonBlockAttr : LLVM_Attr<"DICommonBlock", "di_common_block", + /*traits=*/[], "DIScopeAttr"> { + let parameters = (ins + "DIScopeAttr":$scope, + OptionalParameter<"DIGlobalVariableAttr">:$decl, + "StringAttr":$name, + OptionalParameter<"DIFileAttr">:$file, + OptionalParameter<"unsigned">:$line + ); + let assemblyFormat = "`<` struct(params) `>`"; +} + //===----------------------------------------------------------------------===// // DISubroutineTypeAttr //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 99871dac81d326..9640bbdf28df45 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -56,13 +56,14 @@ void LLVMDialect::registerAttributes() { //===----------------------------------------------------------------------===// bool DINodeAttr::classof(Attribute attr) { - return llvm::isa(attr); + return llvm::isa( + attr); } //===----------------------------------------------------------------------===// @@ -70,8 +71,9 @@ bool DINodeAttr::classof(Attribute attr) { //===----------------------------------------------------------------------===// bool DIScopeAttr::classof(Attribute attr) { - return llvm::isa(attr); + return llvm::isa( + attr); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 2c7af8712d420c..006d412936a337 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -3369,11 +3369,12 @@ struct LLVMOpAsmDialectInterface : public OpAsmDialectInterface { AliasResult getAlias(Attribute attr, raw_ostream &os) const override { return TypeSwitch(attr) .CasegetStride())); } +DICommonBlockAttr DebugImporter::translateImpl(llvm::DICommonBlock *node) { + return DICommonBlockAttr::get(context, 
translate(node->getScope()), + translate(node->getDecl()), + getStringAttrOrNull(node->getRawName()), + translate(node->getFile()), node->getLineNo()); +} + DISubroutineTypeAttr DebugImporter::translateImpl(llvm::DISubroutineType *node) { SmallVector types; @@ -339,6 +346,8 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) { auto translateNode = [this](llvm::DINode *node) -> DINodeAttr { if (auto *casted = dyn_cast(node)) return translateImpl(casted); + if (auto *casted = dyn_cast(node)) + return translateImpl(casted); if (auto *casted = dyn_cast(node)) return translateImpl(casted); if (auto *casted = dyn_cast(node)) diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h index cb796676759c39..a452e01a9f6041 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.h +++ b/mlir/lib/Target/LLVMIR/DebugImporter.h @@ -79,6 +79,7 @@ class DebugImporter { DIScopeAttr translateImpl(llvm::DIScope *node); DISubprogramAttr translateImpl(llvm::DISubprogram *node); DISubrangeAttr translateImpl(llvm::DISubrange *node); + DICommonBlockAttr translateImpl(llvm::DICommonBlock *node); DISubroutineTypeAttr translateImpl(llvm::DISubroutineType *node); DITypeAttr translateImpl(llvm::DIType *node); diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp index 92ff079a10c8aa..2491db299af312 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp @@ -397,6 +397,13 @@ llvm::DISubrange *DebugTranslation::translateImpl(DISubrangeAttr attr) { getMetadataOrNull(attr.getStride())); } +llvm::DICommonBlock *DebugTranslation::translateImpl(DICommonBlockAttr attr) { + return llvm::DICommonBlock::get(llvmCtx, translate(attr.getScope()), + translate(attr.getDecl()), + getMDStringOrNull(attr.getName()), + translate(attr.getFile()), attr.getLine()); +} + llvm::DISubroutineType * DebugTranslation::translateImpl(DISubroutineTypeAttr attr) { // Concatenate the result and 
argument types into a single array. @@ -428,12 +435,13 @@ llvm::DINode *DebugTranslation::translate(DINodeAttr attr) { if (!node) node = TypeSwitch(attr) - .Case( + .Case( [&](auto attr) { return translateImpl(attr); }); if (node && !node->isTemporary()) diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h index 422aa34e28f3c9..ff4eaa46c564e2 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.h +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h @@ -88,6 +88,7 @@ class DebugTranslation { llvm::DIScope *translateImpl(DIScopeAttr attr); llvm::DISubprogram *translateImpl(DISubprogramAttr attr); llvm::DISubrange *translateImpl(DISubrangeAttr attr); + llvm::DICommonBlock *translateImpl(DICommonBlockAttr attr); llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr); llvm::DIType *translateImpl(DITypeAttr attr); diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index cc0de5bc838c99..a5de90160c4145 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1064,19 +1064,27 @@ LogicalResult ModuleTranslation::convertGlobals() { // There is no `globals` field in DICompileUnitAttr which can be directly // assigned to DICompileUnit. We have to build the list by looking at the // dbgExpr of all the GlobalOps. The scope of the variable is used to get - // the DICompileUnit in which to add it. But for the languages that - // support modules, the scope hierarchy can be - // variable -> module -> compile unit - // If a variable scope points to the module then we use the scope of the - // module to get the compile unit. - // Global variables are also used for things like static local variables - // in C and local variables with the save attribute in Fortran. The scope - // of the variable is the parent function. We use the compile unit of the - // parent function in this case. + // the DICompileUnit in which to add it. 
+ // But there are cases where the scope of a global does not + // directly point to the DICompileUnit and we have to do a bit more work + // to get to it. Some of those cases are: + // + // 1. For the languages that support modules, the scope hierarchy can be + // variable -> DIModule -> DICompileUnit + // + // 2. For the Fortran common block variable, the scope hierarchy can be + // variable -> DICommonBlock -> DISubprogram -> DICompileUnit + // + // 3. For entities like static local variables in C or variable with + // SAVE attribute in Fortran, the scope hierarchy can be + // variable -> DISubprogram -> DICompileUnit llvm::DIScope *scope = diGlobalVar->getScope(); if (auto *mod = dyn_cast_if_present(scope)) scope = mod->getScope(); - else if (auto *sp = dyn_cast_if_present(scope)) + else if (auto *cb = dyn_cast_if_present(scope)) { + if (auto *sp = dyn_cast_if_present(cb->getScope())) + scope = sp->getUnit(); + } else if (auto *sp = dyn_cast_if_present(scope)) scope = sp->getUnit(); // Get the compile unit (scope) of the the global variable. 
diff --git a/mlir/test/Dialect/LLVMIR/debuginfo.mlir b/mlir/test/Dialect/LLVMIR/debuginfo.mlir index af95ec97833a13..8475ec6c3510db 100644 --- a/mlir/test/Dialect/LLVMIR/debuginfo.mlir +++ b/mlir/test/Dialect/LLVMIR/debuginfo.mlir @@ -156,6 +156,14 @@ // CHECK-DAG: #[[LABEL2:.*]] = #llvm.di_label #label2 = #llvm.di_label +// CHECK-DAG: #llvm.di_common_block +#di_common_block = #llvm.di_common_block +#global_var = #llvm.di_global_variable +#var_expression = #llvm.di_global_variable_expression> +llvm.mlir.global common @block_() {dbg_expr = #var_expression} : i64 + // CHECK: llvm.func @addr(%[[ARG:.*]]: i64) llvm.func @addr(%arg: i64) { // CHECK: %[[ALLOC:.*]] = llvm.alloca diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll index 6267990b0bf803..09909d7d63b2ab 100644 --- a/mlir/test/Target/LLVMIR/Import/debug-info.ll +++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll @@ -843,3 +843,27 @@ define void @fn_with_annotations() !dbg !12 { ; CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "fn_with_annotations"{{.*}}annotations = #llvm.di_annotation> + +; // ----- + +@block = common global [4 x i8] zeroinitializer, !dbg !0 + +define void @test() !dbg !3 { + ret void +} + +!llvm.module.flags = !{!10} +!llvm.dbg.cu = !{!7} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "alpha", scope: !2, file: !4, type: !9) +!2 = !DICommonBlock(scope: !3, declaration: null, name: "block", file: !4, line: 3) +!3 = distinct !DISubprogram(name: "test", scope: !4, file: !4, spFlags: DISPFlagDefinition, unit: !7) +!4 = !DIFile(filename: "test.f90", directory: "") +!7 = distinct !DICompileUnit(language: DW_LANG_Fortran95, file: !4) +!9 = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) +!10 = !{i32 2, !"Debug Info Version", i32 3} + +; CHECK: #[[FILE:.+]] = #llvm.di_file<"test.f90" in ""> +; CHECK: #[[SP:.+]] = #llvm.di_subprogram<{{.*}}name = "test"{{.*}}> +; CHECK: 
#llvm.di_common_block diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index b09a60b8dcac90..826fda60c5efef 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -660,3 +660,33 @@ llvm.func @string_ty(%arg0: !llvm.ptr) { // CHECK-DAG: !DIStringType(name: "character(*)", stringLength: ![[VAR:[0-9]+]], stringLengthExpression: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 8), stringLocationExpression: !DIExpression(DW_OP_push_object_address, DW_OP_deref), size: 32, align: 8) // CHECK-DAG: ![[VAR]] = !DILocalVariable(name: "string_size"{{.*}} flags: DIFlagArtificial) + +// ----- + +// Test translation of DICommonBlockAttr. +#bt = #llvm.di_basic_type +#file = #llvm.di_file<"test.f90" in ""> +#cu = #llvm.di_compile_unit, sourceLanguage = DW_LANG_C, + file = #file, isOptimized = false, emissionKind = Full> +#sp = #llvm.di_subprogram +#di_common_block = #llvm.di_common_block +#global_var = #llvm.di_global_variable +#var_expression = #llvm.di_global_variable_expression> + +llvm.mlir.global common @block_(dense<0> : tensor<8xi8>) + {dbg_expr = #var_expression} : !llvm.array<8 x i8> + +llvm.func @test() { + llvm.return +} loc(#loc2) + +#loc1 = loc("test.f90":1:0) +#loc2 = loc(fused<#sp>[#loc1]) + +// CHECK: !DICommonBlock(scope: ![[SCOPE:[0-9]+]], declaration: null, name: "block", file: ![[FILE:[0-9]+]], line: 3) +// CHECK: ![[SCOPE]] = {{.*}}!DISubprogram(name: "test"{{.*}}) +// CHECK: ![[FILE]] = !DIFile(filename: "test.f90"{{.*}}) From 43ba97e7079525a9686e15a6963508dfbd493f81 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 10 Oct 2024 13:13:17 -0400 Subject: [PATCH 049/177] [runtimes][NFC] Reindent CMake files (#111821) This is a purely mechanical commit for fixing the indentation of the runtimes' CMakeLists files after #80007. 
That PR didn't update the indentation in order to make the diff easier to review and for merge conflicts to be easier to resolve (for downstream changes). This doesn't change any code, it only reindents it. --- libcxx/src/CMakeLists.txt | 194 +++++++++++++++++------------------ libcxxabi/src/CMakeLists.txt | 140 ++++++++++++------------- libunwind/src/CMakeLists.txt | 40 ++++---- 3 files changed, 187 insertions(+), 187 deletions(-) diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 9f31822065be9d..4af04f202db1f7 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -173,76 +173,76 @@ split_list(LIBCXX_COMPILE_FLAGS) split_list(LIBCXX_LINK_FLAGS) # Build the shared library. - add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) - target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared - PRIVATE ${LIBCXX_LIBRARIES}) - set_target_properties(cxx_shared - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" - LINK_FLAGS "${LIBCXX_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXX_SHARED_OUTPUT_NAME}" - VERSION "${LIBCXX_LIBRARY_VERSION}" - SOVERSION "${LIBCXX_ABI_VERSION}" - DEFINE_SYMBOL "" +add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) +target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared + PRIVATE ${LIBCXX_LIBRARIES}) +set_target_properties(cxx_shared + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" + LINK_FLAGS "${LIBCXX_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXX_SHARED_OUTPUT_NAME}" + VERSION "${LIBCXX_LIBRARY_VERSION}" + SOVERSION "${LIBCXX_ABI_VERSION}" + DEFINE_SYMBOL "" +) +cxx_add_common_build_flags(cxx_shared) + +if(ZOS) + add_custom_command(TARGET cxx_shared POST_BUILD + COMMAND + ${LIBCXX_SOURCE_DIR}/utils/zos_rename_dll_side_deck.sh + $ $ 
"${LIBCXX_DLL_NAME}" + COMMENT "Rename dll name inside the side deck file" + WORKING_DIRECTORY $ ) - cxx_add_common_build_flags(cxx_shared) - - if(ZOS) - add_custom_command(TARGET cxx_shared POST_BUILD - COMMAND - ${LIBCXX_SOURCE_DIR}/utils/zos_rename_dll_side_deck.sh - $ $ "${LIBCXX_DLL_NAME}" - COMMENT "Rename dll name inside the side deck file" - WORKING_DIRECTORY $ - ) - endif() +endif() - # Link against libc++abi - if (LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) - target_link_libraries(cxx_shared PRIVATE libcxx-abi-shared-objects) - else() - target_link_libraries(cxx_shared PUBLIC libcxx-abi-shared) - endif() +# Link against libc++abi +if (LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) + target_link_libraries(cxx_shared PRIVATE libcxx-abi-shared-objects) +else() + target_link_libraries(cxx_shared PUBLIC libcxx-abi-shared) +endif() - # Maybe force some symbols to be weak, not weak or not exported. - # TODO: This shouldn't depend on the platform, and ideally it should be done in the sources. - if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$" - AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) - target_link_libraries(cxx_shared PRIVATE - "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" - "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") - endif() +# Maybe force some symbols to be weak, not weak or not exported. +# TODO: This shouldn't depend on the platform, and ideally it should be done in the sources. +if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$" + AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) + target_link_libraries(cxx_shared PRIVATE + "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" + "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") +endif() - # Generate a linker script in place of a libc++.so symlink. 
- if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT) - set(link_libraries) - - set(imported_libname "$") - set(output_name "$") - string(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}$,${imported_libname},${output_name}>") - - # TODO: Move to the same approach as above for the unwind library - if (LIBCXXABI_USE_LLVM_UNWINDER) - if (LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_SHARED_LIBRARY) - # libunwind is already included in libc++abi - elseif (TARGET unwind_shared OR HAVE_LIBUNWIND) - string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}$") - else() - string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}unwind") - endif() - endif() +# Generate a linker script in place of a libc++.so symlink. +if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT) + set(link_libraries) - set(linker_script "INPUT($ ${link_libraries})") - add_custom_command(TARGET cxx_shared POST_BUILD - COMMAND "${CMAKE_COMMAND}" -E remove "$" - COMMAND "${CMAKE_COMMAND}" -E echo "${linker_script}" > "$" - COMMENT "Generating linker script: '${linker_script}' as file $" - VERBATIM - ) + set(imported_libname "$") + set(output_name "$") + string(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}$,${imported_libname},${output_name}>") + + # TODO: Move to the same approach as above for the unwind library + if (LIBCXXABI_USE_LLVM_UNWINDER) + if (LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_SHARED_LIBRARY) + # libunwind is already included in libc++abi + elseif (TARGET unwind_shared OR HAVE_LIBUNWIND) + string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}$") + else() + string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}unwind") + endif() endif() + set(linker_script "INPUT($ ${link_libraries})") + add_custom_command(TARGET cxx_shared POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E remove "$" + COMMAND "${CMAKE_COMMAND}" -E echo "${linker_script}" > "$" + COMMENT "Generating linker script: '${linker_script}' as file $" + VERBATIM + ) +endif() + if (LIBCXX_ENABLE_SHARED) list(APPEND LIBCXX_BUILD_TARGETS "cxx_shared") endif() @@ 
-263,43 +263,43 @@ endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") # Build the static library. - add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) - target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static - PRIVATE ${LIBCXX_LIBRARIES} - PRIVATE libcxx-abi-static) - set_target_properties(cxx_static - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" - LINK_FLAGS "${LIBCXX_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXX_STATIC_OUTPUT_NAME}" - ) - cxx_add_common_build_flags(cxx_static) - - if (LIBCXX_HERMETIC_STATIC_LIBRARY) - # If the hermetic library doesn't define the operator new/delete functions - # then its code shouldn't declare them with hidden visibility. They might - # actually be provided by a shared library at link time. - if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) - append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden) - if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) - append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) - endif() +add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) +target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static + PRIVATE ${LIBCXX_LIBRARIES} + PRIVATE libcxx-abi-static) +set_target_properties(cxx_static + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" + LINK_FLAGS "${LIBCXX_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXX_STATIC_OUTPUT_NAME}" +) +cxx_add_common_build_flags(cxx_static) + +if (LIBCXX_HERMETIC_STATIC_LIBRARY) + # If the hermetic library doesn't define the operator new/delete functions + # then its code shouldn't declare them with hidden visibility. They might + # actually be provided by a shared library at link time. 
+ if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) endif() - target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS}) - # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site - # too. Define it in the same way here, to avoid redefinition conflicts. - target_compile_definitions(cxx_static PRIVATE _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() + target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS}) + # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site + # too. Define it in the same way here, to avoid redefinition conflicts. + target_compile_definitions(cxx_static PRIVATE _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) +endif() - if (LIBCXX_ENABLE_STATIC) - list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") - endif() - # Attempt to merge the libc++.a archive and the ABI library archive into one. - if (LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY) - target_link_libraries(cxx_static PRIVATE libcxx-abi-static-objects) - endif() +if (LIBCXX_ENABLE_STATIC) + list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") +endif() +# Attempt to merge the libc++.a archive and the ABI library archive into one. +if (LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY) + target_link_libraries(cxx_static PRIVATE libcxx-abi-static-objects) +endif() # Add a meta-target for both libraries. 
add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS}) diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index e496cf3339164e..84fe2784bec5ca 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -184,78 +184,78 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO endif() target_compile_options(cxxabi_shared_objects PRIVATE "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") - add_library(cxxabi_shared SHARED) - set_target_properties(cxxabi_shared - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXXABI_SHARED_OUTPUT_NAME}" - SOVERSION "1" - VERSION "${LIBCXXABI_LIBRARY_VERSION}" - ) +add_library(cxxabi_shared SHARED) +set_target_properties(cxxabi_shared + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXXABI_SHARED_OUTPUT_NAME}" + SOVERSION "1" + VERSION "${LIBCXXABI_LIBRARY_VERSION}" +) - if (ZOS) - add_custom_command(TARGET cxxabi_shared POST_BUILD - COMMAND - ${LIBCXXABI_LIBCXX_PATH}/utils/zos_rename_dll_side_deck.sh - $ $ "${LIBCXXABI_DLL_NAME}" - COMMENT "Rename dll name inside the side deck file" - WORKING_DIRECTORY $ - ) - endif () +if (ZOS) + add_custom_command(TARGET cxxabi_shared POST_BUILD + COMMAND + ${LIBCXXABI_LIBCXX_PATH}/utils/zos_rename_dll_side_deck.sh + $ $ "${LIBCXXABI_DLL_NAME}" + COMMENT "Rename dll name inside the side deck file" + WORKING_DIRECTORY $ + ) +endif () - target_link_libraries(cxxabi_shared - PUBLIC cxxabi_shared_objects - PRIVATE ${LIBCXXABI_LIBRARIES}) +target_link_libraries(cxxabi_shared + PUBLIC cxxabi_shared_objects + PRIVATE ${LIBCXXABI_LIBRARIES}) if (LIBCXXABI_ENABLE_SHARED) - list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_shared") +list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_shared") endif() if (LIBCXXABI_INSTALL_SHARED_LIBRARY) - list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") +list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") 
endif() - # TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control - # what libc++ re-exports. - add_library(cxxabi-reexports INTERFACE) - function(export_symbols file) - # -exported_symbols_list is only available on Apple platforms - if (APPLE) - target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}") - endif() - endfunction() +# TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control +# what libc++ re-exports. +add_library(cxxabi-reexports INTERFACE) +function(export_symbols file) + # -exported_symbols_list is only available on Apple platforms + if (APPLE) + target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}") + endif() +endfunction() - function(reexport_symbols file) - export_symbols("${file}") - # -reexported_symbols_list is only available on Apple platforms - if (APPLE) - target_link_libraries(cxxabi-reexports INTERFACE "-Wl,-reexported_symbols_list,${file}") - endif() - endfunction() +function(reexport_symbols file) + export_symbols("${file}") + # -reexported_symbols_list is only available on Apple platforms + if (APPLE) + target_link_libraries(cxxabi-reexports INTERFACE "-Wl,-reexported_symbols_list,${file}") + endif() +endfunction() - export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/symbols-not-reexported.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxxabiv1.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/fundamental-types.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-misc.exp") +export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/symbols-not-reexported.exp") +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxxabiv1.exp") +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/fundamental-types.exp") +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") 
+reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-misc.exp") - if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") - endif() +if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") +endif() - # Note that std:: exception types are always defined by the library regardless of - # whether the exception runtime machinery is provided. - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-exceptions.exp") +# Note that std:: exception types are always defined by the library regardless of +# whether the exception runtime machinery is provided. +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-exceptions.exp") - if (LIBCXXABI_ENABLE_EXCEPTIONS) - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-exceptions.exp") +if (LIBCXXABI_ENABLE_EXCEPTIONS) + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-exceptions.exp") - if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") - else() - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") - endif() + if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") + else() + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") endif() +endif() # Build the static library. 
add_library(cxxabi_static_objects OBJECT EXCLUDE_FROM_ALL ${LIBCXXABI_SOURCES} ${LIBCXXABI_HEADERS}) @@ -295,19 +295,19 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() - add_library(cxxabi_static STATIC) - if (LIBCXXABI_USE_LLVM_UNWINDER AND NOT LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY) - target_link_libraries(cxxabi_static PUBLIC unwind_static) - endif() - set_target_properties(cxxabi_static - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXXABI_STATIC_OUTPUT_NAME}" - ) - target_link_libraries(cxxabi_static - PUBLIC cxxabi_static_objects - PRIVATE ${LIBCXXABI_STATIC_LIBRARIES} ${LIBCXXABI_LIBRARIES}) +add_library(cxxabi_static STATIC) +if (LIBCXXABI_USE_LLVM_UNWINDER AND NOT LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY) + target_link_libraries(cxxabi_static PUBLIC unwind_static) +endif() +set_target_properties(cxxabi_static + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXXABI_STATIC_OUTPUT_NAME}" + ) +target_link_libraries(cxxabi_static + PUBLIC cxxabi_static_objects + PRIVATE ${LIBCXXABI_STATIC_LIBRARIES} ${LIBCXXABI_LIBRARIES}) if (LIBCXXABI_ENABLE_STATIC) list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_static") diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 3065bfc8a07050..2e18b109656331 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -153,17 +153,17 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO set_target_properties(unwind_shared_objects PROPERTIES POSITION_INDEPENDENT_CODE ON) # must set manually because it's an object library endif() - add_library(unwind_shared SHARED) - target_link_libraries(unwind_shared PUBLIC unwind_shared_objects) - set_target_properties(unwind_shared - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" - LINKER_LANGUAGE C - 
OUTPUT_NAME "${LIBUNWIND_SHARED_OUTPUT_NAME}" - VERSION "${LIBUNWIND_LIBRARY_VERSION}" - SOVERSION "1" - ) +add_library(unwind_shared SHARED) +target_link_libraries(unwind_shared PUBLIC unwind_shared_objects) +set_target_properties(unwind_shared + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" + LINKER_LANGUAGE C + OUTPUT_NAME "${LIBUNWIND_SHARED_OUTPUT_NAME}" + VERSION "${LIBUNWIND_LIBRARY_VERSION}" + SOVERSION "1" +) if (LIBUNWIND_ENABLE_SHARED) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_shared") @@ -200,15 +200,15 @@ if(LIBUNWIND_HIDE_SYMBOLS) target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS) endif() - add_library(unwind_static STATIC) - target_link_libraries(unwind_static PUBLIC unwind_static_objects) - set_target_properties(unwind_static - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" - LINKER_LANGUAGE C - OUTPUT_NAME "${LIBUNWIND_STATIC_OUTPUT_NAME}" - ) +add_library(unwind_static STATIC) +target_link_libraries(unwind_static PUBLIC unwind_static_objects) +set_target_properties(unwind_static + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" + LINKER_LANGUAGE C + OUTPUT_NAME "${LIBUNWIND_STATIC_OUTPUT_NAME}" +) if (LIBUNWIND_ENABLE_STATIC) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_static") From 3f9998af4f79e95fe8be615df9d6b898008044b9 Mon Sep 17 00:00:00 2001 From: Justin Fargnoli Date: Thu, 10 Oct 2024 10:24:02 -0700 Subject: [PATCH 050/177] [NVPTX] Prefer prmt.b32 over bfi.b32 (#110766) In [[NVPTX] Improve lowering of v4i8](https://github.com/llvm/llvm-project/commit/cbafb6f2f5c99474164dcc725820cbbeb2e02e14) @Artem-B add the ability to lower ISD::BUILD_VECTOR with bfi PTX instructions. @Artem-B did this because: ([source](https://github.com/llvm/llvm-project/pull/67866#discussion_r1343066911)) > Under the hood byte extraction/insertion ends up as BFI/BFE instructions, so we may as well do that in PTX, too. 
https://godbolt.org/z/Tb3zWbj9b However, the example that @Artem-B linked was targeting sm_52. On modern architectures, ptxas uses prmt.b32. [Example](https://godbolt.org/z/Ye4W1n84o). Thus, remove uses of NVPTXISD::BFI in favor of NVPTXISD::PRMT. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 614 ++++++++++--------- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 18 +- 3 files changed, 335 insertions(+), 328 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 57bc5fe0ac361c..d95f8f214be557 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2332,20 +2332,23 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us // to optimize calculation of constant parts. if (VT == MVT::v4i8) { - SDValue C8 = DAG.getConstant(8, DL, MVT::i32); - SDValue E01 = DAG.getNode( - NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); - SDValue E012 = - DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), - E01, DAG.getConstant(16, DL, MVT::i32), C8); - SDValue E0123 = - DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), - E012, DAG.getConstant(24, DL, MVT::i32), C8); - return DAG.getNode(ISD::BITCAST, DL, VT, E0123); + SDValue PRMT__10 = DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getConstant(0x3340, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + SDValue PRMT32__ = DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + 
DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + DAG.getConstant(0x4033, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + SDValue PRMT3210 = DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {PRMT__10, PRMT32__, DAG.getConstant(0x5410, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); } return Op; } diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 96a4359d0ec43e..84dde539ce4c47 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_add( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, 
%rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b @@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_0( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: add.s16 %rs2, %rs1, 4; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: add.s16 %rs4, %rs3, 3; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: add.s16 %rs6, %rs5, 2; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; 
CHECK-NEXT: add.s16 %rs8, %rs7, 4; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> , %a @@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_1( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: add.s16 %rs2, %rs1, 4; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: add.s16 %rs4, %rs3, 3; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: add.s16 %rs6, %rs5, 2; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: add.s16 %rs8, %rs7, 4; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 
13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, @@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_sub( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: 
bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = sub <4 x i8> %a, %b @@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, 
%p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b @@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; 
CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b @@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 
+; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b @@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: 
bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b @@ -407,35 +407,35 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_eq( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: 
selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; +; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; -; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; +; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b @@ -447,35 +447,35 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_ne( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; +; CHECK-NEXT: 
bfe.u32 %r11, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; +; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; -; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; +; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b @@ -487,38 +487,38 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_mul( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; 
CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b @@ -548,12 +548,13 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -613,12 +614,13 @@ define <4 x 
i8> @test_xor_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: xor.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -678,12 +680,13 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: and.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -926,40 +929,40 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-LABEL: test_select_cc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-NEXT: .reg .b32 %r<28>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; ; CHECK-NEXT: ld.param.u32 %r3, 
[test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; -; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; +; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 16435; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; -; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; -; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8; +; CHECK-NEXT: selp.b32 
%r25, %r24, %r23, %p1; +; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 13120; +; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d @@ -1006,32 +1009,32 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-LABEL: test_select_cc_i8_i32( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; -; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; -; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; -; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; -; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; +; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; +; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p4, %r6, %r10; +; CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 16435; +; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; -; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; -; CHECK-NEXT: 
selp.b32 %r24, %r23, %r22, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; +; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { @@ -1044,13 +1047,13 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-LABEL: test_trunc_2xi32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; -; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; -; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; +; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 16435; +; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 13120; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> @@ -1060,19 +1063,19 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-LABEL: test_trunc_2xi64( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; -; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; -; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; -; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; -; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; +; CHECK-NEXT: prmt.b32 
%r3, %r2, %r1, 16435; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 13120; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> @@ -1184,15 +1187,16 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: mov.b32 %r1, 6; +; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1255,27 +1259,27 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, 
%r4; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> @@ -1286,27 +1290,27 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; -; 
CHECK-NEXT: cvt.u32.u16 %r12, %rs12; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> @@ -1326,33 +1330,33 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11; ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs10, %r14; -; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r15; +; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs10, %r13; +; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r14; ; CHECK-NEXT: rem.s16 %rs12, %rs11, 
%rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: @@ -1373,7 +1377,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b32 %r<17>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1392,25 +1396,25 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8; +; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 13120; ; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8; -; CHECK-NEXT: // implicit-def: %r15 -; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8; +; CHECK-NEXT: // implicit-def: %r14 +; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 16435; +; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 21520; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: cvt.u16.u32 %rs18, %r14; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: 
shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; @@ -1437,25 +1441,25 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.s32 %r15, -1, 0, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index f471d47077cf0d..8b7e5235443f05 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -33,35 +33,35 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; ; CHECK-NEXT: .reg 
.b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; -; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; ; CHECK-NEXT: and.b16 %rs4, %rs3, 255; ; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; ; CHECK-NEXT: and.b16 %rs8, %rs7, 255; ; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; ; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; ; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; -; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; -; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; +; CHECK-NEXT: selp.s32 %r10, -1, 0, %p1; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r9, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; entry: From c893e3d02d1f7b67880090485a030b79741bba1c Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Thu, 10 Oct 2024 10:24:59 -0700 Subject: [PATCH 051/177] [flang][runtime] Fix runtime crash after bad recoverable OPEN (#111454) When an OPEN statement with a unit number fails in a recoverable manner, the runtime needs to delete the ExternalFileUnit instance that was created in the unit map. 
And we do this too soon -- that instance still holds some of the I/O statement state that will be used by a later call into the runtime for EndIoStatement. Move the code that deletes the unit after a failed but recoverable OPEN into ExternalIoStatementBase::EndIoStatement, and don't do things afterwards that would need the I/O statement state that has been destroyed. Fixes https://github.com/llvm/llvm-project/issues/111404. --- flang/runtime/io-stmt.cpp | 14 +++++++++----- flang/runtime/io-stmt.h | 2 ++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index cd7a196335d31e..f24eb929ce748a 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -243,7 +243,15 @@ int ExternalIoStatementBase::EndIoStatement() { CompleteOperation(); auto result{IoStatementBase::EndIoStatement()}; #if !defined(RT_USE_PSEUDO_FILE_UNIT) + auto unitNumber{unit_.unitNumber()}; unit_.EndIoStatement(); // annihilates *this in unit_.u_ + if (destroy_) { + if (ExternalFileUnit * + toClose{ExternalFileUnit::LookUpForClose(unitNumber)}) { + toClose->Close(CloseStatus::Delete, *this); + toClose->DestroyClosed(); + } + } #else // Fetch the unit pointer before *this disappears. 
ExternalFileUnit *unitPtr{&unit_}; @@ -329,11 +337,7 @@ void OpenStatementState::CompleteOperation() { } if (!wasExtant_ && InError()) { // Release the new unit on failure - if (ExternalFileUnit * - toClose{unit().LookUpForClose(unit().unitNumber())}) { - toClose->Close(CloseStatus::Delete, *this); - toClose->DestroyClosed(); - } + set_destroy(); } IoStatementBase::CompleteOperation(); } diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h index 2e0ca46078ecdc..1f1419b249e5e5 100644 --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -455,6 +455,7 @@ class ExternalIoStatementBase : public IoStatementBase { RT_API_ATTRS MutableModes &mutableModes(); RT_API_ATTRS ConnectionState &GetConnectionState(); RT_API_ATTRS int asynchronousID() const { return asynchronousID_; } + RT_API_ATTRS void set_destroy(bool yes = true) { destroy_ = yes; } RT_API_ATTRS int EndIoStatement(); RT_API_ATTRS ExternalFileUnit *GetExternalFileUnit() const { return &unit_; } RT_API_ATTRS void SetAsynchronous(); @@ -463,6 +464,7 @@ class ExternalIoStatementBase : public IoStatementBase { private: ExternalFileUnit &unit_; int asynchronousID_{-1}; + bool destroy_{false}; }; template From 4f2b65fb80a4b27e5fb88db816ed0ce174c9b1b4 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Thu, 10 Oct 2024 10:25:19 -0700 Subject: [PATCH 052/177] [flang] Fix references to destroyed objects (#111582) ProgramTree instances are created as the value of a local variable in the Pre(const parser::ProgramUnit &) member function in name resolution. But references to these ProgramTree instances can persist in SubprogramNameDetails symbol table entries that might survive that function call's lifetime, and lead to trouble later when (e.g.) expression semantics needs to deal with a possible forward reference in a function reference in an expression being processed later in expression checking. So put those ProgramTree instances into a longer-lived linked list within the SemanticsContext. 
Might fix some weird crashes reported on big-endian targets (AIX & Solaris). --- flang/{lib => include/flang}/Semantics/program-tree.h | 4 ++-- flang/include/flang/Semantics/semantics.h | 7 ++++++- flang/lib/Semantics/program-tree.cpp | 8 ++++---- flang/lib/Semantics/resolve-names.cpp | 5 +++-- flang/lib/Semantics/semantics.cpp | 4 ++++ 5 files changed, 19 insertions(+), 9 deletions(-) rename flang/{lib => include/flang}/Semantics/program-tree.h (97%) diff --git a/flang/lib/Semantics/program-tree.h b/flang/include/flang/Semantics/program-tree.h similarity index 97% rename from flang/lib/Semantics/program-tree.h rename to flang/include/flang/Semantics/program-tree.h index ab00261a964a13..1c89e6c175b964 100644 --- a/flang/lib/Semantics/program-tree.h +++ b/flang/include/flang/Semantics/program-tree.h @@ -9,8 +9,8 @@ #ifndef FORTRAN_SEMANTICS_PROGRAM_TREE_H_ #define FORTRAN_SEMANTICS_PROGRAM_TREE_H_ +#include "symbol.h" #include "flang/Parser/parse-tree.h" -#include "flang/Semantics/symbol.h" #include #include @@ -35,7 +35,7 @@ class ProgramTree { std::list>; // Build the ProgramTree rooted at one of these program units. 
- static ProgramTree Build(const parser::ProgramUnit &, SemanticsContext &); + static ProgramTree &Build(const parser::ProgramUnit &, SemanticsContext &); static std::optional Build( const parser::MainProgram &, SemanticsContext &); static std::optional Build( diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index 606afbe288c38d..c981d86fbd94cb 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -9,6 +9,8 @@ #ifndef FORTRAN_SEMANTICS_SEMANTICS_H_ #define FORTRAN_SEMANTICS_SEMANTICS_H_ +#include "module-dependences.h" +#include "program-tree.h" #include "scope.h" #include "symbol.h" #include "flang/Common/Fortran-features.h" @@ -17,7 +19,6 @@ #include "flang/Evaluate/intrinsics.h" #include "flang/Evaluate/target.h" #include "flang/Parser/message.h" -#include "flang/Semantics/module-dependences.h" #include #include #include @@ -280,6 +281,9 @@ class SemanticsContext { void DumpSymbols(llvm::raw_ostream &); + // Top-level ProgramTrees are owned by the SemanticsContext for persistence. 
+ ProgramTree &SaveProgramTree(ProgramTree &&); + private: struct ScopeIndexComparator { bool operator()(parser::CharBlock, parser::CharBlock) const; @@ -331,6 +335,7 @@ class SemanticsContext { ModuleDependences moduleDependences_; std::map moduleFileOutputRenamings_; UnorderedSymbolSet isDefined_; + std::list programTrees_; }; class Semantics { diff --git a/flang/lib/Semantics/program-tree.cpp b/flang/lib/Semantics/program-tree.cpp index 250f5801b39e1a..86085e78803a23 100644 --- a/flang/lib/Semantics/program-tree.cpp +++ b/flang/lib/Semantics/program-tree.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "program-tree.h" +#include "flang/Semantics/program-tree.h" #include "flang/Common/idioms.h" #include "flang/Parser/char-block.h" #include "flang/Semantics/scope.h" @@ -130,13 +130,13 @@ static ProgramTree BuildModuleTree( return node; } -ProgramTree ProgramTree::Build( +ProgramTree &ProgramTree::Build( const parser::ProgramUnit &x, SemanticsContext &context) { return common::visit( - [&](const auto &y) { + [&](const auto &y) -> ProgramTree & { auto node{Build(y.value(), context)}; CHECK(node.has_value()); - return std::move(*node); + return context.SaveProgramTree(std::move(*node)); }, x.u); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e5e03f644f1b00..f1ce0b415ebe9c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -10,7 +10,6 @@ #include "definable.h" #include "mod-file.h" #include "pointer-assignment.h" -#include "program-tree.h" #include "resolve-directives.h" #include "resolve-names-utils.h" #include "rewrite-parse-tree.h" @@ -32,6 +31,7 @@ #include "flang/Parser/tools.h" #include "flang/Semantics/attr.h" #include "flang/Semantics/expression.h" +#include "flang/Semantics/program-tree.h" #include "flang/Semantics/scope.h" #include "flang/Semantics/semantics.h" #include "flang/Semantics/symbol.h" 
@@ -2490,6 +2490,7 @@ Symbol &ScopeHandler::CopySymbol(const SourceName &name, const Symbol &symbol) { } // Look for name only in scope, not in enclosing scopes. + Symbol *ScopeHandler::FindInScope( const Scope &scope, const parser::Name &name) { return Resolve(name, FindInScope(scope, name.source)); @@ -9120,7 +9121,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) { ResolveAccParts(context(), x, &topScope_); return false; } - auto root{ProgramTree::Build(x, context())}; + ProgramTree &root{ProgramTree::Build(x, context())}; SetScope(topScope_); ResolveSpecificationParts(root); FinishSpecificationParts(root); diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index 637088ff0171c0..58dc1f218b56f4 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -663,6 +663,10 @@ void SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { DoDumpSymbols(os, globalScope()); } +ProgramTree &SemanticsContext::SaveProgramTree(ProgramTree &&tree) { + return programTrees_.emplace_back(std::move(tree)); +} + void Semantics::DumpSymbols(llvm::raw_ostream &os) { context_.DumpSymbols(os); } void Semantics::DumpSymbolsSources(llvm::raw_ostream &os) const { From 2f22656db541e4e5c3401e7bbab25277c8438a23 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Thu, 10 Oct 2024 10:25:42 -0700 Subject: [PATCH 053/177] [flang] Minor cleanup (move function into /tools.cpp) (#111587) The semantics utility GetAllNames has declarations in two header files and a definition that really should be in the common utilities source file. Remove the redudant declaration from resolve-names-utils.h and move code from resolve-names-utils.cpp into Semantics/tools.cpp. 
--- flang/lib/Semantics/resolve-names-utils.cpp | 33 --------------------- flang/lib/Semantics/resolve-names-utils.h | 5 ---- flang/lib/Semantics/tools.cpp | 31 +++++++++++++++++++ 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp index b8ce8d14a33faa..a838d49c06104d 100644 --- a/flang/lib/Semantics/resolve-names-utils.cpp +++ b/flang/lib/Semantics/resolve-names-utils.cpp @@ -31,8 +31,6 @@ using common::NumericOperator; using common::RelationalOperator; using IntrinsicOperator = parser::DefinedOperator::IntrinsicOperator; -static constexpr const char *operatorPrefix{"operator("}; - static GenericKind MapIntrinsicOperator(IntrinsicOperator); Symbol *Resolve(const parser::Name &name, Symbol *symbol) { @@ -69,37 +67,6 @@ bool IsIntrinsicOperator( return false; } -template -std::forward_list GetOperatorNames( - const SemanticsContext &context, E opr) { - std::forward_list result; - for (const char *name : context.languageFeatures().GetNames(opr)) { - result.emplace_front(std::string{operatorPrefix} + name + ')'); - } - return result; -} - -std::forward_list GetAllNames( - const SemanticsContext &context, const SourceName &name) { - std::string str{name.ToString()}; - if (!name.empty() && name.end()[-1] == ')' && - name.ToString().rfind(std::string{operatorPrefix}, 0) == 0) { - for (int i{0}; i != common::LogicalOperator_enumSize; ++i) { - auto names{GetOperatorNames(context, LogicalOperator{i})}; - if (llvm::is_contained(names, str)) { - return names; - } - } - for (int i{0}; i != common::RelationalOperator_enumSize; ++i) { - auto names{GetOperatorNames(context, RelationalOperator{i})}; - if (llvm::is_contained(names, str)) { - return names; - } - } - } - return {str}; -} - bool IsLogicalConstant( const SemanticsContext &context, const SourceName &name) { std::string str{name.ToString()}; diff --git a/flang/lib/Semantics/resolve-names-utils.h 
b/flang/lib/Semantics/resolve-names-utils.h index 5b537d80e5f880..64784722ff4f84 100644 --- a/flang/lib/Semantics/resolve-names-utils.h +++ b/flang/lib/Semantics/resolve-names-utils.h @@ -51,11 +51,6 @@ parser::MessageFixedText WithSeverity( bool IsIntrinsicOperator(const SemanticsContext &, const SourceName &); bool IsLogicalConstant(const SemanticsContext &, const SourceName &); -// Some intrinsic operators have more than one name (e.g. `operator(.eq.)` and -// `operator(==)`). GetAllNames() returns them all, including symbolName. -std::forward_list GetAllNames( - const SemanticsContext &, const SourceName &); - template MaybeIntExpr EvaluateIntExpr(SemanticsContext &context, const T &expr) { if (MaybeExpr maybeExpr{ diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 4d2a0a607abe89..379d5d0eb3eef0 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1654,6 +1654,37 @@ bool HasDefinedIo(common::DefinedIo which, const DerivedTypeSpec &derived, return parentType && HasDefinedIo(which, *parentType, scope); } +template +std::forward_list GetOperatorNames( + const SemanticsContext &context, E opr) { + std::forward_list result; + for (const char *name : context.languageFeatures().GetNames(opr)) { + result.emplace_front("operator("s + name + ')'); + } + return result; +} + +std::forward_list GetAllNames( + const SemanticsContext &context, const SourceName &name) { + std::string str{name.ToString()}; + if (!name.empty() && name.end()[-1] == ')' && + name.ToString().rfind("operator(", 0) == 0) { + for (int i{0}; i != common::LogicalOperator_enumSize; ++i) { + auto names{GetOperatorNames(context, common::LogicalOperator{i})}; + if (llvm::is_contained(names, str)) { + return names; + } + } + for (int i{0}; i != common::RelationalOperator_enumSize; ++i) { + auto names{GetOperatorNames(context, common::RelationalOperator{i})}; + if (llvm::is_contained(names, str)) { + return names; + } + } + } + return {str}; +} + 
void WarnOnDeferredLengthCharacterScalar(SemanticsContext &context, const SomeExpr *expr, parser::CharBlock at, const char *what) { if (context.languageFeatures().ShouldWarn( From 7e16571eb02e7e9da24fee45359e981af783d0d0 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Thu, 10 Oct 2024 19:27:27 +0200 Subject: [PATCH 054/177] [lldb][libc++] Hide all libc++ implementation details from stacktraces (#108870) This commit changes the libc++ frame recognizer to hide implementation details of libc++ more aggressively. The applied heuristic is rather straightforward: We consider every function name starting with `__` as an implementation detail. This works pretty neatly for `std::invoke`, `std::function`, `std::sort`, `std::map::emplace` and many others. Also, this should align quite nicely with libc++'s general coding convention of using the `__` for their implementation details, thereby keeping the future maintenance effort low. However, this heuristic by itself does not work in 100% of the cases: E.g., `std::ranges::sort` is not a function, but an object with an overloaded `operator()`, which means that there is no actual call `std::ranges::sort` in the call stack. Instead, there is a `std::ranges::__sort::operator()` call. 
To make sure that we don't hide this stack frame, we never hide the frame which represents the entry point from user code into libc++ code --- libcxx/docs/UserDocumentation.rst | 29 +++++++ .../CPlusPlus/CPPLanguageRuntime.cpp | 49 ++++++----- .../Makefile | 2 +- .../TestLibcxxInternalsRecognizer.py | 67 +++++++++++++++ .../cpp/libcxx-internals-recognizer/main.cpp | 86 +++++++++++++++++++ .../TestStdInvokeRecognizer.py | 44 ---------- .../lang/cpp/std-invoke-recognizer/main.cpp | 30 ------- 7 files changed, 211 insertions(+), 96 deletions(-) rename lldb/test/API/lang/cpp/{std-invoke-recognizer => libcxx-internals-recognizer}/Makefile (68%) create mode 100644 lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py create mode 100644 lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp delete mode 100644 lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py delete mode 100644 lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst index f5e55994aa7572..1db437ce58b95e 100644 --- a/libcxx/docs/UserDocumentation.rst +++ b/libcxx/docs/UserDocumentation.rst @@ -355,6 +355,35 @@ Third-party Integrations Libc++ provides integration with a few third-party tools. +Debugging libc++ internals in LLDB +---------------------------------- + +LLDB hides the implementation details of libc++ by default. + +E.g., when setting a breakpoint in a comparator passed to ``std::sort``, the +backtrace will read as + +.. 
code-block:: + + (lldb) thread backtrace + * thread #1, name = 'a.out', stop reason = breakpoint 3.1 + * frame #0: 0x000055555555520e a.out`my_comparator(a=1, b=8) at test-std-sort.cpp:6:3 + frame #7: 0x0000555555555615 a.out`void std::__1::sort[abi:ne200000], bool (*)(int, int)>(__first=(item = 8), __last=(item = 0), __comp=(a.out`my_less(int, int) at test-std-sort.cpp:5)) at sort.h:1003:3 + frame #8: 0x000055555555531a a.out`main at test-std-sort.cpp:24:3 + +Note how the caller of ``my_comparator`` is shown as ``std::sort``. Looking at +the frame numbers, we can see that frames #1 until #6 were hidden. Those frames +represent internal implementation details such as ``__sort4`` and similar +utility functions. + +To also show those implementation details, use ``thread backtrace -u``. +Alternatively, to disable those compact backtraces, use ``frame recognizer list`` +and ``frame recognizer disable`` on the "libc++ frame recognizer". + +Futhermore, stepping into libc++ functions is disabled by default. This is controlled via the +setting ``target.process.thread.step-avoid-regexp`` which defaults to ``^std::`` and can be +disabled using ``settings set target.process.thread.step-avoid-regexp ""``. + GDB Pretty printers for libc++ ------------------------------ diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index faa05e8f834ea1..e7ca3f655f237c 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -45,7 +45,7 @@ char CPPLanguageRuntime::ID = 0; /// A frame recognizer that is installed to hide libc++ implementation /// details from the backtrace. 
class LibCXXFrameRecognizer : public StackFrameRecognizer { - std::array m_hidden_regex; + std::array m_hidden_regex; RecognizedStackFrameSP m_hidden_frame; struct LibCXXHiddenFrame : public RecognizedStackFrame { @@ -55,28 +55,17 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { public: LibCXXFrameRecognizer() : m_hidden_regex{ - // internal implementation details of std::function + // internal implementation details in the `std::` namespace // std::__1::__function::__alloc_func, void ()>::operator()[abi:ne200000] // std::__1::__function::__func, void ()>::operator() // std::__1::__function::__value_func::operator()[abi:ne200000]() const - RegularExpression{"" - R"(^std::__[^:]*::)" // Namespace. - R"(__function::.*::operator\(\))"}, - // internal implementation details of std::function in ABI v2 // std::__2::__function::__policy_invoker::__call_impl[abi:ne200000]> - RegularExpression{"" - R"(^std::__[^:]*::)" // Namespace. - R"(__function::.*::__call_impl)"}, - // internal implementation details of std::invoke - // std::__1::__invoke[abi:ne200000] - RegularExpression{ - R"(^std::__[^:]*::)" // Namespace. - R"(__invoke)"}, - // internal implementation details of std::invoke - // std::__1::__invoke_void_return_wrapper::__call[abi:ne200000] - RegularExpression{ - R"(^std::__[^:]*::)" // Namespace. 
- R"(__invoke_void_return_wrapper<.*>::__call)"} + // std::__1::__invoke[abi:ne200000] + // std::__1::__invoke_void_return_wrapper::__call[abi:ne200000] + RegularExpression{R"(^std::__[^:]*::__)"}, + // internal implementation details in the `std::ranges` namespace + // std::__1::ranges::__sort::__sort_fn_impl[abi:ne200000], std::__1::__wrap_iter, bool (*)(int, int), std::__1::identity> + RegularExpression{R"(^std::__[^:]*::ranges::__)"}, }, m_hidden_frame(new LibCXXHiddenFrame()) {} @@ -90,9 +79,27 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { if (!sc.function) return {}; - for (RegularExpression &r : m_hidden_regex) - if (r.Execute(sc.function->GetNameNoArguments())) + // Check if we have a regex match + for (RegularExpression &r : m_hidden_regex) { + if (!r.Execute(sc.function->GetNameNoArguments())) + continue; + + // Only hide this frame if the immediate caller is also within libc++. + lldb::ThreadSP thread_sp = frame_sp->GetThread(); + if (!thread_sp) + return {}; + lldb::StackFrameSP parent_frame_sp = + thread_sp->GetStackFrameAtIndex(frame_sp->GetFrameIndex() + 1); + if (!parent_frame_sp) + return {}; + const auto &parent_sc = + parent_frame_sp->GetSymbolContext(lldb::eSymbolContextFunction); + if (!parent_sc.function) + return {}; + if (parent_sc.function->GetNameNoArguments().GetStringRef().starts_with( + "std::")) return m_hidden_frame; + } return {}; } diff --git a/lldb/test/API/lang/cpp/std-invoke-recognizer/Makefile b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/Makefile similarity index 68% rename from lldb/test/API/lang/cpp/std-invoke-recognizer/Makefile rename to lldb/test/API/lang/cpp/libcxx-internals-recognizer/Makefile index 69014eb9c0f2eb..bb571299664934 100644 --- a/lldb/test/API/lang/cpp/std-invoke-recognizer/Makefile +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/Makefile @@ -1,5 +1,5 @@ CXX_SOURCES := main.cpp USE_LIBCPP := 1 -CXXFLAGS_EXTRAS := -std=c++17 +CXXFLAGS_EXTRAS := -std=c++20 include 
Makefile.rules diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py new file mode 100644 index 00000000000000..ad48208f21e502 --- /dev/null +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -0,0 +1,67 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class LibCxxInternalsRecognizerTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test_frame_recognizer(self): + """Test that implementation details of libc++ are hidden""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "break here", lldb.SBFileSpec("main.cpp") + ) + + expected_parents = { + "sort_less(int, int)": ["::sort", "test_algorithms"], + # `std::ranges::sort` is implemented as an object of types `__sort`. + # We never hide the frame of the entry-point into the standard library, even + # if the name starts with `__` which usually indicates an internal function. 
+ "ranges_sort_less(int, int)": [ + "ranges::__sort::operator()", + "test_algorithms", + ], + # `ranges::views::transform` internally uses `std::invoke`, and that + # call also shows up in the stack trace + "view_transform(int)": [ + "::invoke", + "ranges::transform_view", + "test_algorithms", + ], + # Various types of `invoke` calls + "consume_number(int)": ["::invoke", "test_invoke"], + "invoke_add(int, int)": ["::invoke", "test_invoke"], + "Callable::member_function(int) const": ["::invoke", "test_invoke"], + "Callable::operator()(int) const": ["::invoke", "test_invoke"], + # Containers + "MyKey::operator<(MyKey const&) const": [ + "less", + "::emplace", + "test_containers", + ], + } + stop_set = set() + while process.GetState() != lldb.eStateExited: + fn = thread.GetFrameAtIndex(0).GetFunctionName() + stop_set.add(fn) + self.assertIn(fn, expected_parents.keys()) + frame_id = 1 + for expected_parent in expected_parents[fn]: + # Skip all hidden frames + while ( + frame_id < thread.GetNumFrames() + and thread.GetFrameAtIndex(frame_id).IsHidden() + ): + frame_id = frame_id + 1 + # Expect the correct parent frame + self.assertIn( + expected_parent, thread.GetFrameAtIndex(frame_id).GetFunctionName() + ) + frame_id = frame_id + 1 + process.Continue() + + # Make sure that we actually verified all intended scenarios + self.assertEqual(len(stop_set), len(expected_parents)) diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp new file mode 100644 index 00000000000000..870301b0970439 --- /dev/null +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include + +bool sort_less(int a, int b) { + __builtin_printf("break here"); + return a < b; +} + +bool ranges_sort_less(int a, int b) { + __builtin_printf("break here"); + return a < b; +} + +int view_transform(int a) { + __builtin_printf("break here"); + return a * a; +} + 
+void test_algorithms() { + std::vector vec{8, 1, 3, 2}; + + // The internal frames for `std::sort` should be hidden + std::sort(vec.begin(), vec.end(), sort_less); + + // The internal frames for `ranges::sort` should be hidden + std::ranges::sort(vec.begin(), vec.end(), ranges_sort_less); + + // Same for views + for (auto x : vec | std::ranges::views::transform(view_transform)) { + // no-op + } +} + +void consume_number(int i) { __builtin_printf("break here"); } + +int invoke_add(int i, int j) { + __builtin_printf("break here"); + return i + j; +} + +struct Callable { + Callable(int num) : num_(num) {} + void operator()(int i) const { __builtin_printf("break here"); } + void member_function(int i) const { __builtin_printf("break here"); } + int num_; +}; + +void test_invoke() { + // Invoke a void-returning function + std::invoke(consume_number, -9); + + // Invoke a non-void-returning function + std::invoke(invoke_add, 1, 10); + + // Invoke a member function + const Callable foo(314159); + std::invoke(&Callable::member_function, foo, 1); + + // Invoke a function object + std::invoke(Callable(12), 18); +} + +struct MyKey { + int x; + bool operator==(const MyKey &) const = default; + bool operator<(const MyKey &other) const { + __builtin_printf("break here"); + return x < other.x; + } +}; + +void test_containers() { + std::map map; + map.emplace(MyKey{1}, 2); + map.emplace(MyKey{2}, 3); +} + +int main() { + test_algorithms(); + test_invoke(); + test_containers(); + return 0; +} diff --git a/lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py b/lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py deleted file mode 100644 index dbe29610bf7982..00000000000000 --- a/lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py +++ /dev/null @@ -1,44 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class 
LibCxxStdFunctionRecognizerTestCase(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - @add_test_categories(["libc++"]) - def test_frame_recognizer(self): - """Test that implementation details of `std::invoke` are hidden""" - self.build() - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( - self, "break here", lldb.SBFileSpec("main.cpp") - ) - - stop_cnt = 0 - while process.GetState() != lldb.eStateExited: - stop_cnt += 1 - self.assertTrue( - any( - f in thread.GetFrameAtIndex(0).GetFunctionName() - for f in ["consume_number", "add", "Callable"] - ) - ) - # Skip all hidden frames - frame_id = 1 - while ( - frame_id < thread.GetNumFrames() - and thread.GetFrameAtIndex(frame_id).IsHidden() - ): - frame_id = frame_id + 1 - # Expect `std::invoke` to be the direct parent - self.assertIn( - "::invoke", thread.GetFrameAtIndex(frame_id).GetFunctionName() - ) - # And right above that, there should be the `main` frame - self.assertIn( - "main", thread.GetFrameAtIndex(frame_id + 1).GetFunctionName() - ) - process.Continue() - - self.assertEqual(stop_cnt, 4) diff --git a/lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp b/lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp deleted file mode 100644 index bafbbd28386e8b..00000000000000 --- a/lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include - -void consume_number(int i) { __builtin_printf("break here"); } - -int add(int i, int j) { - // break here - return i + j; -} - -struct Callable { - Callable(int num) : num_(num) {} - void operator()(int i) const { __builtin_printf("break here"); } - void member_function(int i) const { __builtin_printf("break here"); } - int num_; -}; - -int main() { - // Invoke a void-returning function - std::invoke(consume_number, -9); - - // Invoke a non-void-returning function - std::invoke(add, 1, 10); - - // Invoke a member function - const Callable foo(314159); - std::invoke(&Callable::member_function, foo, 1); - - // Invoke a 
function object - std::invoke(Callable(12), 18); -} From 7026960ecfe156223c4126495c146ce0d42c64a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 10 Oct 2024 10:31:03 -0700 Subject: [PATCH 055/177] [flang][runtime][NFC] Fix header guard typo (#111741) Header guard was in sync with the filename. --- flang/include/flang/Runtime/allocator-registry.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang/include/flang/Runtime/allocator-registry.h index acfada506fafc6..3ccee56dc3fc0f 100644 --- a/flang/include/flang/Runtime/allocator-registry.h +++ b/flang/include/flang/Runtime/allocator-registry.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef FORTRAN_RUNTIME_ALLOCATOR_H_ -#define FORTRAN_RUNTIME_ALLOCATOR_H_ +#ifndef FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_ +#define FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_ #include "flang/Common/api-attrs.h" #include @@ -62,4 +62,4 @@ RT_OFFLOAD_VAR_GROUP_END } // namespace Fortran::runtime -#endif // FORTRAN_RUNTIME_ALLOCATOR_H_ +#endif // FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_ From 99c8557c175e88ff1c338c4c29e3a4d63c5a46cb Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Thu, 10 Oct 2024 18:52:20 +0100 Subject: [PATCH 056/177] Fix GCC build problem with 03483737a7a2 --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 4f350ea236da84..c909d13e4314b4 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -157,7 +157,7 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, /// Helper to create a typical indexing map for MatmulOp. 
Returns a list of /// AffineMap. -static SmallVector +static SmallVector getDefaultIndexingMapsForMatmul(MLIRContext *context) { AffineExpr d0, d1, d2; SmallVector indexingMaps; From 453d373e80f3ed8d67c92956101f7b9fa9467116 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 8 Oct 2024 23:22:44 -0700 Subject: [PATCH 057/177] [lsan] Add a few "\n" missing from VReport --- compiler-rt/lib/lsan/lsan_common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 438aa3a85f6724..6776598651ae9b 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -780,10 +780,10 @@ static bool PrintResults(LeakReport &report) { static bool CheckForLeaks() { if (&__lsan_is_turned_off && __lsan_is_turned_off()) { - VReport(1, "LeakSanitizer is disabled"); + VReport(1, "LeakSanitizer is disabled\n"); return false; } - VReport(1, "LeakSanitizer: checking for leaks"); + VReport(1, "LeakSanitizer: checking for leaks\n"); // Inside LockStuffAndStopTheWorld we can't run symbolizer, so we can't match // suppressions. However if a stack id was previously suppressed, it should be // suppressed in future checks as well. 
From 62b3a4bc708885f8ded09c900a79ad509f02e54a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 10 Oct 2024 19:40:02 +0100 Subject: [PATCH 058/177] [AMDGPU] Improve codegen for s_barrier_init (#111866) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bbdc006b9afcf0..3d8e03521e2b90 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10031,9 +10031,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, // If reference to barrier id is not an inline constant then it must be // referenced with M0[4:0]. Perform an OR with the member count to // include it in M0. - M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, - Op.getOperand(2), M0Val), - 0); + M0Val = DAG.getNode(ISD::OR, DL, MVT::i32, Op.getOperand(2), M0Val); } Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); } else if (IsInlinableBarID) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 4fb28b392c9ea9..1e13b40afb8be8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -737,11 +737,9 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 ; GFX12-SDAG-NEXT: 
s_barrier_init m0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe From ba530e6b64a27876ef5ea8e29806260d8bc00926 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 10 Oct 2024 11:42:23 -0700 Subject: [PATCH 059/177] [bazel] Add initial clang-doc config (#111779) --- .../clang-tools-extra/clang-doc/BUILD.bazel | 45 +++++++++++++++++++ .../clang-tools-extra/unittests/BUILD.bazel | 21 +++++++++ 2 files changed, 66 insertions(+) create mode 100644 utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel new file mode 100644 index 00000000000000..d7b9723b875c37 --- /dev/null +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel @@ -0,0 +1,45 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +package( + default_visibility = ["//visibility:public"], + features = ["layering_check"], +) + +licenses(["notice"]) + +cc_library( + name = "lib", + srcs = glob(["*.cpp"]), + hdrs = glob(["*.h"]), + includes = ["."], + deps = [ + "//clang:ast", + "//clang:basic", + "//clang:frontend", + "//clang:index", + "//clang:lex", + "//clang:tooling", + "//llvm:BitstreamReader", + "//llvm:BitstreamWriter", + "//llvm:Support", + ], +) + +cc_binary( + name = "clang-doc", + srcs = ["tool/ClangDocMain.cpp"], + stamp = 0, + deps = [ + ":lib", + "//clang:ast", + "//clang:ast_matchers", + "//clang:driver", + "//clang:frontend", + "//clang:tooling", + "//llvm:Support", + ], +) diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel index 12e87cec4b76b8..47ec4552856416 100644 --- 
a/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel @@ -53,3 +53,24 @@ cc_test( "//third-party/unittest:gtest_main", ], ) + +cc_test( + name = "clang_doc_test", + size = "small", + srcs = glob( + [ + "clang-doc/*.cpp", + "clang-doc/*.h", + ], + allow_empty = False, + ), + deps = [ + "//clang:ast", + "//clang:basic", + "//clang-tools-extra/clang-doc:lib", + "//llvm:BitstreamReader", + "//llvm:BitstreamWriter", + "//third-party/unittest:gtest", + "//third-party/unittest:gtest_main", + ], +) From d36cef0b173329fa1f94ff3a92da6a50da4aff9e Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:44:44 -0700 Subject: [PATCH 060/177] [HLSL][DXIL] Implement WaveGetLaneIndex Intrinsic (#111576) - add additional lowering for directx backend in CGBuiltin.cpp - add directx intrinsic to IntrinsicsDirectX.td - add semantic check of arguments in SemaHLSL.cpp - add mapping to DXIL op in DXIL.td - add testing of semantics in WaveGetLaneIndex-errors.hlsl - add testing of dxil lowering in WaveGetLaneIndex.ll Resolves #70105 --- clang/lib/CodeGen/CGBuiltin.cpp | 18 ++++++++++++++--- clang/lib/Sema/SemaHLSL.cpp | 5 +++++ .../builtins/wave_get_lane_index_simple.hlsl | 20 +++++++++++++------ .../BuiltIns/WaveGetLaneIndex-errors.hlsl | 6 ++++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + llvm/lib/Target/DirectX/DXIL.td | 9 +++++++++ llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll | 10 ++++++++++ 7 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2449b90a0e7902..06140d6d4ce27b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18867,9 +18867,21 @@ case 
Builtin::BI__builtin_hlsl_elementwise_isinf: { ArrayRef{Op0, Op1}, nullptr, "hlsl.step"); } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { - return EmitRuntimeCall(CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, {}, false), "__hlsl_wave_get_lane_index", - {}, false, true)); + // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in + // defined in SPIRVBuiltins.td. So instead we manually get the matching name + // for the DirectX intrinsic and the demangled builtin name + switch (CGM.getTarget().getTriple().getArch()) { + case llvm::Triple::dxil: + return EmitRuntimeCall(Intrinsic::getDeclaration( + &CGM.getModule(), Intrinsic::dx_wave_getlaneindex)); + case llvm::Triple::spirv: + return EmitRuntimeCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, {}, false), + "__hlsl_wave_get_lane_index", {}, false, true)); + default: + llvm_unreachable( + "Intrinsic WaveGetLaneIndex not supported by target architecture"); + } } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 05e6e7800112df..b0acbbbbb2b1f0 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1992,6 +1992,11 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_wave_get_lane_index: { + if (SemaRef.checkArgCount(TheCall, 0)) + return true; + break; + } case Builtin::BI__builtin_elementwise_acos: case Builtin::BI__builtin_elementwise_asin: case Builtin::BI__builtin_elementwise_atan: diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl index 8f52d81091c180..06a2715b00e969 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl @@ 
-1,14 +1,22 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -finclude-default-header \ +// RUN: -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,CHECK-DXIL -// CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] { -// CHECK: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() -// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] -uint test_1() { +// CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { +// CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { +// CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() +// CHECK-SPIRV: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] +// CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex() +int test_1() { return WaveGetLaneIndex(); } -// CHECK: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-SPIRV: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-DXIL: declare i32 @llvm.dx.wave.getlaneindex() [[A1:#[0-9]+]] // CHECK-DAG: attributes [[A0]] = { {{.*}}convergent{{.*}} } // CHECK-DAG: attributes [[A1]] = { {{.*}}convergent{{.*}} } diff --git a/clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl new file mode 100644 index 00000000000000..6208442fab6590 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +int test_too_many_arg(int x) { + return 
__builtin_hlsl_wave_get_lane_index(x); + // expected-error@-1 {{too many arguments to function call, expected 0, have 1}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index f2b9e286ebb476..1cf6acbf126475 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -83,6 +83,7 @@ def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLV def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 9aa0af3e3a6b17..e8f56b18730d71 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -801,3 +801,12 @@ def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { let stages = [Stages]; let attributes = [Attributes]; } + +def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { + let Doc = "returns the index of the current lane in the wave"; + let LLVMIntrinsic = int_dx_wave_getlaneindex; + let arguments = []; + let result = Int32Ty; + let stages = [Stages]; + let attributes = [Attributes]; +} diff --git a/llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll b/llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll new file mode 100644 index 00000000000000..86b7ea4f962f77 --- /dev/null +++ 
b/llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll @@ -0,0 +1,10 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +define void @main() { +entry: +; CHECK: call i32 @dx.op.waveGetLaneIndex(i32 111) + %0 = call i32 @llvm.dx.wave.getlaneindex() + ret void +} + +declare i32 @llvm.dx.wave.getlaneindex() From b800ff67dae59e194c8e9fc5d795a5932dc726f8 Mon Sep 17 00:00:00 2001 From: Donough Liu Date: Fri, 11 Oct 2024 02:46:19 +0800 Subject: [PATCH 061/177] [lldb][debugserver][NFC] Simplify macOS thread name fetching. (#111684) Remove unnecessary `proc_pidinfo` calling. --- .../debugserver/source/MacOSX/MachThread.cpp | 48 ++++++++++--------- .../debugserver/source/MacOSX/MachThread.h | 8 ++-- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/lldb/tools/debugserver/source/MacOSX/MachThread.cpp b/lldb/tools/debugserver/source/MacOSX/MachThread.cpp index d34914be802041..de2bebfcec7090 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachThread.cpp +++ b/lldb/tools/debugserver/source/MacOSX/MachThread.cpp @@ -31,9 +31,8 @@ MachThread::MachThread(MachProcess *process, bool is_64_bit, m_state(eStateUnloaded), m_state_mutex(PTHREAD_MUTEX_RECURSIVE), m_suspend_count(0), m_stop_exception(), m_arch_up(DNBArchProtocol::Create(this)), m_reg_sets(NULL), - m_num_reg_sets(0), m_ident_info(), m_proc_threadinfo(), - m_dispatch_queue_name(), m_is_64_bit(is_64_bit), - m_pthread_qos_class_decode(nullptr) { + m_num_reg_sets(0), m_extended_info(), m_dispatch_queue_name(), + m_is_64_bit(is_64_bit), m_pthread_qos_class_decode(nullptr) { nub_size_t num_reg_sets = 0; m_reg_sets = m_arch_up->GetRegisterSetInfo(&num_reg_sets); m_num_reg_sets = num_reg_sets; @@ -255,7 +254,7 @@ struct thread_basic_info *MachThread::GetBasicInfo() { bool MachThread::GetBasicInfo(thread_t thread, struct thread_basic_info *basicInfoPtr) { if (MachPortNumberIsValid(thread)) { - unsigned int info_count = THREAD_BASIC_INFO_COUNT; + mach_msg_type_number_t info_count = 
THREAD_BASIC_INFO_COUNT; kern_return_t err = ::thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)basicInfoPtr, &info_count); if (err == KERN_SUCCESS) @@ -265,6 +264,26 @@ bool MachThread::GetBasicInfo(thread_t thread, return false; } +struct thread_extended_info *MachThread::GetExtendedInfo() { + if (MachThread::GetExtendedInfo(m_mach_port_number, &m_extended_info)) + return &m_extended_info; + return NULL; +} + +bool MachThread::GetExtendedInfo(thread_t thread, + struct thread_extended_info *extendedInfoPtr) { + if (MachPortNumberIsValid(thread)) { + mach_msg_type_number_t info_count = THREAD_EXTENDED_INFO_COUNT; + kern_return_t err = + ::thread_info(thread, THREAD_EXTENDED_INFO, + (thread_info_t)extendedInfoPtr, &info_count); + if (err == KERN_SUCCESS) + return true; + } + ::memset(extendedInfoPtr, 0, sizeof(struct thread_extended_info)); + return false; +} + bool MachThread::ThreadIDIsValid(uint64_t thread) { return thread != 0; } bool MachThread::MachPortNumberIsValid(thread_t thread) { @@ -579,28 +598,13 @@ uint32_t MachThread::NumSupportedHardwareWatchpoints() const { return m_arch_up->NumSupportedHardwareWatchpoints(); } -bool MachThread::GetIdentifierInfo() { +const char *MachThread::GetName() { // Don't try to get the thread info once and cache it for the life of the // thread. It changes over time, for instance // if the thread name changes, then the thread_handle also changes... So you // have to refetch it every time. 
- mach_msg_type_number_t count = THREAD_IDENTIFIER_INFO_COUNT; - kern_return_t kret = ::thread_info(m_mach_port_number, THREAD_IDENTIFIER_INFO, - (thread_info_t)&m_ident_info, &count); - return kret == KERN_SUCCESS; - - return false; -} - -const char *MachThread::GetName() { - if (GetIdentifierInfo()) { - int len = ::proc_pidinfo(m_process->ProcessID(), PROC_PIDTHREADINFO, - m_ident_info.thread_handle, &m_proc_threadinfo, - sizeof(m_proc_threadinfo)); - - if (len && m_proc_threadinfo.pth_name[0]) - return m_proc_threadinfo.pth_name; - } + if (GetExtendedInfo() && m_extended_info.pth_name[0]) + return m_extended_info.pth_name; return NULL; } diff --git a/lldb/tools/debugserver/source/MacOSX/MachThread.h b/lldb/tools/debugserver/source/MacOSX/MachThread.h index 5466c6f9f95095..0c78ef1a337ed3 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachThread.h +++ b/lldb/tools/debugserver/source/MacOSX/MachThread.h @@ -108,6 +108,7 @@ class MachThread { bool IsUserReady(); struct thread_basic_info *GetBasicInfo(); + struct thread_extended_info *GetExtendedInfo(); const char *GetBasicInfoAsString() const; const char *GetName(); @@ -126,8 +127,8 @@ class MachThread { protected: static bool GetBasicInfo(thread_t threadID, struct thread_basic_info *basic_info); - - bool GetIdentifierInfo(); + static bool GetExtendedInfo(thread_t threadID, + struct thread_extended_info *extended_info); // const char * // GetDispatchQueueName(); @@ -152,8 +153,7 @@ class MachThread { const DNBRegisterSetInfo *m_reg_sets; // Register set information for this thread nub_size_t m_num_reg_sets; - thread_identifier_info_data_t m_ident_info; - struct proc_threadinfo m_proc_threadinfo; + thread_extended_info_data_t m_extended_info; std::string m_dispatch_queue_name; bool m_is_64_bit; From c2063de1593610eda0f4de33c3b89324642ed54c Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Thu, 10 Oct 2024 12:58:28 -0600 Subject: [PATCH 062/177] Switch DirectX Target to use the Itanium ABI (#111632) To consolidate 
behavior of function mangling and limit the number of places that ABI changes will need to be made, this switches the DirectX target used for HLSL to use the Itanium ABI from the Microsoft ABI. The Itanium ABI has greater flexibility in decisions regarding mangling of new types of which we have more than a few yet to add. One effect of this will be that linking library shaders compiled with DXC will not be possible with shaders compiled with clang. That isn't considered a terribly interesting use case and one that would likely have been onerous to maintain anyway. This involved adding a function to call all global destructors as the Microsoft ABI had done. This requires a few changes to tests. Most notably the mangling style has changed which accounts for most of the changes. In making those changes, I took the opportunity to harmonize some very similar tests for greater consistency. I also shaved off some unneeded run flags that had probably been copied over from one test to another. Other changes effected by using the new ABI include using different types when manipulating smaller bitfields, eliminating an unnecessary alloca in one instance in this-assignment.hlsl, changing the way static local initialization is guarded, and changing the order of inout parameters getting copied in and out. That last is a subtle change in functionality, but one where there was sufficient inconsistency in the past that standardizing is important, but the particular direction of the standardization is less important for the sake of existing shaders. 
fixes #110736 --- clang/lib/Basic/Targets/DirectX.h | 2 +- clang/lib/CodeGen/ItaniumCXXABI.cpp | 4 + clang/test/CodeGenHLSL/ArrayTemporary.hlsl | 8 +- .../BasicFeatures/OutputArguments.hlsl | 26 ++-- .../GlobalConstructorFunction.hlsl | 8 +- .../CodeGenHLSL/GlobalConstructorLib.hlsl | 12 +- .../test/CodeGenHLSL/GlobalConstructors.hlsl | 2 +- clang/test/CodeGenHLSL/GlobalDestructors.hlsl | 14 +- clang/test/CodeGenHLSL/basic_types.hlsl | 64 ++++----- .../builtins/RWBuffer-annotations.hlsl | 12 +- .../builtins/RWBuffer-elementtype.hlsl | 26 ++-- .../RasterizerOrderedBuffer-annotations.hlsl | 12 +- .../StructuredBuffer-annotations.hlsl | 12 +- .../StructuredBuffer-elementtype.hlsl | 26 ++-- clang/test/CodeGenHLSL/builtins/abs.hlsl | 73 ++++++----- clang/test/CodeGenHLSL/builtins/ceil.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/clamp.hlsl | 101 +++++++------- clang/test/CodeGenHLSL/builtins/cos.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/exp.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/exp2.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/floor.hlsl | 37 +++--- .../CodeGenHLSL/builtins/hlsl_resource_t.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/log.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/log10.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/log2.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/max.hlsl | 101 +++++++------- clang/test/CodeGenHLSL/builtins/min.hlsl | 101 +++++++------- clang/test/CodeGenHLSL/builtins/pow.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/round.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/saturate.hlsl | 123 +++++++----------- clang/test/CodeGenHLSL/builtins/sin.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/sqrt.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/trunc.hlsl | 39 +++--- clang/test/CodeGenHLSL/export.hlsl | 11 +- clang/test/CodeGenHLSL/float3.hlsl | 2 +- clang/test/CodeGenHLSL/group_shared.hlsl | 2 +- clang/test/CodeGenHLSL/half.hlsl | 4 +- .../implicit-norecurse-attrib.hlsl | 8 +- 
.../test/CodeGenHLSL/inline-constructors.hlsl | 4 +- clang/test/CodeGenHLSL/inline-functions.hlsl | 10 +- .../semantics/GroupIndex-codegen.hlsl | 2 +- clang/test/CodeGenHLSL/shift-mask.hlsl | 43 +++++- clang/test/CodeGenHLSL/sret_output.hlsl | 7 +- clang/test/CodeGenHLSL/static-local-ctor.hlsl | 14 +- .../static_global_and_function_in_cb.hlsl | 7 +- .../CodeGenHLSL/this-assignment-overload.hlsl | 8 +- clang/test/CodeGenHLSL/this-assignment.hlsl | 7 +- clang/test/CodeGenHLSL/this-reference.hlsl | 4 +- 48 files changed, 667 insertions(+), 680 deletions(-) diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index cf7ea5e83503dc..19b61252409b09 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -62,7 +62,7 @@ class LLVM_LIBRARY_VISIBILITY DirectXTargetInfo : public TargetInfo { PlatformName = llvm::Triple::getOSTypeName(Triple.getOS()); resetDataLayout("e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:" "32-f64:64-n8:16:32:64"); - TheCXXABI.set(TargetCXXABI::Microsoft); + TheCXXABI.set(TargetCXXABI::GenericItanium); } bool useFP16ConversionIntrinsics() const override { return false; } void getTargetDefines(const LangOptions &Opts, diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 965e09a7a760ec..75dab596e1b2c4 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -2997,6 +2997,10 @@ void ItaniumCXXABI::registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D, if (D.isNoDestroy(CGM.getContext())) return; + // HLSL doesn't support atexit. + if (CGM.getLangOpts().HLSL) + return CGM.AddCXXDtorEntry(dtor, addr); + // OpenMP offloading supports C++ constructors and destructors but we do not // always have 'atexit' available. Instead lower these to use the LLVM global // destructors which we can handle directly in the runtime. 
Note that this is diff --git a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl index 63a30b61440eb5..7d77c0aff736cc 100644 --- a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl +++ b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl @@ -68,11 +68,11 @@ void call4(float Arr[2][2]) { // CHECK: [[Tmp2:%.*]] = alloca [4 x float] // CHECK: [[Tmp3:%.*]] = alloca [3 x i32] // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp1]], ptr align 4 [[FA2]], i32 8, i1 false) -// CHECK: call void @"??$template_fn@$$BY01M@@YAXY01M@Z"(ptr noundef byval([2 x float]) align 4 [[Tmp1]]) +// CHECK: call void @_Z11template_fnIA2_fEvT_(ptr noundef byval([2 x float]) align 4 [[Tmp1]]) // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp2]], ptr align 4 [[FA4]], i32 16, i1 false) -// CHECK: call void @"??$template_fn@$$BY03M@@YAXY03M@Z"(ptr noundef byval([4 x float]) align 4 [[Tmp2]]) +// CHECK: call void @_Z11template_fnIA4_fEvT_(ptr noundef byval([4 x float]) align 4 [[Tmp2]]) // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp3]], ptr align 4 [[IA3]], i32 12, i1 false) -// CHECK: call void @"??$template_fn@$$BY02H@@YAXY02H@Z"(ptr noundef byval([3 x i32]) align 4 [[Tmp3]]) +// CHECK: call void @_Z11template_fnIA3_iEvT_(ptr noundef byval([3 x i32]) align 4 [[Tmp3]]) template void template_fn(T Val) {} @@ -90,7 +90,7 @@ void template_call(float FA2[2], float FA4[4], int IA3[3]) { // CHECK: [[Addr:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0 // CHECK: [[Tmp:%.*]] = load float, ptr [[Addr]] -// CHECK: call void @"??$template_fn@M@@YAXM@Z"(float noundef [[Tmp]]) +// CHECK: call void @_Z11template_fnIfEvT_(float noundef [[Tmp]]) // CHECK: [[Idx0:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0 // CHECK: [[Val0:%.*]] = load float, ptr [[Idx0]] diff --git a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl index 
58237889db1dca..6afead4f233660 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl @@ -260,10 +260,10 @@ void order_matters(inout int X, inout int Y) { // CHECK: store i32 [[VVal]], ptr [[Tmp0]] // CHECK: [[VVal:%.*]] = load i32, ptr [[V]] // CHECK: store i32 [[VVal]], ptr [[Tmp1]] -// CHECK: call void {{.*}}order_matters{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp1]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp0]]) -// CHECK: [[Arg1Val:%.*]] = load i32, ptr [[Tmp1]] +// CHECK: call void {{.*}}order_matters{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp0]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp1]]) +// CHECK: [[Arg1Val:%.*]] = load i32, ptr [[Tmp0]] // CHECK: store i32 [[Arg1Val]], ptr [[V]] -// CHECK: [[Arg2Val:%.*]] = load i32, ptr [[Tmp0]] +// CHECK: [[Arg2Val:%.*]] = load i32, ptr [[Tmp1]] // CHECK: store i32 [[Arg2Val]], ptr [[V]] // OPT: ret i32 2 @@ -289,17 +289,19 @@ void setFour(inout int I) { // CHECK: [[B:%.*]] = alloca %struct.B // CHECK: [[Tmp:%.*]] = alloca i32 -// CHECK: [[BFLoad:%.*]] = load i32, ptr [[B]] -// CHECK: [[BFshl:%.*]] = shl i32 [[BFLoad]], 24 -// CHECK: [[BFashr:%.*]] = ashr i32 [[BFshl]], 24 -// CHECK: store i32 [[BFashr]], ptr [[Tmp]] +// CHECK: [[BFLoad:%.*]] = load i16, ptr [[B]] +// CHECK: [[BFshl:%.*]] = shl i16 [[BFLoad]], 8 +// CHECK: [[BFashr:%.*]] = ashr i16 [[BFshl]], 8 +// CHECK: [[BFcast:%.*]] = sext i16 [[BFashr]] to i32 +// CHECK: store i32 [[BFcast]], ptr [[Tmp]] // CHECK: call void {{.*}}setFour{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp]]) // CHECK: [[RetVal:%.*]] = load i32, ptr [[Tmp]] -// CHECK: [[BFLoad:%.*]] = load i32, ptr [[B]] -// CHECK: [[BFValue:%.*]] = and i32 [[RetVal]], 255 -// CHECK: [[ZerodField:%.*]] = and i32 [[BFLoad]], -256 -// CHECK: [[BFSet:%.*]] = or i32 [[ZerodField]], [[BFValue]] -// CHECK: store i32 
[[BFSet]], ptr [[B]] +// CHECK: [[TruncVal:%.*]] = trunc i32 [[RetVal]] to i16 +// CHECK: [[BFLoad:%.*]] = load i16, ptr [[B]] +// CHECK: [[BFValue:%.*]] = and i16 [[TruncVal]], 255 +// CHECK: [[ZerodField:%.*]] = and i16 [[BFLoad]], -256 +// CHECK: [[BFSet:%.*]] = or i16 [[ZerodField]], [[BFValue]] +// CHECK: store i16 [[BFSet]], ptr [[B]] // OPT: ret i32 8 export int case11() { diff --git a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl index b39311ad67cd62..c0eb1b138ed047 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl @@ -25,11 +25,11 @@ void main(unsigned GI : SV_GroupIndex) {} // CHECK: define void @main() // CHECK-NEXT: entry: // Verify function constructors are emitted -// NOINLINE-NEXT: call void @"?call_me_first@@YAXXZ"() -// NOINLINE-NEXT: call void @"?then_call_me@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z13call_me_firstv() +// NOINLINE-NEXT: call void @_Z12then_call_mev() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @"?main@@YAXI@Z"(i32 %0) -// NOINLINE-NEXT: call void @"?call_me_last@@YAXXZ"( +// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) +// NOINLINE-NEXT: call void @_Z12call_me_lastv( // NOINLINE-NEXT: ret void // Verify constructor calls are inlined when AlwaysInline is run diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl index 78f6475462bc47..09c44f6242c53c 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl @@ -13,7 +13,7 @@ void FirstEntry() {} // CHECK: define void @FirstEntry() // CHECK-NEXT: entry: // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() -// NOINLINE-NEXT: call void @"?FirstEntry@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z10FirstEntryv() // Verify inlining leaves only calls to "llvm." 
intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -25,7 +25,7 @@ void SecondEntry() {} // CHECK: define void @SecondEntry() // CHECK-NEXT: entry: // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() -// NOINLINE-NEXT: call void @"?SecondEntry@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z11SecondEntryv() // Verify inlining leaves only calls to "llvm." intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -33,6 +33,10 @@ void SecondEntry() {} // Verify the constructor is alwaysinline // NOINLINE: ; Function Attrs: {{.*}}alwaysinline -// NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[IntAttr:\#[0-9]+]] +// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ev({{.*}} [[CtorAttr:\#[0-9]+]] -// NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline +// NOINLINE: ; Function Attrs: {{.*}}alwaysinline +// NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]] + +// NOINLINE-DAG: attributes [[InitAttr]] = {{.*}} alwaysinline +// NOINLINE-DAG: attributes [[CtorAttr]] = {{.*}} alwaysinline diff --git a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl index 7e2f288726c954..7b26dba0d19010 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl @@ -12,5 +12,5 @@ void main(unsigned GI : SV_GroupIndex) {} //CHECK-NEXT: entry: //CHECK-NEXT: call void @_GLOBAL__sub_I_GlobalConstructors.hlsl() //CHECK-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -//CHECK-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +//CHECK-NEXT: call void @_Z4mainj(i32 %0) //CHECK-NEXT: ret void diff --git a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl index ea28354222f885..f98318601134bb 100644 --- a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl +++ 
b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,NOINLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,NOINLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK // Tests that constructors and destructors are appropriately generated for globals // and that their calls are inlined when AlwaysInline is run @@ -59,7 +59,7 @@ void main(unsigned GI : SV_GroupIndex) { // Verify destructor is emitted // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalDestructors.hlsl() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) // NOINLINE-NEXT: call void @_GLOBAL__D_a() // NOINLINE-NEXT: ret void // Verify inlining leaves only calls to "llvm." 
intrinsics @@ -71,8 +71,8 @@ void main(unsigned GI : SV_GroupIndex) { // NOINLINE: define internal void @_GLOBAL__D_a() [[IntAttr:\#[0-9]+]] // NOINLINE-NEXT: entry: -// NOINLINE-NEXT: call void @"??1Tail@@QAA@XZ"(ptr @"?T@?1??Wag@@YAXXZ@4UTail@@A") -// NOINLINE-NEXT: call void @"??1Pupper@@QAA@XZ"(ptr @"?GlobalPup@@3UPupper@@A") +// NOINLINE-NEXT: call void @_ZN4TailD1Ev(ptr @_ZZ3WagvE1T) +// NOINLINE-NEXT: call void @_ZN6PupperD1Ev(ptr @GlobalPup) // NOINLINE-NEXT: ret void // NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl index 15c963dfa666f4..d987af45a649fb 100644 --- a/clang/test/CodeGenHLSL/basic_types.hlsl +++ b/clang/test/CodeGenHLSL/basic_types.hlsl @@ -6,38 +6,38 @@ // RUN: -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s -// CHECK:"?uint16_t_Val@@3GA" = global i16 0, align 2 -// CHECK:"?int16_t_Val@@3FA" = global i16 0, align 2 -// CHECK:"?uint_Val@@3IA" = global i32 0, align 4 -// CHECK:"?uint64_t_Val@@3KA" = global i64 0, align 8 -// CHECK:"?int64_t_Val@@3JA" = global i64 0, align 8 -// CHECK:"?int16_t2_Val@@3T?$__vector@F$01@__clang@@A" = global <2 x i16> zeroinitializer, align 4 -// CHECK:"?int16_t3_Val@@3T?$__vector@F$02@__clang@@A" = global <3 x i16> zeroinitializer, align 8 -// CHECK:"?int16_t4_Val@@3T?$__vector@F$03@__clang@@A" = global <4 x i16> zeroinitializer, align 8 -// CHECK:"?uint16_t2_Val@@3T?$__vector@G$01@__clang@@A" = global <2 x i16> zeroinitializer, align 4 -// CHECK:"?uint16_t3_Val@@3T?$__vector@G$02@__clang@@A" = global <3 x i16> zeroinitializer, align 8 -// CHECK:"?uint16_t4_Val@@3T?$__vector@G$03@__clang@@A" = global <4 x i16> zeroinitializer, align 8 -// CHECK:"?int2_Val@@3T?$__vector@H$01@__clang@@A" = global <2 x i32> zeroinitializer, align 8 -// CHECK:"?int3_Val@@3T?$__vector@H$02@__clang@@A" = global <3 x i32> zeroinitializer, align 16 -// CHECK:"?int4_Val@@3T?$__vector@H$03@__clang@@A" = global <4 x i32> 
zeroinitializer, align 16 -// CHECK:"?uint2_Val@@3T?$__vector@I$01@__clang@@A" = global <2 x i32> zeroinitializer, align 8 -// CHECK:"?uint3_Val@@3T?$__vector@I$02@__clang@@A" = global <3 x i32> zeroinitializer, align 16 -// CHECK:"?uint4_Val@@3T?$__vector@I$03@__clang@@A" = global <4 x i32> zeroinitializer, align 16 -// CHECK:"?int64_t2_Val@@3T?$__vector@J$01@__clang@@A" = global <2 x i64> zeroinitializer, align 16 -// CHECK:"?int64_t3_Val@@3T?$__vector@J$02@__clang@@A" = global <3 x i64> zeroinitializer, align 32 -// CHECK:"?int64_t4_Val@@3T?$__vector@J$03@__clang@@A" = global <4 x i64> zeroinitializer, align 32 -// CHECK:"?uint64_t2_Val@@3T?$__vector@K$01@__clang@@A" = global <2 x i64> zeroinitializer, align 16 -// CHECK:"?uint64_t3_Val@@3T?$__vector@K$02@__clang@@A" = global <3 x i64> zeroinitializer, align 32 -// CHECK:"?uint64_t4_Val@@3T?$__vector@K$03@__clang@@A" = global <4 x i64> zeroinitializer, align 32 -// CHECK:"?half2_Val@@3T?$__vector@$f16@$01@__clang@@A" = global <2 x half> zeroinitializer, align 4 -// CHECK:"?half3_Val@@3T?$__vector@$f16@$02@__clang@@A" = global <3 x half> zeroinitializer, align 8 -// CHECK:"?half4_Val@@3T?$__vector@$f16@$03@__clang@@A" = global <4 x half> zeroinitializer, align 8 -// CHECK:"?float2_Val@@3T?$__vector@M$01@__clang@@A" = global <2 x float> zeroinitializer, align 8 -// CHECK:"?float3_Val@@3T?$__vector@M$02@__clang@@A" = global <3 x float> zeroinitializer, align 16 -// CHECK:"?float4_Val@@3T?$__vector@M$03@__clang@@A" = global <4 x float> zeroinitializer, align 16 -// CHECK:"?double2_Val@@3T?$__vector@N$01@__clang@@A" = global <2 x double> zeroinitializer, align 16 -// CHECK:"?double3_Val@@3T?$__vector@N$02@__clang@@A" = global <3 x double> zeroinitializer, align 32 -// CHECK:"?double4_Val@@3T?$__vector@N$03@__clang@@A" = global <4 x double> zeroinitializer, align 32 +// CHECK: @uint16_t_Val = global i16 0, align 2 +// CHECK: @int16_t_Val = global i16 0, align 2 +// CHECK: @uint_Val = global i32 0, align 4 +// CHECK: 
@uint64_t_Val = global i64 0, align 8 +// CHECK: @int64_t_Val = global i64 0, align 8 +// CHECK: @int16_t2_Val = global <2 x i16> zeroinitializer, align 4 +// CHECK: @int16_t3_Val = global <3 x i16> zeroinitializer, align 8 +// CHECK: @int16_t4_Val = global <4 x i16> zeroinitializer, align 8 +// CHECK: @uint16_t2_Val = global <2 x i16> zeroinitializer, align 4 +// CHECK: @uint16_t3_Val = global <3 x i16> zeroinitializer, align 8 +// CHECK: @uint16_t4_Val = global <4 x i16> zeroinitializer, align 8 +// CHECK: @int2_Val = global <2 x i32> zeroinitializer, align 8 +// CHECK: @int3_Val = global <3 x i32> zeroinitializer, align 16 +// CHECK: @int4_Val = global <4 x i32> zeroinitializer, align 16 +// CHECK: @uint2_Val = global <2 x i32> zeroinitializer, align 8 +// CHECK: @uint3_Val = global <3 x i32> zeroinitializer, align 16 +// CHECK: @uint4_Val = global <4 x i32> zeroinitializer, align 16 +// CHECK: @int64_t2_Val = global <2 x i64> zeroinitializer, align 16 +// CHECK: @int64_t3_Val = global <3 x i64> zeroinitializer, align 32 +// CHECK: @int64_t4_Val = global <4 x i64> zeroinitializer, align 32 +// CHECK: @uint64_t2_Val = global <2 x i64> zeroinitializer, align 16 +// CHECK: @uint64_t3_Val = global <3 x i64> zeroinitializer, align 32 +// CHECK: @uint64_t4_Val = global <4 x i64> zeroinitializer, align 32 +// CHECK: @half2_Val = global <2 x half> zeroinitializer, align 4 +// CHECK: @half3_Val = global <3 x half> zeroinitializer, align 8 +// CHECK: @half4_Val = global <4 x half> zeroinitializer, align 8 +// CHECK: @float2_Val = global <2 x float> zeroinitializer, align 8 +// CHECK: @float3_Val = global <3 x float> zeroinitializer, align 16 +// CHECK: @float4_Val = global <4 x float> zeroinitializer, align 16 +// CHECK: @double2_Val = global <2 x double> zeroinitializer, align 16 +// CHECK: @double3_Val = global <3 x double> zeroinitializer, align 32 +// CHECK: @double4_Val = global <4 x double> zeroinitializer, align 32 #ifdef NAMESPACED #define TYPE_DECL(T) hlsl::T 
T##_Val diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl index 7ca78e60fb9c59..e1e047485e4df0 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl @@ -16,9 +16,9 @@ void main() { } // CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$RWBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$RWBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$RWBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl index 
036c9c28ef2779..eca4f1598fd658 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl @@ -37,16 +37,16 @@ void main(int GI : SV_GroupIndex) { BufF32x3[GI] = 0; } -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$RWBuffer@F@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$RWBuffer@G@hlsl@@A", i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$RWBuffer@H@hlsl@@A", i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$RWBuffer@I@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$RWBuffer@J@hlsl@@A", i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$RWBuffer@K@hlsl@@A", i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$RWBuffer@$f16@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$RWBuffer@N@hlsl@@A", i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$RWBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$RWBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$RWBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$RWBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, +// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, +// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, +// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, +// CHECK: 
!{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl index bf70cc2456c8bc..5155f129025979 100644 --- a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl @@ -12,9 +12,9 @@ RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1 void main() {} // CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$RasterizerOrderedBuffer@M@hlsl@@A", i32 10, i32 9, i1 true, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$RasterizerOrderedBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 true, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$RasterizerOrderedBuffer@M@hlsl@@A", i32 10, i32 9, i1 true, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$RasterizerOrderedBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 true, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$RasterizerOrderedBuffer@M@hlsl@@A", i32 10, i32 9, i1 true, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$RasterizerOrderedBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 true, i32 4, i32 1} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 true, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 true, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 true, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, 
i1 true, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 true, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 true, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl index 16b7295c985f77..4d3d4908c396e6 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl @@ -14,9 +14,9 @@ void main() { } // CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, 
i32 9, i1 false, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl index 8ddf8a6004403e..326885efbeeaba 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl @@ -37,16 +37,16 @@ void main(int GI : SV_GroupIndex) { BufF32x3[GI] = 0; } -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$StructuredBuffer@F@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$StructuredBuffer@G@hlsl@@A", i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$StructuredBuffer@H@hlsl@@A", i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$StructuredBuffer@I@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$StructuredBuffer@J@hlsl@@A", i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$StructuredBuffer@K@hlsl@@A", i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$StructuredBuffer@$f16@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$StructuredBuffer@N@hlsl@@A", i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$StructuredBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$StructuredBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$StructuredBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$StructuredBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, +// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, +// CHECK: 
!{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, +// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl index ad65cab2721a2b..912e8a28347237 100644 --- a/clang/test/CodeGenHLSL/builtins/abs.hlsl +++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl @@ -1,93 +1,96 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF using hlsl::abs; #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_abs_int16_t // NATIVE_HALF: call i16 @llvm.abs.i16( int16_t test_abs_int16_t(int16_t p0) { return abs(p0); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 
x i16> @_Z17test_abs_int16_t2 // NATIVE_HALF: call <2 x i16> @llvm.abs.v2i16( int16_t2 test_abs_int16_t2(int16_t2 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_abs_int16_t3 // NATIVE_HALF: call <3 x i16> @llvm.abs.v3i16( int16_t3 test_abs_int16_t3(int16_t3 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_abs_int16_t4 // NATIVE_HALF: call <4 x i16> @llvm.abs.v4i16( int16_t4 test_abs_int16_t4(int16_t4 p0) { return abs(p0); } #endif // __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_abs_half // NATIVE_HALF: call half @llvm.fabs.f16( -// NO_HALF: define noundef float @"?test_abs_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_abs_half // NO_HALF: call float @llvm.fabs.f32(float %0) half test_abs_half(half p0) { return abs(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_abs_half2 // NATIVE_HALF: call <2 x half> @llvm.fabs.v2f16( -// NO_HALF: define noundef <2 x float> @"?test_abs_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_abs_half2 // NO_HALF: call <2 x float> @llvm.fabs.v2f32( half2 test_abs_half2(half2 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_abs_half3 // NATIVE_HALF: call <3 x half> @llvm.fabs.v3f16( -// NO_HALF: define noundef <3 x float> @"?test_abs_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_abs_half3 // NO_HALF: call <3 x float> @llvm.fabs.v3f32( half3 test_abs_half3(half3 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_abs_half4 // NATIVE_HALF: call <4 x half> 
@llvm.fabs.v4f16( -// NO_HALF: define noundef <4 x float> @"?test_abs_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_abs_half4 // NO_HALF: call <4 x float> @llvm.fabs.v4f32( half4 test_abs_half4(half4 p0) { return abs(p0); } -// CHECK: define noundef i32 @ + +// CHECK-LABEL: define noundef i32 @_Z12test_abs_int // CHECK: call i32 @llvm.abs.i32( int test_abs_int(int p0) { return abs(p0); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z13test_abs_int2 // CHECK: call <2 x i32> @llvm.abs.v2i32( int2 test_abs_int2(int2 p0) { return abs(p0); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z13test_abs_int3 // CHECK: call <3 x i32> @llvm.abs.v3i32( int3 test_abs_int3(int3 p0) { return abs(p0); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z13test_abs_int4 // CHECK: call <4 x i32> @llvm.abs.v4i32( int4 test_abs_int4(int4 p0) { return abs(p0); } -// CHECK: define noundef float @ + +// CHECK-LABEL: define noundef float @_Z14test_abs_float // CHECK: call float @llvm.fabs.f32( float test_abs_float(float p0) { return abs(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z15test_abs_float2 // CHECK: call <2 x float> @llvm.fabs.v2f32( float2 test_abs_float2(float2 p0) { return abs(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_abs_float3 // CHECK: call <3 x float> @llvm.fabs.v3f32( float3 test_abs_float3(float3 p0) { return abs(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_abs_float4 // CHECK: call <4 x float> @llvm.fabs.v4f32( float4 test_abs_float4(float4 p0) { return abs(p0); } -// CHECK: define noundef i64 @ + +// CHECK-LABEL: define noundef i64 @_Z16test_abs_int64_t // CHECK: call i64 @llvm.abs.i64( int64_t test_abs_int64_t(int64_t p0) { return abs(p0); } -// 
CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z17test_abs_int64_t2 // CHECK: call <2 x i64> @llvm.abs.v2i64( int64_t2 test_abs_int64_t2(int64_t2 p0) { return abs(p0); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z17test_abs_int64_t3 // CHECK: call <3 x i64> @llvm.abs.v3i64( int64_t3 test_abs_int64_t3(int64_t3 p0) { return abs(p0); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z17test_abs_int64_t4 // CHECK: call <4 x i64> @llvm.abs.v4i64( int64_t4 test_abs_int64_t4(int64_t4 p0) { return abs(p0); } -// CHECK: define noundef double @ + +// CHECK-LABEL: define noundef double @_Z15test_abs_double // CHECK: call double @llvm.fabs.f64( double test_abs_double(double p0) { return abs(p0); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z16test_abs_double2 // CHECK: call <2 x double> @llvm.fabs.v2f64( double2 test_abs_double2(double2 p0) { return abs(p0); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z16test_abs_double3 // CHECK: call <3 x double> @llvm.fabs.v3f64( double3 test_abs_double3(double3 p0) { return abs(p0); } -// CHECK: define noundef <4 x double> @ +// CHECK-LABEL: define noundef <4 x double> @_Z16test_abs_double4 // CHECK: call <4 x double> @llvm.fabs.v4f64( double4 test_abs_double4(double4 p0) { return abs(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl index be7725cd4d66c1..3aa78ec0ebcca3 100644 --- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl @@ -1,43 +1,42 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x 
hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF using hlsl::ceil; -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_ceil_half // NATIVE_HALF: call half @llvm.ceil.f16( -// NO_HALF: define noundef float @"?test_ceil_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z14test_ceil_half // NO_HALF: call float @llvm.ceil.f32(float %0) half test_ceil_half(half p0) { return ceil(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_ceil_half2 // NATIVE_HALF: call <2 x half> @llvm.ceil.v2f16( -// NO_HALF: define noundef <2 x float> @"?test_ceil_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_ceil_half2 // NO_HALF: call <2 x float> @llvm.ceil.v2f32( half2 test_ceil_half2(half2 p0) { return ceil(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_ceil_half3 // NATIVE_HALF: call <3 x half> @llvm.ceil.v3f16( -// NO_HALF: define noundef <3 x float> @"?test_ceil_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_ceil_half3 // NO_HALF: call <3 x float> @llvm.ceil.v3f32( half3 test_ceil_half3(half3 p0) { return ceil(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_ceil_half4 // NATIVE_HALF: call <4 x half> @llvm.ceil.v4f16( -// 
NO_HALF: define noundef <4 x float> @"?test_ceil_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_ceil_half4 // NO_HALF: call <4 x float> @llvm.ceil.v4f32( half4 test_ceil_half4(half4 p0) { return ceil(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z15test_ceil_float // CHECK: call float @llvm.ceil.f32( float test_ceil_float(float p0) { return ceil(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z16test_ceil_float2 // CHECK: call <2 x float> @llvm.ceil.v2f32( float2 test_ceil_float2(float2 p0) { return ceil(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z16test_ceil_float3 // CHECK: call <3 x float> @llvm.ceil.v3f32( float3 test_ceil_float3(float3 p0) { return ceil(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z16test_ceil_float4 // CHECK: call <4 x float> @llvm.ceil.v4f32( float4 test_ceil_float4(float4 p0) { return ceil(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl index 186114581e9c18..af8f6b9733a071 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl @@ -1,134 +1,133 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_clamp_short // NATIVE_HALF: call i16 @llvm.dx.clamp.i16( int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z17test_clamp_short2 // NATIVE_HALF: call <2 x i16> @llvm.dx.clamp.v2i16( int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_clamp_short3 // NATIVE_HALF: call <3 x i16> @llvm.dx.clamp.v3i16 int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_clamp_short4 // NATIVE_HALF: call <4 x i16> @llvm.dx.clamp.v4i16 int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z17test_clamp_ushort // NATIVE_HALF: call i16 @llvm.dx.uclamp.i16( uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z18test_clamp_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.dx.uclamp.v2i16 uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z18test_clamp_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.dx.uclamp.v3i16 uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define 
noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z18test_clamp_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.dx.uclamp.v4i16 uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); } #endif -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z14test_clamp_int // CHECK: call i32 @llvm.dx.clamp.i32( int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z15test_clamp_int2 // CHECK: call <2 x i32> @llvm.dx.clamp.v2i32 int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z15test_clamp_int3 // CHECK: call <3 x i32> @llvm.dx.clamp.v3i32 int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z15test_clamp_int4 // CHECK: call <4 x i32> @llvm.dx.clamp.v4i32 int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z15test_clamp_uint // CHECK: call i32 @llvm.dx.uclamp.i32( int test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z16test_clamp_uint2 // CHECK: call <2 x i32> @llvm.dx.uclamp.v2i32 uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z16test_clamp_uint3 // CHECK: call <3 x i32> @llvm.dx.uclamp.v3i32 uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z16test_clamp_uint4 // CHECK: call <4 x i32> @llvm.dx.uclamp.v4i32 uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef 
i64 @_Z15test_clamp_long // CHECK: call i64 @llvm.dx.clamp.i64( int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z16test_clamp_long2 // CHECK: call <2 x i64> @llvm.dx.clamp.v2i64 int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z16test_clamp_long3 // CHECK: call <3 x i64> @llvm.dx.clamp.v3i64 int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z16test_clamp_long4 // CHECK: call <4 x i64> @llvm.dx.clamp.v4i64 int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z16test_clamp_ulong // CHECK: call i64 @llvm.dx.uclamp.i64( -uint64_t test_clamp_long(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i64> @ +uint64_t test_clamp_ulong(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); } +// CHECK-LABEL: define noundef <2 x i64> @_Z17test_clamp_ulong2 // CHECK: call <2 x i64> @llvm.dx.uclamp.v2i64 -uint64_t2 test_clamp_long2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i64> @ +uint64_t2 test_clamp_ulong2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); } +// CHECK-LABEL: define noundef <3 x i64> @_Z17test_clamp_ulong3 // CHECK: call <3 x i64> @llvm.dx.uclamp.v3i64 -uint64_t3 test_clamp_long3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i64> @ +uint64_t3 test_clamp_ulong3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); } +// CHECK-LABEL: define noundef <4 x i64> @_Z17test_clamp_ulong4 // CHECK: call <4 x i64> @llvm.dx.uclamp.v4i64 -uint64_t4 test_clamp_long4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); } 
+uint64_t4 test_clamp_ulong4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_clamp_half // NATIVE_HALF: call half @llvm.dx.clamp.f16( -// NO_HALF: define noundef float @"?test_clamp_half +// NO_HALF-LABEL: define noundef float @_Z15test_clamp_half // NO_HALF: call float @llvm.dx.clamp.f32( half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_clamp_half2 // NATIVE_HALF: call <2 x half> @llvm.dx.clamp.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_clamp_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_clamp_half2 // NO_HALF: call <2 x float> @llvm.dx.clamp.v2f32( half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_clamp_half3 // NATIVE_HALF: call <3 x half> @llvm.dx.clamp.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_clamp_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_clamp_half3 // NO_HALF: call <3 x float> @llvm.dx.clamp.v3f32( half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_clamp_half4 // NATIVE_HALF: call <4 x half> @llvm.dx.clamp.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_clamp_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_clamp_half4 // NO_HALF: call <4 x float> @llvm.dx.clamp.v4f32( half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef float @"?test_clamp_float +// CHECK-LABEL: define noundef float @_Z16test_clamp_float // CHECK: call float @llvm.dx.clamp.f32( float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x float> 
@"?test_clamp_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z17test_clamp_float2 // CHECK: call <2 x float> @llvm.dx.clamp.v2f32 float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x float> @"?test_clamp_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z17test_clamp_float3 // CHECK: call <3 x float> @llvm.dx.clamp.v3f32 float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x float> @"?test_clamp_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z17test_clamp_float4 // CHECK: call <4 x float> @llvm.dx.clamp.v4f32 float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef double @ +// CHECK-LABEL: define noundef double @_Z17test_clamp_double // CHECK: call double @llvm.dx.clamp.f64( double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z18test_clamp_double2 // CHECK: call <2 x double> @llvm.dx.clamp.v2f64 double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z18test_clamp_double3 // CHECK: call <3 x double> @llvm.dx.clamp.v3f64 double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x double> @ +// CHECK-LABEL: define noundef <4 x double> @_Z18test_clamp_double4 // CHECK: call <4 x double> @llvm.dx.clamp.v4f64 double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); } diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl index 58b63097788136..4a41a9ec4a7cac 100644 --- a/clang/test/CodeGenHLSL/builtins/cos.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: 
dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_cos_half // NATIVE_HALF: call half @llvm.cos.f16( -// NO_HALF: define noundef float @"?test_cos_half +// NO_HALF-LABEL: define noundef float @_Z13test_cos_half // NO_HALF: call float @llvm.cos.f32( half test_cos_half(half p0) { return cos(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_cos_half2 // NATIVE_HALF: call <2 x half> @llvm.cos.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_cos_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_cos_half2 // NO_HALF: call <2 x float> @llvm.cos.v2f32( half2 test_cos_half2(half2 p0) { return cos(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_cos_half3 // NATIVE_HALF: call <3 x half> @llvm.cos.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_cos_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_cos_half3 // NO_HALF: call <3 x float> @llvm.cos.v3f32( half3 test_cos_half3(half3 p0) { return cos(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> 
@_Z14test_cos_half4 // NATIVE_HALF: call <4 x half> @llvm.cos.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_cos_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_cos_half4 // NO_HALF: call <4 x float> @llvm.cos.v4f32( half4 test_cos_half4(half4 p0) { return cos(p0); } -// CHECK: define noundef float @"?test_cos_float +// CHECK-LABEL: define noundef float @_Z14test_cos_float // CHECK: call float @llvm.cos.f32( float test_cos_float(float p0) { return cos(p0); } -// CHECK: define noundef <2 x float> @"?test_cos_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_cos_float2 // CHECK: call <2 x float> @llvm.cos.v2f32 float2 test_cos_float2(float2 p0) { return cos(p0); } -// CHECK: define noundef <3 x float> @"?test_cos_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_cos_float3 // CHECK: call <3 x float> @llvm.cos.v3f32 float3 test_cos_float3(float3 p0) { return cos(p0); } -// CHECK: define noundef <4 x float> @"?test_cos_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_cos_float4 // CHECK: call <4 x float> @llvm.cos.v4f32 float4 test_cos_float4(float4 p0) { return cos(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl index 773edbe3364fd2..3445cfd2e71f60 100644 --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: 
FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_exp_half // NATIVE_HALF: %elt.exp = call half @llvm.exp.f16( // NATIVE_HALF: ret half %elt.exp -// NO_HALF: define noundef float @"?test_exp_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_exp_half // NO_HALF: %elt.exp = call float @llvm.exp.f32( // NO_HALF: ret float %elt.exp half test_exp_half(half p0) { return exp(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_exp_half2 // NATIVE_HALF: %elt.exp = call <2 x half> @llvm.exp.v2f16 // NATIVE_HALF: ret <2 x half> %elt.exp -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_exp_half2 // NO_HALF: %elt.exp = call <2 x float> @llvm.exp.v2f32( // NO_HALF: ret <2 x float> %elt.exp half2 test_exp_half2(half2 p0) { return exp(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_exp_half3 // NATIVE_HALF: %elt.exp = call <3 x half> @llvm.exp.v3f16 // NATIVE_HALF: ret <3 x half> %elt.exp -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_exp_half3 // NO_HALF: %elt.exp = call <3 x float> @llvm.exp.v3f32( // NO_HALF: ret <3 x float> %elt.exp half3 test_exp_half3(half3 p0) { return exp(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_exp_half4 // NATIVE_HALF: %elt.exp = call <4 x half> @llvm.exp.v4f16 // NATIVE_HALF: ret <4 x half> %elt.exp -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_exp_half4 // NO_HALF: %elt.exp = call <4 x float> @llvm.exp.v4f32( // 
NO_HALF: ret <4 x float> %elt.exp half4 test_exp_half4(half4 p0) { return exp(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z14test_exp_float // CHECK: %elt.exp = call float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_float(float p0) { return exp(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z15test_exp_float2 // CHECK: %elt.exp = call <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_float2(float2 p0) { return exp(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_exp_float3 // CHECK: %elt.exp = call <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_float3(float3 p0) { return exp(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_exp_float4 // CHECK: %elt.exp = call <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_float4(float4 p0) { return exp(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl index f21cdd95774ab6..7bfc897beee16d 100644 --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header 
-triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_exp2_half // NATIVE_HALF: %elt.exp2 = call half @llvm.exp2.f16( // NATIVE_HALF: ret half %elt.exp2 -// NO_HALF: define noundef float @"?test_exp2_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z14test_exp2_half // NO_HALF: %elt.exp2 = call float @llvm.exp2.f32( // NO_HALF: ret float %elt.exp2 half test_exp2_half(half p0) { return exp2(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_exp2_half2 // NATIVE_HALF: %elt.exp2 = call <2 x half> @llvm.exp2.v2f16 // NATIVE_HALF: ret <2 x half> %elt.exp2 -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_exp2_half2 // NO_HALF: %elt.exp2 = call <2 x float> @llvm.exp2.v2f32( // NO_HALF: ret <2 x float> %elt.exp2 half2 test_exp2_half2(half2 p0) { return exp2(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_exp2_half3 // NATIVE_HALF: %elt.exp2 = call <3 x half> @llvm.exp2.v3f16 // NATIVE_HALF: ret <3 x half> %elt.exp2 -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_exp2_half3 // NO_HALF: %elt.exp2 = call <3 x float> @llvm.exp2.v3f32( // NO_HALF: ret <3 x float> %elt.exp2 half3 test_exp2_half3(half3 p0) { return exp2(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_exp2_half4 // NATIVE_HALF: %elt.exp2 = call <4 x half> @llvm.exp2.v4f16 // NATIVE_HALF: ret <4 x half> %elt.exp2 -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_exp2_half4 // NO_HALF: %elt.exp2 = call <4 x float> @llvm.exp2.v4f32( // NO_HALF: ret <4 x float> %elt.exp2 half4 
test_exp2_half4(half4 p0) { return exp2(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z15test_exp2_float // CHECK: %elt.exp2 = call float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_float(float p0) { return exp2(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z16test_exp2_float2 // CHECK: %elt.exp2 = call <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_float2(float2 p0) { return exp2(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z16test_exp2_float3 // CHECK: %elt.exp2 = call <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_float3(float3 p0) { return exp2(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z16test_exp2_float4 // CHECK: %elt.exp2 = call <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_float4(float4 p0) { return exp2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl index 48ddf713bcf504..c2d6f1bcc335c9 100644 --- a/clang/test/CodeGenHLSL/builtins/floor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl @@ -1,43 +1,42 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF using hlsl::floor; -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_floor_half // NATIVE_HALF: call half @llvm.floor.f16( -// NO_HALF: define noundef float @"?test_floor_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z15test_floor_half // NO_HALF: call float @llvm.floor.f32(float %0) half test_floor_half(half p0) { return floor(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_floor_half2 // NATIVE_HALF: call <2 x half> @llvm.floor.v2f16( -// NO_HALF: define noundef <2 x float> @"?test_floor_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_floor_half2 // NO_HALF: call <2 x float> @llvm.floor.v2f32( half2 test_floor_half2(half2 p0) { return floor(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_floor_half3 // NATIVE_HALF: call <3 x half> @llvm.floor.v3f16( -// NO_HALF: define noundef <3 x float> @"?test_floor_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_floor_half3 // NO_HALF: call <3 x float> @llvm.floor.v3f32( half3 test_floor_half3(half3 p0) { return floor(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_floor_half4 // NATIVE_HALF: call <4 x half> @llvm.floor.v4f16( -// NO_HALF: define noundef <4 x float> @"?test_floor_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_floor_half4 // NO_HALF: call <4 x float> @llvm.floor.v4f32( half4 test_floor_half4(half4 p0) { return floor(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z16test_floor_float // 
CHECK: call float @llvm.floor.f32( float test_floor_float(float p0) { return floor(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z17test_floor_float2 // CHECK: call <2 x float> @llvm.floor.v2f32( float2 test_floor_float2(float2 p0) { return floor(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z17test_floor_float3 // CHECK: call <3 x float> @llvm.floor.v3f32( float3 test_floor_float3(float3 p0) { return floor(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z17test_floor_float4 // CHECK: call <4 x float> @llvm.floor.v4f32( float4 test_floor_float4(float4 p0) { return floor(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl index ce973309034781..e735a85b589f87 100644 --- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl +++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl @@ -2,8 +2,8 @@ void foo(__hlsl_resource_t res); -// CHECK: define void @"?bar@@YAXU__hlsl_resource_t@@@Z"(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM:[a-zA-Z0-9]+]]) -// CHECK: call void @"?foo@@YAXU__hlsl_resource_t@@@Z"(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM]]) +// CHECK: define void @_Z3baru17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM:[a-zA-Z0-9]+]]) +// CHECK: call void @_Z3foou17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM]]) void bar(__hlsl_resource_t a) { foo(a); } diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl index c89eda683403b4..71ce502eb8c4a8 100644 --- a/clang/test/CodeGenHLSL/builtins/log.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm 
-disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_log_half // NATIVE_HALF: call half @llvm.log.f16( -// NO_HALF: define noundef float @"?test_log_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_log_half // NO_HALF: call float @llvm.log.f32( half test_log_half(half p0) { return log(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_log_half2 // NATIVE_HALF: call <2 x half> @llvm.log.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_log_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_log_half2 // NO_HALF: call <2 x float> @llvm.log.v2f32( half2 test_log_half2(half2 p0) { return log(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_log_half3 // NATIVE_HALF: call <3 x half> @llvm.log.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_log_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_log_half3 // NO_HALF: call <3 x float> @llvm.log.v3f32( half3 test_log_half3(half3 p0) { return log(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_log_half4 // NATIVE_HALF: call <4 x half> @llvm.log.v4f16 -// 
NO_HALF: define noundef <4 x float> @"?test_log_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_log_half4 // NO_HALF: call <4 x float> @llvm.log.v4f32( half4 test_log_half4(half4 p0) { return log(p0); } -// CHECK: define noundef float @"?test_log_float +// CHECK-LABEL: define noundef float @_Z14test_log_float // CHECK: call float @llvm.log.f32( float test_log_float(float p0) { return log(p0); } -// CHECK: define noundef <2 x float> @"?test_log_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_log_float2 // CHECK: call <2 x float> @llvm.log.v2f32 float2 test_log_float2(float2 p0) { return log(p0); } -// CHECK: define noundef <3 x float> @"?test_log_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_log_float3 // CHECK: call <3 x float> @llvm.log.v3f32 float3 test_log_float3(float3 p0) { return log(p0); } -// CHECK: define noundef <4 x float> @"?test_log_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_log_float4 // CHECK: call <4 x float> @llvm.log.v4f32 float4 test_log_float4(float4 p0) { return log(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl index 638b86e8d5eaf7..e15b6f5747b0a8 100644 --- a/clang/test/CodeGenHLSL/builtins/log10.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_log10_half // NATIVE_HALF: call half @llvm.log10.f16( -// NO_HALF: define noundef float @"?test_log10_half +// NO_HALF-LABEL: define noundef float @_Z15test_log10_half // NO_HALF: call float @llvm.log10.f32( half test_log10_half(half p0) { return log10(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_log10_half2 // NATIVE_HALF: call <2 x half> @llvm.log10.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_log10_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_log10_half2 // NO_HALF: call <2 x float> @llvm.log10.v2f32( half2 test_log10_half2(half2 p0) { return log10(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_log10_half3 // NATIVE_HALF: call <3 x half> @llvm.log10.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_log10_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_log10_half3 // NO_HALF: call <3 x float> @llvm.log10.v3f32( half3 test_log10_half3(half3 p0) { return log10(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_log10_half4 // NATIVE_HALF: call <4 x half> @llvm.log10.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_log10_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_log10_half4 // NO_HALF: call <4 x float> @llvm.log10.v4f32( half4 test_log10_half4(half4 p0) { return log10(p0); } -// CHECK: define noundef float @"?test_log10_float +// CHECK-LABEL: define noundef float @_Z16test_log10_float // CHECK: call float @llvm.log10.f32( float test_log10_float(float p0) { return log10(p0); } -// CHECK: define 
noundef <2 x float> @"?test_log10_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z17test_log10_float2 // CHECK: call <2 x float> @llvm.log10.v2f32 float2 test_log10_float2(float2 p0) { return log10(p0); } -// CHECK: define noundef <3 x float> @"?test_log10_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z17test_log10_float3 // CHECK: call <3 x float> @llvm.log10.v3f32 float3 test_log10_float3(float3 p0) { return log10(p0); } -// CHECK: define noundef <4 x float> @"?test_log10_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z17test_log10_float4 // CHECK: call <4 x float> @llvm.log10.v4f32 float4 test_log10_float4(float4 p0) { return log10(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl index 31c7bff214c61f..575761a5f637c0 100644 --- a/clang/test/CodeGenHLSL/builtins/log2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_log2_half // NATIVE_HALF: call half @llvm.log2.f16( -// NO_HALF: define noundef float @"?test_log2_half +// 
NO_HALF-LABEL: define noundef float @_Z14test_log2_half // NO_HALF: call float @llvm.log2.f32( half test_log2_half(half p0) { return log2(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_log2_half2 // NATIVE_HALF: call <2 x half> @llvm.log2.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_log2_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_log2_half2 // NO_HALF: call <2 x float> @llvm.log2.v2f32( half2 test_log2_half2(half2 p0) { return log2(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_log2_half3 // NATIVE_HALF: call <3 x half> @llvm.log2.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_log2_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_log2_half3 // NO_HALF: call <3 x float> @llvm.log2.v3f32( half3 test_log2_half3(half3 p0) { return log2(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_log2_half4 // NATIVE_HALF: call <4 x half> @llvm.log2.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_log2_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_log2_half4 // NO_HALF: call <4 x float> @llvm.log2.v4f32( half4 test_log2_half4(half4 p0) { return log2(p0); } -// CHECK: define noundef float @"?test_log2_float +// CHECK-LABEL: define noundef float @_Z15test_log2_float // CHECK: call float @llvm.log2.f32( float test_log2_float(float p0) { return log2(p0); } -// CHECK: define noundef <2 x float> @"?test_log2_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z16test_log2_float2 // CHECK: call <2 x float> @llvm.log2.v2f32 float2 test_log2_float2(float2 p0) { return log2(p0); } -// CHECK: define noundef <3 x float> @"?test_log2_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z16test_log2_float3 // CHECK: call <3 x float> @llvm.log2.v3f32 float3 test_log2_float3(float3 p0) { return log2(p0); } -// CHECK: define noundef <4 x 
float> @"?test_log2_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z16test_log2_float4 // CHECK: call <4 x float> @llvm.log2.v4f32 float4 test_log2_float4(float4 p0) { return log2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl index f17062f7bb0115..d462fda2ccb09f 100644 --- a/clang/test/CodeGenHLSL/builtins/max.hlsl +++ b/clang/test/CodeGenHLSL/builtins/max.hlsl @@ -1,134 +1,133 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_max_short // NATIVE_HALF: call i16 @llvm.smax.i16( int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_max_short2 // NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16( int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_max_short3 // NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16 int16_t3 test_max_short3(int16_t3 p0, 
int16_t3 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_max_short4 // NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16 int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_max_ushort // NATIVE_HALF: call i16 @llvm.umax.i16( uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_max_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16 uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_max_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16 uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_max_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16 uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); } #endif -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z12test_max_int // CHECK: call i32 @llvm.smax.i32( int test_max_int(int p0, int p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z13test_max_int2 // CHECK: call <2 x i32> @llvm.smax.v2i32 int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z13test_max_int3 // CHECK: call <3 x i32> @llvm.smax.v3i32 int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z13test_max_int4 // CHECK: call <4 x i32> @llvm.smax.v4i32 int4 test_max_int4(int4 p0, int4 p1) { return max(p0, 
p1); } -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z13test_max_uint // CHECK: call i32 @llvm.umax.i32( int test_max_uint(uint p0, uint p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z14test_max_uint2 // CHECK: call <2 x i32> @llvm.umax.v2i32 uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z14test_max_uint3 // CHECK: call <3 x i32> @llvm.umax.v3i32 uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z14test_max_uint4 // CHECK: call <4 x i32> @llvm.umax.v4i32 uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z13test_max_long // CHECK: call i64 @llvm.smax.i64( int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z14test_max_long2 // CHECK: call <2 x i64> @llvm.smax.v2i64 int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z14test_max_long3 // CHECK: call <3 x i64> @llvm.smax.v3i64 int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z14test_max_long4 // CHECK: call <4 x i64> @llvm.smax.v4i64 int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z14test_max_ulong // CHECK: call i64 @llvm.umax.i64( -uint64_t test_max_long(uint64_t p0, uint64_t p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i64> @ +uint64_t test_max_ulong(uint64_t p0, uint64_t p1) { return max(p0, p1); } +// CHECK-LABEL: define noundef <2 x 
i64> @_Z15test_max_ulong2 // CHECK: call <2 x i64> @llvm.umax.v2i64 -uint64_t2 test_max_long2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i64> @ +uint64_t2 test_max_ulong2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); } +// CHECK-LABEL: define noundef <3 x i64> @_Z15test_max_ulong3 // CHECK: call <3 x i64> @llvm.umax.v3i64 -uint64_t3 test_max_long3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i64> @ +uint64_t3 test_max_ulong3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); } +// CHECK-LABEL: define noundef <4 x i64> @_Z15test_max_ulong4 // CHECK: call <4 x i64> @llvm.umax.v4i64 -uint64_t4 test_max_long4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); } +uint64_t4 test_max_ulong4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_max_half // NATIVE_HALF: call half @llvm.maxnum.f16( -// NO_HALF: define noundef float @"?test_max_half +// NO_HALF-LABEL: define noundef float @_Z13test_max_half // NO_HALF: call float @llvm.maxnum.f32( half test_max_half(half p0, half p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_max_half2 // NATIVE_HALF: call <2 x half> @llvm.maxnum.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_max_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_max_half2 // NO_HALF: call <2 x float> @llvm.maxnum.v2f32( half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_max_half3 // NATIVE_HALF: call <3 x half> @llvm.maxnum.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_max_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_max_half3 // NO_HALF: call <3 x float> @llvm.maxnum.v3f32( half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); } -// 
NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_max_half4 // NATIVE_HALF: call <4 x half> @llvm.maxnum.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_max_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_max_half4 // NO_HALF: call <4 x float> @llvm.maxnum.v4f32( half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); } -// CHECK: define noundef float @"?test_max_float +// CHECK-LABEL: define noundef float @_Z14test_max_float // CHECK: call float @llvm.maxnum.f32( float test_max_float(float p0, float p1) { return max(p0, p1); } -// CHECK: define noundef <2 x float> @"?test_max_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_max_float2 // CHECK: call <2 x float> @llvm.maxnum.v2f32 float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x float> @"?test_max_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_max_float3 // CHECK: call <3 x float> @llvm.maxnum.v3f32 float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x float> @"?test_max_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_max_float4 // CHECK: call <4 x float> @llvm.maxnum.v4f32 float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); } -// CHECK: define noundef double @ +// CHECK-LABEL: define noundef double @_Z15test_max_double // CHECK: call double @llvm.maxnum.f64( double test_max_double(double p0, double p1) { return max(p0, p1); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z16test_max_double2 // CHECK: call <2 x double> @llvm.maxnum.v2f64 double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z16test_max_double3 // CHECK: call <3 x double> @llvm.maxnum.v3f64 double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); } -// CHECK: define 
noundef <4 x double> @ +// CHECK-LABEL: define noundef <4 x double> @_Z16test_max_double4 // CHECK: call <4 x double> @llvm.maxnum.v4f64 double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl index a0c233dac4d5fc..02d20d13f916de 100644 --- a/clang/test/CodeGenHLSL/builtins/min.hlsl +++ b/clang/test/CodeGenHLSL/builtins/min.hlsl @@ -1,134 +1,133 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_min_short // NATIVE_HALF: call i16 @llvm.smin.i16( int16_t test_min_short(int16_t p0, int16_t p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_min_short2 // NATIVE_HALF: call <2 x i16> @llvm.smin.v2i16( int16_t2 test_min_short2(int16_t2 p0, int16_t2 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_min_short3 // NATIVE_HALF: call <3 x i16> @llvm.smin.v3i16 int16_t3 
test_min_short3(int16_t3 p0, int16_t3 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_min_short4 // NATIVE_HALF: call <4 x i16> @llvm.smin.v4i16 int16_t4 test_min_short4(int16_t4 p0, int16_t4 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_min_ushort // NATIVE_HALF: call i16 @llvm.umin.i16( uint16_t test_min_ushort(uint16_t p0, uint16_t p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_min_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.umin.v2i16 uint16_t2 test_min_ushort2(uint16_t2 p0, uint16_t2 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_min_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.umin.v3i16 uint16_t3 test_min_ushort3(uint16_t3 p0, uint16_t3 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_min_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.umin.v4i16 uint16_t4 test_min_ushort4(uint16_t4 p0, uint16_t4 p1) { return min(p0, p1); } #endif -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z12test_min_int // CHECK: call i32 @llvm.smin.i32( int test_min_int(int p0, int p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z13test_min_int2 // CHECK: call <2 x i32> @llvm.smin.v2i32 int2 test_min_int2(int2 p0, int2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z13test_min_int3 // CHECK: call <3 x i32> @llvm.smin.v3i32 int3 test_min_int3(int3 p0, int3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z13test_min_int4 // CHECK: call <4 x i32> @llvm.smin.v4i32 int4 test_min_int4(int4 p0, 
int4 p1) { return min(p0, p1); } -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z13test_min_uint // CHECK: call i32 @llvm.umin.i32( int test_min_uint(uint p0, uint p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z14test_min_uint2 // CHECK: call <2 x i32> @llvm.umin.v2i32 uint2 test_min_uint2(uint2 p0, uint2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z14test_min_uint3 // CHECK: call <3 x i32> @llvm.umin.v3i32 uint3 test_min_uint3(uint3 p0, uint3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z14test_min_uint4 // CHECK: call <4 x i32> @llvm.umin.v4i32 uint4 test_min_uint4(uint4 p0, uint4 p1) { return min(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z13test_min_long // CHECK: call i64 @llvm.smin.i64( int64_t test_min_long(int64_t p0, int64_t p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z14test_min_long2 // CHECK: call <2 x i64> @llvm.smin.v2i64 int64_t2 test_min_long2(int64_t2 p0, int64_t2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z14test_min_long3 // CHECK: call <3 x i64> @llvm.smin.v3i64 int64_t3 test_min_long3(int64_t3 p0, int64_t3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z14test_min_long4 // CHECK: call <4 x i64> @llvm.smin.v4i64 int64_t4 test_min_long4(int64_t4 p0, int64_t4 p1) { return min(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z14test_min_ulong // CHECK: call i64 @llvm.umin.i64( -uint64_t test_min_long(uint64_t p0, uint64_t p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i64> @ +uint64_t test_min_ulong(uint64_t p0, uint64_t p1) { return min(p0, p1); } +// 
CHECK-LABEL: define noundef <2 x i64> @_Z15test_min_ulong2 // CHECK: call <2 x i64> @llvm.umin.v2i64 -uint64_t2 test_min_long2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i64> @ +uint64_t2 test_min_ulong2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); } +// CHECK-LABEL: define noundef <3 x i64> @_Z15test_min_ulong3 // CHECK: call <3 x i64> @llvm.umin.v3i64 -uint64_t3 test_min_long3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i64> @ +uint64_t3 test_min_ulong3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); } +// CHECK-LABEL: define noundef <4 x i64> @_Z15test_min_ulong4 // CHECK: call <4 x i64> @llvm.umin.v4i64 -uint64_t4 test_min_long4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); } +uint64_t4 test_min_ulong4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_min_half // NATIVE_HALF: call half @llvm.minnum.f16( -// NO_HALF: define noundef float @"?test_min_half +// NO_HALF-LABEL: define noundef float @_Z13test_min_half // NO_HALF: call float @llvm.minnum.f32( half test_min_half(half p0, half p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_min_half2 // NATIVE_HALF: call <2 x half> @llvm.minnum.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_min_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_min_half2 // NO_HALF: call <2 x float> @llvm.minnum.v2f32( half2 test_min_half2(half2 p0, half2 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_min_half3 // NATIVE_HALF: call <3 x half> @llvm.minnum.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_min_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_min_half3 // NO_HALF: call <3 x float> @llvm.minnum.v3f32( half3 test_min_half3(half3 p0, half3 p1) { 
return min(p0, p1); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_min_half4 // NATIVE_HALF: call <4 x half> @llvm.minnum.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_min_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_min_half4 // NO_HALF: call <4 x float> @llvm.minnum.v4f32( half4 test_min_half4(half4 p0, half4 p1) { return min(p0, p1); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z14test_min_float // CHECK: call float @llvm.minnum.f32( float test_min_float(float p0, float p1) { return min(p0, p1); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z15test_min_float2 // CHECK: call <2 x float> @llvm.minnum.v2f32 float2 test_min_float2(float2 p0, float2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_min_float3 // CHECK: call <3 x float> @llvm.minnum.v3f32 float3 test_min_float3(float3 p0, float3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_min_float4 // CHECK: call <4 x float> @llvm.minnum.v4f32 float4 test_min_float4(float4 p0, float4 p1) { return min(p0, p1); } -// CHECK: define noundef double @ +// CHECK-LABEL: define noundef double @_Z15test_min_double // CHECK: call double @llvm.minnum.f64( double test_min_double(double p0, double p1) { return min(p0, p1); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z16test_min_double2 // CHECK: call <2 x double> @llvm.minnum.v2f64 double2 test_min_double2(double2 p0, double2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z16test_min_double3 // CHECK: call <3 x double> @llvm.minnum.v3f64 double3 test_min_double3(double3 p0, double3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x double> @ +// CHECK-LABEL: 
define noundef <4 x double> @_Z16test_min_double4 // CHECK: call <4 x double> @llvm.minnum.v4f64 double4 test_min_double4(double4 p0, double4 p1) { return min(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl index 9a2264e740751c..4e184807633438 100644 --- a/clang/test/CodeGenHLSL/builtins/pow.hlsl +++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_pow_half // NATIVE_HALF: call half @llvm.pow.f16( -// NO_HALF: define noundef float @"?test_pow_half +// NO_HALF-LABEL: define noundef float @_Z13test_pow_half // NO_HALF: call float @llvm.pow.f32( half test_pow_half(half p0, half p1) { return pow(p0, p1); } -// NATIVE_HALF: define noundef <2 x half> @"?test_pow_half2 +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_pow_half2 // NATIVE_HALF: call <2 x half> @llvm.pow.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_pow_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_pow_half2 // NO_HALF: call <2 x float> @llvm.pow.v2f32( half2 
test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); } -// NATIVE_HALF: define noundef <3 x half> @"?test_pow_half3 +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_pow_half3 // NATIVE_HALF: call <3 x half> @llvm.pow.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_pow_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_pow_half3 // NO_HALF: call <3 x float> @llvm.pow.v3f32( half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); } -// NATIVE_HALF: define noundef <4 x half> @"?test_pow_half4 +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_pow_half4 // NATIVE_HALF: call <4 x half> @llvm.pow.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_pow_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_pow_half4 // NO_HALF: call <4 x float> @llvm.pow.v4f32( half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); } -// CHECK: define noundef float @"?test_pow_float +// CHECK-LABEL: define noundef float @_Z14test_pow_float // CHECK: call float @llvm.pow.f32( float test_pow_float(float p0, float p1) { return pow(p0, p1); } -// CHECK: define noundef <2 x float> @"?test_pow_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_pow_float2 // CHECK: call <2 x float> @llvm.pow.v2f32 float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); } -// CHECK: define noundef <3 x float> @"?test_pow_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_pow_float3 // CHECK: call <3 x float> @llvm.pow.v3f32 float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); } -// CHECK: define noundef <4 x float> @"?test_pow_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_pow_float4 // CHECK: call <4 x float> @llvm.pow.v4f32 float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl index 33d761dbdfbeae..6da63a394a8fdc 100644 --- a/clang/test/CodeGenHLSL/builtins/round.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/round.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_round_half // NATIVE_HALF: %elt.roundeven = call half @llvm.roundeven.f16( // NATIVE_HALF: ret half %elt.roundeven -// NO_HALF: define noundef float @"?test_round_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z15test_round_half // NO_HALF: %elt.roundeven = call float @llvm.roundeven.f32( // NO_HALF: ret float %elt.roundeven half test_round_half(half p0) { return round(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_round_half2 // NATIVE_HALF: %elt.roundeven = call <2 x half> @llvm.roundeven.v2f16 // NATIVE_HALF: ret <2 x half> %elt.roundeven -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_round_half2 // NO_HALF: %elt.roundeven = call <2 x float> @llvm.roundeven.v2f32( // NO_HALF: ret <2 x float> %elt.roundeven half2 test_round_half2(half2 p0) { return round(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x 
half> @_Z16test_round_half3 // NATIVE_HALF: %elt.roundeven = call <3 x half> @llvm.roundeven.v3f16 // NATIVE_HALF: ret <3 x half> %elt.roundeven -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_round_half3 // NO_HALF: %elt.roundeven = call <3 x float> @llvm.roundeven.v3f32( // NO_HALF: ret <3 x float> %elt.roundeven half3 test_round_half3(half3 p0) { return round(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_round_half4 // NATIVE_HALF: %elt.roundeven = call <4 x half> @llvm.roundeven.v4f16 // NATIVE_HALF: ret <4 x half> %elt.roundeven -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_round_half4 // NO_HALF: %elt.roundeven = call <4 x float> @llvm.roundeven.v4f32( // NO_HALF: ret <4 x float> %elt.roundeven half4 test_round_half4(half4 p0) { return round(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z16test_round_float // CHECK: %elt.roundeven = call float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_float(float p0) { return round(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z17test_round_float2 // CHECK: %elt.roundeven = call <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_float2(float2 p0) { return round(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z17test_round_float3 // CHECK: %elt.roundeven = call <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_float3(float3 p0) { return round(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z17test_round_float4 // CHECK: %elt.roundeven = call <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_float4(float4 p0) { return round(p0); } diff 
--git a/clang/test/CodeGenHLSL/builtins/saturate.hlsl b/clang/test/CodeGenHLSL/builtins/saturate.hlsl index 65a3cd74621cc0..c221f6e0f2c36f 100644 --- a/clang/test/CodeGenHLSL/builtins/saturate.hlsl +++ b/clang/test/CodeGenHLSL/builtins/saturate.hlsl @@ -1,95 +1,60 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=dx +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=dx -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=SPIRV,SPIRV_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-library %s \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=SPIRV,SPIRV_NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=spv +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: 
FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=spv -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: call half @llvm.dx.saturate.f16( -// NO_HALF: define noundef float @"?test_saturate_half -// NO_HALF: call float @llvm.dx.saturate.f32( -// SPIRV_HALF: define spir_func noundef half @_Z18test_saturate_halfDh(half -// SPIRV_HALF: call half @llvm.spv.saturate.f16(half -// SPIRV_NO_HALF: define spir_func noundef float @_Z18test_saturate_halfDh(float -// SPIRV_NO_HALF: call float @llvm.spv.saturate.f32(float +// NATIVE_HALF-LABEL: define{{.*}} half @_Z18test_saturate_halfDh +// NATIVE_HALF: call half @llvm.[[tar]].saturate.f16( +// NO_HALF-LABEL: define{{.*}} float @_Z18test_saturate_halfDh +// NO_HALF: call float @llvm.[[tar]].saturate.f32( half test_saturate_half(half p0) { return saturate(p0); } -// NATIVE_HALF: define noundef <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.dx.saturate.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_saturate_half2 -// NO_HALF: call <2 x float> @llvm.dx.saturate.v2f32( -// SPIRV_HALF: define spir_func noundef <2 x half> @_Z19test_saturate_half2Dv2_Dh( -// SPIRV_HALF: call <2 x half> @llvm.spv.saturate.v2f16(<2 x half> -// SPIRV_NO_HALF: define spir_func noundef <2 x float> @_Z19test_saturate_half2Dv2_Dh(<2 x float> -// SPIRV_NO_HALF: call <2 x float> @llvm.spv.saturate.v2f32(<2 x float> +// NATIVE_HALF-LABEL: define{{.*}} <2 x half> @_Z19test_saturate_half2Dv2_Dh +// NATIVE_HALF: call <2 x half> @llvm.[[tar]].saturate.v2f16 +// NO_HALF-LABEL: define{{.*}} <2 x float> @_Z19test_saturate_half2Dv2_Dh +// NO_HALF: call <2 x float> @llvm.[[tar]].saturate.v2f32( half2 test_saturate_half2(half2 p0) { return saturate(p0); } -// NATIVE_HALF: define noundef <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.dx.saturate.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_saturate_half3 -// NO_HALF: call <3 x float> @llvm.dx.saturate.v3f32( -// SPIRV_HALF: define spir_func noundef <3 x half> @_Z19test_saturate_half3Dv3_Dh( 
-// SPIRV_HALF: call <3 x half> @llvm.spv.saturate.v3f16(<3 x half> -// SPIRV_NO_HALF: define spir_func noundef <3 x float> @_Z19test_saturate_half3Dv3_Dh(<3 x float> -// SPIRV_NO_HALF: call <3 x float> @llvm.spv.saturate.v3f32(<3 x float> +// NATIVE_HALF-LABEL: define{{.*}} <3 x half> @_Z19test_saturate_half3Dv3_Dh( +// NATIVE_HALF: call <3 x half> @llvm.[[tar]].saturate.v3f16 +// NO_HALF-LABEL: define{{.*}} <3 x float> @_Z19test_saturate_half3Dv3_Dh(<3 x float> +// NO_HALF: call <3 x float> @llvm.[[tar]].saturate.v3f32( half3 test_saturate_half3(half3 p0) { return saturate(p0); } -// NATIVE_HALF: define noundef <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.dx.saturate.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_saturate_half4 -// NO_HALF: call <4 x float> @llvm.dx.saturate.v4f32( -// SPIRV_HALF: define spir_func noundef <4 x half> @_Z19test_saturate_half4Dv4_Dh( -// SPIRV_HALF: call <4 x half> @llvm.spv.saturate.v4f16(<4 x half> -// SPIRV_NO_HALF: define spir_func noundef <4 x float> @_Z19test_saturate_half4Dv4_Dh(<4 x float> -// SPIRV_NO_HALF: call <4 x float> @llvm.spv.saturate.v4f32(<4 x float> +// NATIVE_HALF-LABEL: define{{.*}} <4 x half> @_Z19test_saturate_half4Dv4_Dh( +// NATIVE_HALF: call <4 x half> @llvm.[[tar]].saturate.v4f16 +// NO_HALF-LABEL: define{{.*}} <4 x float> @_Z19test_saturate_half4Dv4_Dh(<4 x float> +// NO_HALF: call <4 x float> @llvm.[[tar]].saturate.v4f32( half4 test_saturate_half4(half4 p0) { return saturate(p0); } -// CHECK: define noundef float @"?test_saturate_float -// CHECK: call float @llvm.dx.saturate.f32( -// SPIRV: define spir_func noundef float @_Z19test_saturate_floatf(float -// SPIRV: call float @llvm.spv.saturate.f32(float +// CHECK-LABEL: define{{.*}} float @_Z19test_saturate_floatf( +// CHECK: call float @llvm.[[tar]].saturate.f32( float test_saturate_float(float p0) { return saturate(p0); } -// CHECK: define noundef <2 x float> @"?test_saturate_float2 -// CHECK: call <2 x float> @llvm.dx.saturate.v2f32 -// 
SPIRV: define spir_func noundef <2 x float> @_Z20test_saturate_float2Dv2_f(<2 x float> -// SPIRV: call <2 x float> @llvm.spv.saturate.v2f32(<2 x float> +// CHECK-LABEL: define{{.*}} <2 x float> @_Z20test_saturate_float2Dv2_f(<2 x float> +// CHECK: call <2 x float> @llvm.[[tar]].saturate.v2f32 float2 test_saturate_float2(float2 p0) { return saturate(p0); } -// CHECK: define noundef <3 x float> @"?test_saturate_float3 -// CHECK: call <3 x float> @llvm.dx.saturate.v3f32 -// SPIRV: define spir_func noundef <3 x float> @_Z20test_saturate_float3Dv3_f(<3 x float> -// SPIRV: call <3 x float> @llvm.spv.saturate.v3f32(<3 x float> +// CHECK-LABEL: define{{.*}} <3 x float> @_Z20test_saturate_float3Dv3_f(<3 x float> +// CHECK: call <3 x float> @llvm.[[tar]].saturate.v3f32 float3 test_saturate_float3(float3 p0) { return saturate(p0); } -// CHECK: define noundef <4 x float> @"?test_saturate_float4 -// CHECK: call <4 x float> @llvm.dx.saturate.v4f32 -// SPIRV: define spir_func noundef <4 x float> @_Z20test_saturate_float4Dv4_f(<4 x float> -// SPIRV: call <4 x float> @llvm.spv.saturate.v4f32(<4 x float> +// CHECK-LABEL: define{{.*}} <4 x float> @_Z20test_saturate_float4Dv4_f(<4 x float> +// CHECK: call <4 x float> @llvm.[[tar]].saturate.v4f32 float4 test_saturate_float4(float4 p0) { return saturate(p0); } -// CHECK: define noundef double @ -// CHECK: call double @llvm.dx.saturate.f64( -// SPIRV: define spir_func noundef double @_Z20test_saturate_doubled(double -// SPIRV: call double @llvm.spv.saturate.f64(double +// CHECK-LABEL: define{{.*}} double @_Z20test_saturate_doubled(double +// CHECK: call double @llvm.[[tar]].saturate.f64( double test_saturate_double(double p0) { return saturate(p0); } -// CHECK: define noundef <2 x double> @ -// CHECK: call <2 x double> @llvm.dx.saturate.v2f64 -// SPIRV: define spir_func noundef <2 x double> @_Z21test_saturate_double2Dv2_d(<2 x double> -// SPIRV: call <2 x double> @llvm.spv.saturate.v2f64(<2 x double> +// CHECK-LABEL: define{{.*}} <2 x 
double> @_Z21test_saturate_double2Dv2_d(<2 x double> +// CHECK: call <2 x double> @llvm.[[tar]].saturate.v2f64 double2 test_saturate_double2(double2 p0) { return saturate(p0); } -// CHECK: define noundef <3 x double> @ -// CHECK: call <3 x double> @llvm.dx.saturate.v3f64 -// SPIRV: define spir_func noundef <3 x double> @_Z21test_saturate_double3Dv3_d(<3 x double> -// SPIRV: call <3 x double> @llvm.spv.saturate.v3f64(<3 x double> +// CHECK-LABEL: define{{.*}} <3 x double> @_Z21test_saturate_double3Dv3_d(<3 x double> +// CHECK: call <3 x double> @llvm.[[tar]].saturate.v3f64 double3 test_saturate_double3(double3 p0) { return saturate(p0); } -// CHECK: define noundef <4 x double> @ -// CHECK: call <4 x double> @llvm.dx.saturate.v4f64 -// SPIRV: define spir_func noundef <4 x double> @_Z21test_saturate_double4Dv4_d(<4 x double> -// SPIRV: call <4 x double> @llvm.spv.saturate.v4f64(<4 x double> +// CHECK-LABEL: define{{.*}} <4 x double> @_Z21test_saturate_double4Dv4_d(<4 x double> +// CHECK: call <4 x double> @llvm.[[tar]].saturate.v4f64 double4 test_saturate_double4(double4 p0) { return saturate(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl index 83e8a5be39d069..9f7fa5043bdc7d 100644 --- a/clang/test/CodeGenHLSL/builtins/sin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: 
FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_sin_half // NATIVE_HALF: call half @llvm.sin.f16( -// NO_HALF: define noundef float @"?test_sin_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_sin_half // NO_HALF: call float @llvm.sin.f32( half test_sin_half(half p0) { return sin(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_sin_half2 // NATIVE_HALF: call <2 x half> @llvm.sin.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_sin_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_sin_half2 // NO_HALF: call <2 x float> @llvm.sin.v2f32( half2 test_sin_half2(half2 p0) { return sin(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_sin_half3 // NATIVE_HALF: call <3 x half> @llvm.sin.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_sin_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_sin_half3 // NO_HALF: call <3 x float> @llvm.sin.v3f32( half3 test_sin_half3(half3 p0) { return sin(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_sin_half4 // NATIVE_HALF: call <4 x half> @llvm.sin.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_sin_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_sin_half4 // NO_HALF: call <4 x float> @llvm.sin.v4f32( half4 test_sin_half4(half4 p0) { return sin(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z14test_sin_float // CHECK: call float @llvm.sin.f32( float test_sin_float(float p0) { return sin(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef 
<2 x float> @_Z15test_sin_float2 // CHECK: call <2 x float> @llvm.sin.v2f32 float2 test_sin_float2(float2 p0) { return sin(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_sin_float3 // CHECK: call <3 x float> @llvm.sin.v3f32 float3 test_sin_float3(float3 p0) { return sin(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_sin_float4 // CHECK: call <4 x float> @llvm.sin.v4f32 float4 test_sin_float4(float4 p0) { return sin(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl index adbbf69a8e0685..63454cea3fe6fb 100644 --- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_sqrt_half // NATIVE_HALF: %{{.*}} = call half @llvm.sqrt.f16( // NATIVE_HALF: ret half %{{.*}} -// NO_HALF: define noundef float @"?test_sqrt_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z14test_sqrt_half // NO_HALF: %{{.*}} = call float 
@llvm.sqrt.f32( // NO_HALF: ret float %{{.*}} half test_sqrt_half(half p0) { return sqrt(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_sqrt_half2 // NATIVE_HALF: %{{.*}} = call <2 x half> @llvm.sqrt.v2f16 // NATIVE_HALF: ret <2 x half> %{{.*}} -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_sqrt_half2 // NO_HALF: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32( // NO_HALF: ret <2 x float> %{{.*}} half2 test_sqrt_half2(half2 p0) { return sqrt(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_sqrt_half3 // NATIVE_HALF: %{{.*}} = call <3 x half> @llvm.sqrt.v3f16 // NATIVE_HALF: ret <3 x half> %{{.*}} -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_sqrt_half3 // NO_HALF: %{{.*}} = call <3 x float> @llvm.sqrt.v3f32( // NO_HALF: ret <3 x float> %{{.*}} half3 test_sqrt_half3(half3 p0) { return sqrt(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_sqrt_half4 // NATIVE_HALF: %{{.*}} = call <4 x half> @llvm.sqrt.v4f16 // NATIVE_HALF: ret <4 x half> %{{.*}} -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_sqrt_half4 // NO_HALF: %{{.*}} = call <4 x float> @llvm.sqrt.v4f32( // NO_HALF: ret <4 x float> %{{.*}} half4 test_sqrt_half4(half4 p0) { return sqrt(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z15test_sqrt_float // CHECK: %{{.*}} = call float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_float(float p0) { return sqrt(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z16test_sqrt_float2 // CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_float2(float2 p0) { return sqrt(p0); } -// CHECK: define noundef <3 
x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z16test_sqrt_float3 // CHECK: %{{.*}} = call <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_float3(float3 p0) { return sqrt(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z16test_sqrt_float4 // CHECK: %{{.*}} = call <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_float4(float4 p0) { return sqrt(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl index 40b71f45a9ccb2..3da12c88aa7fec 100644 --- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl +++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl @@ -1,47 +1,46 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// NATIVE_HALF: define noundef half @"?test_trunc_half +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF + +// NATIVE_HALF-LABEL: define noundef half @_Z15test_trunc_half // NATIVE_HALF: call half @llvm.trunc.f16( -// NO_HALF: define noundef float @"?test_trunc_half +// NO_HALF-LABEL: define noundef float @_Z15test_trunc_half // NO_HALF: call float @llvm.trunc.f32( half test_trunc_half(half p0) { return trunc(p0); } -// NATIVE_HALF: define 
noundef <2 x half> @"?test_trunc_half2 +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_trunc_half2 // NATIVE_HALF: call <2 x half> @llvm.trunc.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_trunc_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_trunc_half2 // NO_HALF: call <2 x float> @llvm.trunc.v2f32( half2 test_trunc_half2(half2 p0) { return trunc(p0); } -// NATIVE_HALF: define noundef <3 x half> @"?test_trunc_half3 +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_trunc_half3 // NATIVE_HALF: call <3 x half> @llvm.trunc.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_trunc_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_trunc_half3 // NO_HALF: call <3 x float> @llvm.trunc.v3f32( half3 test_trunc_half3(half3 p0) { return trunc(p0); } -// NATIVE_HALF: define noundef <4 x half> @"?test_trunc_half4 +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_trunc_half4 // NATIVE_HALF: call <4 x half> @llvm.trunc.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_trunc_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_trunc_half4 // NO_HALF: call <4 x float> @llvm.trunc.v4f32( half4 test_trunc_half4(half4 p0) { return trunc(p0); } -// CHECK: define noundef float @"?test_trunc_float +// CHECK-LABEL: define noundef float @_Z16test_trunc_float // CHECK: call float @llvm.trunc.f32( float test_trunc_float(float p0) { return trunc(p0); } -// CHECK: define noundef <2 x float> @"?test_trunc_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z17test_trunc_float2 // CHECK: call <2 x float> @llvm.trunc.v2f32 float2 test_trunc_float2(float2 p0) { return trunc(p0); } -// CHECK: define noundef <3 x float> @"?test_trunc_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z17test_trunc_float3 // CHECK: call <3 x float> @llvm.trunc.v3f32 float3 test_trunc_float3(float3 p0) { return trunc(p0); } -// CHECK: define noundef <4 x float> @"?test_trunc_float4 +// CHECK-LABEL: define noundef <4 x float> 
@_Z17test_trunc_float4 // CHECK: call <4 x float> @llvm.trunc.v4f32 float4 test_trunc_float4(float4 p0) { return trunc(p0); } diff --git a/clang/test/CodeGenHLSL/export.hlsl b/clang/test/CodeGenHLSL/export.hlsl index 63f9f9066f9277..770618ff2e0703 100644 --- a/clang/test/CodeGenHLSL/export.hlsl +++ b/clang/test/CodeGenHLSL/export.hlsl @@ -1,20 +1,19 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -// CHECK: define void @"?f1@@YAXXZ"() [[Attr:\#[0-9]+]] +// CHECK: define void @_Z2f1v() [[Attr:\#[0-9]+]] export void f1() { } -// CHECK: define void @"?f2@MyNamespace@@YAXXZ"() [[Attr]] +// CHECK: define void @_ZN11MyNamespace2f2Ev() [[Attr]] namespace MyNamespace { export void f2() { } } export { -// CHECK: define void @"?f3@@YAXXZ"() [[Attr]] -// CHECK: define void @"?f4@@YAXXZ"() [[Attr]] +// CHECK: define void @_Z2f3v() [[Attr]] +// CHECK: define void @_Z2f4v() [[Attr]] void f3() {} void f4() {} } diff --git a/clang/test/CodeGenHLSL/float3.hlsl b/clang/test/CodeGenHLSL/float3.hlsl index 63379349d9bd76..767720b049152d 100644 --- a/clang/test/CodeGenHLSL/float3.hlsl +++ b/clang/test/CodeGenHLSL/float3.hlsl @@ -3,7 +3,7 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure float3 is not changed into float4. 
-// CHECK:<3 x float> @"?foo@@YAT?$__vector@M$02@__clang@@T12@@Z"(<3 x float> noundef %[[PARAM:[0-9a-zA-Z]+]]) +// CHECK:<3 x float> @_Z3fooDv3_f(<3 x float> noundef %[[PARAM:[0-9a-zA-Z]+]]) // CHECK:%[[A_ADDR:.+]] = alloca <3 x float>, align 16 // CHECK-NEXT:store <3 x float> %[[PARAM]], ptr %[[A_ADDR]], align 16 // CHECK-NEXT:%[[V:[0-9]+]] = load <3 x float>, ptr %[[A_ADDR]], align 16 diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl index 48d14b2506fbc7..4b2e2beba4f12b 100644 --- a/clang/test/CodeGenHLSL/group_shared.hlsl +++ b/clang/test/CodeGenHLSL/group_shared.hlsl @@ -4,7 +4,7 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure groupshared translated into address space 3. -// CHECK:@"?a@@3PAMA" = addrspace(3) global [10 x float] +// CHECK:@a = addrspace(3) global [10 x float] groupshared float a[10]; diff --git a/clang/test/CodeGenHLSL/half.hlsl b/clang/test/CodeGenHLSL/half.hlsl index e83a6fc715b8ae..08df6f31fd12fd 100644 --- a/clang/test/CodeGenHLSL/half.hlsl +++ b/clang/test/CodeGenHLSL/half.hlsl @@ -8,12 +8,12 @@ // Make sure use float when not enable-16bit-types. -// FLOAT:define {{.*}}float @"?foo@@YA$halff@$halff@0@Z"(float{{[^,]+}}, float{{[^,)]+}}) +// FLOAT:define {{.*}}float @_Z3fooDhDh(float{{[^,]+}}, float{{[^,)]+}}) // FLOAT-NOT:half // FLOAT:ret float % // Make sure use half when enable-16bit-types. 
-// HALF:define {{.*}}half @"?foo@@YA$f16@$f16@0@Z"(half{{[^,]+}}, half{{[^,)]+}}) +// HALF:define {{.*}}half @_Z3fooDhDh(half{{[^,]+}}, half{{[^,)]+}}) // HALF-NOT:float // HALF:ret half % half foo(half a, half b) { diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl index f72fe059cb5763..5efecc1489afca 100644 --- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -12,7 +12,7 @@ struct Node { }; // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i32 @"?Find@@YAIY0GE@UNode@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] +// CHECK: define noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 4 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] // CHECK: ret i32 // Find and return value corresponding to key in the SortedTree uint Find(Node SortedTree[MAX], uint key) { @@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) { } // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i1 @"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] // CHECK: ret i1 // Initialize tree with given buffer // Imagine the inout works @@ -52,7 +52,7 @@ RWBuffer gTree; // Mangled entry points are internal // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define internal void @"?main@@YAXI@Z"(i32 noundef %GI) [[IntAttr]] +// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]] // CHECK: ret void // 
Canonical entry points are external and shader attributed @@ -71,7 +71,7 @@ void main(uint GI : SV_GroupIndex) { // Mangled entry points are internal // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define internal void @"?defaultMain@@YAXXZ"() [[IntAttr]] +// CHECK: define internal void @_Z11defaultMainv() [[IntAttr]] // CHECK: ret void // Canonical entry points are external and shader attributed diff --git a/clang/test/CodeGenHLSL/inline-constructors.hlsl b/clang/test/CodeGenHLSL/inline-constructors.hlsl index 995878a9c0f798..b0d5a783fb3725 100644 --- a/clang/test/CodeGenHLSL/inline-constructors.hlsl +++ b/clang/test/CodeGenHLSL/inline-constructors.hlsl @@ -49,7 +49,7 @@ void NionsDay(int hours) { // Verify constructor is emitted // NOINLINE-NEXT: call void @_GLOBAL__sub_I_inline_constructors.hlsl() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) // Verify inlining leaves only calls to "llvm." intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -64,7 +64,7 @@ void main(unsigned GI : SV_GroupIndex) { // CHECK-NEXT: entry: // Verify constructor is emitted // NOINLINE-NEXT: call void @_GLOBAL__sub_I_inline_constructors.hlsl() -// NOINLINE-NEXT: call void @"?rainyMain@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z9rainyMainv() // Verify inlining leaves only calls to "llvm." 
intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl index 7dd905e966e069..fa9c88db26dfc2 100644 --- a/clang/test/CodeGenHLSL/inline-functions.hlsl +++ b/clang/test/CodeGenHLSL/inline-functions.hlsl @@ -15,7 +15,7 @@ float nums[MAX]; // Verify that all functions have the alwaysinline attribute // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define void @"?swap@@YAXY0GE@III@Z"(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]] +// NOINLINE: define void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]] // NOINLINE: ret void // Swap the values of Buf at indices ix1 and ix2 void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) { @@ -25,7 +25,7 @@ void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) { } // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define void @"?BubbleSort@@YAXY0GE@II@Z"(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]] +// NOINLINE: define void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]] // NOINLINE: ret void // Inefficiently sort Buf in place void BubbleSort(unsigned Buf[MAX], unsigned size) { @@ -43,7 +43,7 @@ void BubbleSort(unsigned Buf[MAX], unsigned size) { // Note ExtAttr is the inlined export set of attribs // CHECK: Function Attrs: alwaysinline -// CHECK: define noundef i32 @"?RemoveDupes@@YAIY0GE@II@Z"(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]] // CHECK: ret i32 // Sort Buf and remove any duplicate values // returns the number of values left @@ -67,7 +67,7 @@ 
RWBuffer Indices; // because it has internal linkage from the start // Note main functions get the norecurse attrib, which IntAttr reflects // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define internal void @"?main@@YAXI@Z"(i32 noundef %GI) [[IntAttr]] +// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]] // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that @@ -94,7 +94,7 @@ void main(unsigned int GI : SV_GroupIndex) { // because it has internal linkage from the start // Note main functions get the norecurse attrib, which IntAttr reflects // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define internal void @"?main10@@YAXXZ"() [[IntAttr]] +// NOINLINE: define internal void @_Z6main10v() [[IntAttr]] // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that diff --git a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl index 7e7ebe930bd96e..ea358c411997d0 100644 --- a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl +++ b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl @@ -13,7 +13,7 @@ void main(unsigned GI : SV_GroupIndex) { //CHECK: define void @main() #[[#ENTRY_ATTR:]] { //CHECK-NEXT: entry: //CHECK-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -//CHECK-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +//CHECK-NEXT: call void @_Z4mainj(i32 %0) //CHECK-NEXT: ret void //CHECK-NEXT: } diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl index d046efaf9c1f9c..7b3890ae560d22 100644 --- a/clang/test/CodeGenHLSL/shift-mask.hlsl +++ b/clang/test/CodeGenHLSL/shift-mask.hlsl @@ -1,12 +1,11 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s int 
shl32(int V, int S) { return V << S; } -// CHECK: define noundef i32 @"?shl32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] @@ -14,7 +13,7 @@ int shr32(int V, int S) { return V >> S; } -// CHECK: define noundef i32 @"?shr32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = ashr i32 %{{.*}}, %[[Masked]] @@ -22,7 +21,7 @@ int64_t shl64(int64_t V, int64_t S) { return V << S; } -// CHECK: define noundef i64 @"?shl64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] @@ -30,6 +29,38 @@ int64_t shr64(int64_t V, int64_t S) { return V >> S; } -// CHECK: define noundef i64 @"?shr64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = ashr i64 %{{.*}}, %[[Masked]] + +uint shlu32(uint V, uint S) { + return V << S; +} + +// CHECK-LABEL: define noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] + +uint shru32(uint V, uint S) { + return V >> S; +} + +// CHECK-LABEL: define noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = lshr i32 %{{.*}}, %[[Masked]] + +uint64_t shlu64(uint64_t V, uint64_t S) { + return V << S; +} + +// CHECK-LABEL: define noundef i64 
@_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] + +uint64_t shru64(uint64_t V, uint64_t S) { + return V >> S; +} + +// CHECK-LABEL: define noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = lshr i64 %{{.*}}, %[[Masked]] diff --git a/clang/test/CodeGenHLSL/sret_output.hlsl b/clang/test/CodeGenHLSL/sret_output.hlsl index c44914f963a90f..c324790ba016df 100644 --- a/clang/test/CodeGenHLSL/sret_output.hlsl +++ b/clang/test/CodeGenHLSL/sret_output.hlsl @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // FIXME: add semantic to a. @@ -10,10 +9,10 @@ struct S { // Make sure sret parameter is generated. -// CHECK:define internal void @"?ps_main@@YA?AUS@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S) align 4 %agg.result) +// CHECK:define internal void @_Z7ps_mainv(ptr dead_on_unwind noalias writable sret(%struct.S) align 4 %agg.result) // FIXME: change it to real value instead of poison value once semantic is add to a. // Make sure the function with sret is called. 
-// CHECK:call void @"?ps_main@@YA?AUS@@XZ"(ptr poison) +// CHECK:call void @_Z7ps_mainv(ptr poison) [shader("pixel")] S ps_main() { S s; diff --git a/clang/test/CodeGenHLSL/static-local-ctor.hlsl b/clang/test/CodeGenHLSL/static-local-ctor.hlsl index f55f6808672dea..eba37e3f4c6b83 100644 --- a/clang/test/CodeGenHLSL/static-local-ctor.hlsl +++ b/clang/test/CodeGenHLSL/static-local-ctor.hlsl @@ -13,16 +13,16 @@ void InitBuf(RWBuffer buf) { } // CHECK-NOT: _Init_thread_epoch -// CHECK: define internal void @"?main@@YAXXZ" +// CHECK: define internal void @_Z4mainv // CHECK-NEXT: entry: // CHECK-NEXT: [[Tmp1:%.*]] = alloca %"class.hlsl::RWBuffer" -// CHECK-NEXT: [[Tmp2:%.*]] = load i32, ptr -// CHECK-NEXT: [[Tmp3:%.*]] = and i32 [[Tmp2]], 1 -// CHECK-NEXT: [[Tmp4:%.*]] = icmp eq i32 [[Tmp3]], 0 -// CHECK-NEXT: br i1 [[Tmp4]] +// CHECK-NEXT: [[Tmp2:%.*]] = load i8, ptr @_ZGVZ4mainvE5mybuf +// CHECK-NEXT: [[Tmp3:%.*]] = icmp eq i8 [[Tmp2]], 0 +// CHECK-NEXT: br i1 [[Tmp3]] // CHECK-NOT: _Init_thread_header -// CHECK: init: -// CHECK-NEXT: = or i32 [[Tmp2]], 1 +// CHECK: init.check: +// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev +// CHECK-NEXT: store i8 1, ptr @_ZGVZ4mainvE5mybuf // CHECK-NOT: _Init_thread_footer diff --git a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl index eabd0faff6a87e..f85bab2113170b 100644 --- a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl +++ b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl @@ -1,15 +1,14 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-DAG: @[[CB:.+]] = external constant { float } cbuffer A { float a; - // CHECK-DAG:@b = internal global float 3.000000e+00, align 4 + // CHECK-DAG:@_ZL1b = internal global 
float 3.000000e+00, align 4 static float b = 3; // CHECK:load float, ptr @[[CB]], align 4 - // CHECK:load float, ptr @b, align 4 + // CHECK:load float, ptr @_ZL1b, align 4 float foo() { return a + b; } } diff --git a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl index f0affcb69a3fcd..5a3bdc3d4d38ee 100644 --- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl @@ -25,7 +25,7 @@ void main() { } // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators. -// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { +// CHECK: define linkonce_odr noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%Another = alloca %struct.Pair, align 4 @@ -37,19 +37,19 @@ void main() { // CHECK-NEXT:%Second = getelementptr inbounds nuw %struct.Pair, ptr %Another, i32 0, i32 1 // CHECK-NEXT:store i32 10, ptr %Second, align 4 // CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %agg.tmp, ptr align 4 %Another, i32 8, i1 false) -// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) +// CHECK-NEXT:call void @_ZN4PairaSES_(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) // CHECK-NEXT:%First2 = getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 0 // CHECK-NEXT:%0 = load i32, ptr %First2, align 4 // CHECK-NEXT:ret i32 %0 -// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { +// CHECK: define linkonce_odr noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 
4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4 // CHECK-NEXT:store ptr %this, ptr %this.addr, align 4 // CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4 // CHECK-NEXT:call void @llvm.memset.p0.i32(ptr align 4 %agg.tmp, i8 0, i32 8, i1 false) -// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) +// CHECK-NEXT:call void @_ZN4PairaSES_(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) // CHECK-NEXT:%Second = getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 1 // CHECK-NEXT:%0 = load i32, ptr %Second, align 4 // CHECK-NEXT:ret i32 %0 diff --git a/clang/test/CodeGenHLSL/this-assignment.hlsl b/clang/test/CodeGenHLSL/this-assignment.hlsl index 7408d199910e5c..72bd2f8e70af8f 100644 --- a/clang/test/CodeGenHLSL/this-assignment.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment.hlsl @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s struct Pair { int First; @@ -40,7 +39,7 @@ void main() { // CHECK-NEXT:%Another = alloca %struct.Pair, align 4 // CHECK-NEXT:store ptr %this, ptr %this.addr, align 4 // CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %Another, ptr align 4 @"__const.?getFirst@Pair@@QAAHXZ.Another", i32 8, i1 false) +// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %Another, ptr align 4 @__const._ZN4Pair8getFirstEv.Another, i32 8, i1 false) // CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %Another, i32 8, i1 false) // CHECK-NEXT:%First = 
getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 0 @@ -56,9 +55,7 @@ void main() { // CHECK-LABEL: define {{.*}}DoSilly // CHECK-NEXT:entry: -// CHECK-NEXT: [[ResPtr:%.*]] = alloca ptr // CHECK-NEXT: [[ThisPtrAddr:%.*]] = alloca ptr -// CHECK-NEXT: store ptr [[AggRes:%.*]], ptr [[ResPtr]] // CHECK-NEXT: store ptr {{.*}}, ptr [[ThisPtrAddr]] // CHECK-NEXT: [[ThisPtr:%.*]] = load ptr, ptr [[ThisPtrAddr]] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ThisPtr]], ptr align 4 [[Obj:%.*]], i32 8, i1 false) @@ -66,4 +63,4 @@ void main() { // CHECK-NEXT: [[First:%.*]] = load i32, ptr [[FirstAddr]] // CHECK-NEXT: [[FirstPlusTwo:%.*]] = add nsw i32 [[First]], 2 // CHECK-NEXT: store i32 [[FirstPlusTwo]], ptr [[FirstAddr]] -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AggRes]], ptr align 4 [[Obj]], i32 8, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 {{.*}}, ptr align 4 [[Obj]], i32 8, i1 false) diff --git a/clang/test/CodeGenHLSL/this-reference.hlsl b/clang/test/CodeGenHLSL/this-reference.hlsl index 032ee34ec65d3b..66b79d42500122 100644 --- a/clang/test/CodeGenHLSL/this-reference.hlsl +++ b/clang/test/CodeGenHLSL/this-reference.hlsl @@ -21,10 +21,10 @@ void main() { } // This tests reference like `this` in HLSL - // CHECK: %call = call noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals) + // CHECK: %call = call noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 4 dereferenceable(8) %Vals) // CHECK-NEXT: %First = getelementptr inbounds nuw %struct.Pair, ptr %Vals, i32 0, i32 0 // CHECK-NEXT: store i32 %call, ptr %First, align 4 - // CHECK-NEXT: %call1 = call noundef float @"?getSecond@Pair@@QAAMXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals) + // CHECK-NEXT: %call1 = call noundef float @_ZN4Pair9getSecondEv(ptr noundef nonnull align 4 dereferenceable(8) %Vals) // CHECK-NEXT: %Second = getelementptr inbounds nuw %struct.Pair, ptr %Vals, i32 0, i32 1 
// CHECK: [[Pair:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Pair" From 747d8f3fc93d912183059142631a343fb20bd07f Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 12:01:56 -0700 Subject: [PATCH 063/177] [SandboxVec][DAG] Implement PredIterator (#111604) This patch implements an iterator for iterating over both use-def and mem dependencies of MemDGNodes. --- .../SandboxVectorizer/DependencyGraph.h | 73 +++++++++++++++++++ .../SandboxVectorizer/DependencyGraph.cpp | 44 +++++++++++ .../SandboxVectorizer/DependencyGraphTest.cpp | 41 +++++++++++ 3 files changed, 158 insertions(+) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index 134adc4b21ab12..eba6d7562e41de 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -40,6 +40,54 @@ enum class DGNodeID { MemDGNode, }; +class DGNode; +class MemDGNode; +class DependencyGraph; + +/// While OpIt points to a Value that is not an Instruction keep incrementing +/// it. \Returns the first iterator that points to an Instruction, or end. +[[nodiscard]] static User::op_iterator skipNonInstr(User::op_iterator OpIt, + User::op_iterator OpItE) { + while (OpIt != OpItE && !isa((*OpIt).get())) + ++OpIt; + return OpIt; +} + +/// Iterate over both def-use and mem dependencies. 
+class PredIterator { + User::op_iterator OpIt; + User::op_iterator OpItE; + DenseSet::iterator MemIt; + DGNode *N = nullptr; + DependencyGraph *DAG = nullptr; + + PredIterator(const User::op_iterator &OpIt, const User::op_iterator &OpItE, + const DenseSet::iterator &MemIt, DGNode *N, + DependencyGraph &DAG) + : OpIt(OpIt), OpItE(OpItE), MemIt(MemIt), N(N), DAG(&DAG) {} + PredIterator(const User::op_iterator &OpIt, const User::op_iterator &OpItE, + DGNode *N, DependencyGraph &DAG) + : OpIt(OpIt), OpItE(OpItE), N(N), DAG(&DAG) {} + friend class DGNode; // For constructor + friend class MemDGNode; // For constructor + +public: + using difference_type = std::ptrdiff_t; + using value_type = DGNode *; + using pointer = value_type *; + using reference = value_type &; + using iterator_category = std::input_iterator_tag; + value_type operator*(); + PredIterator &operator++(); + PredIterator operator++(int) { + auto Copy = *this; + ++(*this); + return Copy; + } + bool operator==(const PredIterator &Other) const; + bool operator!=(const PredIterator &Other) const { return !(*this == Other); } +}; + /// A DependencyGraph Node that points to an Instruction and contains memory /// dependency edges. class DGNode { @@ -63,6 +111,23 @@ class DGNode { virtual ~DGNode() = default; /// \Returns true if this is before \p Other in program order. 
bool comesBefore(const DGNode *Other) { return I->comesBefore(Other->I); } + using iterator = PredIterator; + virtual iterator preds_begin(DependencyGraph &DAG) { + return PredIterator(skipNonInstr(I->op_begin(), I->op_end()), I->op_end(), + this, DAG); + } + virtual iterator preds_end(DependencyGraph &DAG) { + return PredIterator(I->op_end(), I->op_end(), this, DAG); + } + iterator preds_begin(DependencyGraph &DAG) const { + return const_cast(this)->preds_begin(DAG); + } + iterator preds_end(DependencyGraph &DAG) const { + return const_cast(this)->preds_end(DAG); + } + iterator_range preds(DependencyGraph &DAG) const { + return make_range(preds_begin(DAG), preds_end(DAG)); + } static bool isStackSaveOrRestoreIntrinsic(Instruction *I) { if (auto *II = dyn_cast(I)) { @@ -145,6 +210,14 @@ class MemDGNode final : public DGNode { static bool classof(const DGNode *Other) { return Other->SubclassID == DGNodeID::MemDGNode; } + iterator preds_begin(DependencyGraph &DAG) override { + auto OpEndIt = I->op_end(); + return PredIterator(skipNonInstr(I->op_begin(), OpEndIt), OpEndIt, + MemPreds.begin(), this, DAG); + } + iterator preds_end(DependencyGraph &DAG) override { + return PredIterator(I->op_end(), I->op_end(), MemPreds.end(), this, DAG); + } /// \Returns the previous Mem DGNode in instruction order. MemDGNode *getPrevNode() const { return PrevMemN; } /// \Returns the next Mem DGNode in instruction order. 
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 82f253d4c63231..7aea466ed6d8db 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -8,10 +8,54 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/SandboxIR/Utils.h" namespace llvm::sandboxir { +PredIterator::value_type PredIterator::operator*() { + // If it's a DGNode then we dereference the operand iterator. + if (!isa(N)) { + assert(OpIt != OpItE && "Can't dereference end iterator!"); + return DAG->getNode(cast((Value *)*OpIt)); + } + // It's a MemDGNode, so we check if we return either the use-def operand, + // or a mem predecessor. + if (OpIt != OpItE) + return DAG->getNode(cast((Value *)*OpIt)); + assert(MemIt != cast(N)->memPreds().end() && + "Cant' dereference end iterator!"); + return *MemIt; +} + +PredIterator &PredIterator::operator++() { + // If it's a DGNode then we increment the use-def iterator. + if (!isa(N)) { + assert(OpIt != OpItE && "Already at end!"); + ++OpIt; + // Skip operands that are not instructions. + OpIt = skipNonInstr(OpIt, OpItE); + return *this; + } + // It's a MemDGNode, so if we are not at the end of the use-def iterator we + // need to first increment that. + if (OpIt != OpItE) { + ++OpIt; + // Skip operands that are not instructions. 
+ OpIt = skipNonInstr(OpIt, OpItE); + return *this; + } + assert(MemIt != cast(N)->memPreds().end() && "Already at end!"); + ++MemIt; + return *this; +} + +bool PredIterator::operator==(const PredIterator &Other) const { + assert(DAG == Other.DAG && "Iterators of different DAGs!"); + assert(N == Other.N && "Iterators of different nodes!"); + return OpIt == Other.OpIt && MemIt == Other.MemIt; +} + #ifndef NDEBUG void DGNode::print(raw_ostream &OS, bool PrintDeps) const { I->dumpOS(OS); diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index e2f16919a5cddd..6b3d9cc77c9955 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -240,12 +240,53 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { EXPECT_TRUE(N1->hasMemPred(N0)); EXPECT_FALSE(N0->hasMemPred(N1)); + // Check preds(). + EXPECT_TRUE(N0->preds(DAG).empty()); + EXPECT_THAT(N1->preds(DAG), testing::ElementsAre(N0)); + // Check memPreds(). 
EXPECT_TRUE(N0->memPreds().empty()); EXPECT_THAT(N1->memPreds(), testing::ElementsAre(N0)); EXPECT_TRUE(N2->memPreds().empty()); } +TEST_F(DependencyGraphTest, Preds) { + parseIR(C, R"IR( +declare ptr @bar(i8) +define i8 @foo(i8 %v0, i8 %v1) { + %add0 = add i8 %v0, %v0 + %add1 = add i8 %v1, %v1 + %add2 = add i8 %add0, %add1 + %ptr = call ptr @bar(i8 %add1) + store i8 %add2, ptr %ptr + ret i8 %add2 +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::DependencyGraph DAG(getAA(*LLVMF)); + DAG.extend({&*BB->begin(), BB->getTerminator()}); + + auto *AddN0 = DAG.getNode(cast(&*It++)); + auto *AddN1 = DAG.getNode(cast(&*It++)); + auto *AddN2 = DAG.getNode(cast(&*It++)); + auto *CallN = DAG.getNode(cast(&*It++)); + auto *StN = DAG.getNode(cast(&*It++)); + auto *RetN = DAG.getNode(cast(&*It++)); + + // Check preds(). + EXPECT_THAT(AddN0->preds(DAG), testing::ElementsAre()); + EXPECT_THAT(AddN1->preds(DAG), testing::ElementsAre()); + EXPECT_THAT(AddN2->preds(DAG), testing::ElementsAre(AddN0, AddN1)); + EXPECT_THAT(CallN->preds(DAG), testing::ElementsAre(AddN1)); + EXPECT_THAT(StN->preds(DAG), + testing::UnorderedElementsAre(CallN, CallN, AddN2)); + EXPECT_THAT(RetN->preds(DAG), testing::ElementsAre(AddN2)); +} + TEST_F(DependencyGraphTest, MemDGNode_getPrevNode_getNextNode) { parseIR(C, R"IR( define void @foo(ptr %ptr, i8 %v0, i8 %v1) { From bb937e276da11c6d85318b32006f6510877c1a2c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 10 Oct 2024 20:04:46 +0100 Subject: [PATCH 064/177] [LV] Compute value of escaped induction based on the computed end value. (#110576) Update fixupIVUsers to compute the value for escaped inductions using the already computed end value of the induction (EndValue), but subtracting the step. This results in slightly simpler codegen, as we avoid computing the full transformed index at VectorTripCount - 1. 
PR: https://github.com/llvm/llvm-project/pull/110576 --- .../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++---- .../AArch64/sve-live-out-pointer-induction.ll | 4 +-- .../LoopVectorize/X86/float-induction-x86.ll | 14 ++++---- .../LoopVectorize/iv_outside_user.ll | 8 ++--- ...o-fold-tail-by-masking-iv-external-uses.ll | 3 +- .../LoopVectorize/pr58811-scev-expansion.ll | 34 +++++++++---------- 6 files changed, 42 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index db650b23e271e2..f2bee2c67a2353 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2747,17 +2747,24 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, if (isa_and_nonnull(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - Value *CountMinusOne = B.CreateSub( - VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); - CountMinusOne->setName("cmo"); - VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); assert(StepVPV && "step must have been expanded during VPlan execution"); Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() : State.get(StepVPV, VPLane(0)); - Value *Escape = - emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, - II.getKind(), II.getInductionBinOp()); + Value *Escape = nullptr; + if (EndValue->getType()->isIntegerTy()) + Escape = B.CreateSub(EndValue, Step); + else if (EndValue->getType()->isPointerTy()) + Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step)); + else if (EndValue->getType()->isFloatingPointTy()) { + Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() == + Instruction::FAdd + ? 
Instruction::FSub + : Instruction::FAdd, + EndValue, Step); + } else { + llvm_unreachable("all possible induction types must be handled"); + } Escape->setName("ind.escape"); MissingVals[UI] = Escape; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll index c28776e82776b7..64b69be5f52598 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -42,9 +42,7 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[CMO]], 8 -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP37]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[IND_END]], i64 -8 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START_1]], [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index cb4f5f6d9eabaf..54dd9c870a1709 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -208,25 +208,23 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x double> 
[[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]] ; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32 ; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 64 ; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 ; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8 ; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8 -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8 -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], ptr [[TMP4]], align 8 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_2]], ptr [[TMP3]], align 8 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_3]], ptr [[TMP4]], align 8 ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; AUTO_VEC-NEXT: [[CMO:%.*]] = add nsw i64 [[N_VEC]], -1 -; AUTO_VEC-NEXT: [[DOTCAST6:%.*]] = sitofp i64 [[CMO]] to double -; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast double [[DOTCAST6]], 3.000000e+00 +; AUTO_VEC-NEXT: [[IND_ESCAPE:%.*]] = fadd fast double [[TMP0]], -3.000000e+00 ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] @@ -238,7 +236,7 @@ define double 
@external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] ; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: for.end: -; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index bf27c146ec9ce1..02fdbc05ed5188 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -63,7 +63,7 @@ for.end: ; CHECK-LABEL: @geppre ; CHECK-LABEL: middle.block: -; CHECK: %ind.escape = getelementptr i8, ptr %ptr, i64 496 +; CHECK: %ind.escape = getelementptr i8, ptr %ind.end, i64 -16 ; CHECK-LABEL: for.end: ; CHECK: %[[RET:.*]] = phi ptr [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ] ; CHECK: ret ptr %[[RET]] @@ -85,9 +85,7 @@ for.end: ; CHECK-LABEL: @both ; CHECK-LABEL: middle.block: -; CHECK: %[[END:.*]] = sub i64 %n.vec, 1 -; CHECK: %[[END_OFFSET:.*]] = mul i64 %[[END]], 4 -; CHECK: %ind.escape = getelementptr i8, ptr %base, i64 %[[END_OFFSET]] +; CHECK: %ind.escape = getelementptr i8, ptr %ind.end1, i64 -4 ; CHECK-LABEL: for.end: ; CHECK: %[[RET:.*]] = phi ptr [ %inc.lag1, %for.body ], [ %ind.escape, %middle.block ] ; CHECK: ret ptr %[[RET]] @@ -142,7 +140,7 @@ for.end: ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block ; CHECK: %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]] -; CHECK: %ind.escape = add i32 %[[T15]], +; CHECK: %ind.escape = sub i32 %ind.end8, -8 ; CHECK: br i1 %[[CMP]], label %BB3, label %scalar.ph define void @PR30742() { BB0: diff --git 
a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll index 80a6bb50ca91b6..d462d3aa650d28 100644 --- a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll +++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll @@ -51,8 +51,7 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = add i64 1, [[CMO]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[IND_END]], 1 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOAD_VAL:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll index c0eb4ccdd6d7e5..af1c146c2c6c4c 100644 --- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll @@ -28,10 +28,10 @@ define void @test1_pr58811() { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = mul i32 195, 
[[INDUCTION_IV_LCSSA]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[INDUCTION_IV_LCSSA]] ; CHECK-NEXT: br i1 false, label [[LOOP_3_PREHEADER:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_2_PREHEADER]] ] @@ -123,28 +123,28 @@ define void @test2_pr58811() { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = mul i32 195, [[INDUCTION_IV_LCSSA]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[INDUCTION_IV_LCSSA]] ; CHECK-NEXT: br i1 false, label [[LOOP_4_PREHEADER:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP_3:%.*]] ; CHECK: loop.3: -; CHECK-NEXT: [[INT16_TINDARRAYSAFEVAR_186_0747_1:%.*]] = phi i16 [ [[INC_1:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[UINT32_TVAR_177_2745_1:%.*]] = phi i32 [ [[SUB93_1:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SUB93_1]] = sub i32 [[UINT32_TVAR_177_2745_1]], [[IV_2_LCSSA]] -; CHECK-NEXT: [[INC_1]] = add i16 [[INT16_TINDARRAYSAFEVAR_186_0747_1]], 1 -; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[INT16_TINDARRAYSAFEVAR_186_0747_1]], 198 +; CHECK-NEXT: [[IV_4:%.*]] = phi i16 [ [[INC_1:%.*]], [[LOOP_3]] ], [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_1:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUB93_1]] = sub i32 [[IV_5]], [[IV_2_LCSSA]] +; CHECK-NEXT: [[INC_1]] = add i16 [[IV_4]], 1 +; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_4]], 198 ; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_3]], label [[LOOP_4_PREHEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.4.preheader: -; CHECK-NEXT: [[UINT32_TVAR_177_2745_1_LCSSA:%.*]] = phi i32 [ [[UINT32_TVAR_177_2745_1]], [[LOOP_3]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[IV_5_LCSSA:%.*]] = phi i32 [ [[IV_5]], [[LOOP_3]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_4]] ; CHECK: loop.4: -; CHECK-NEXT: [[UINT32_TVAR_177_2745_2:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_4]] ], [ 0, [[LOOP_4_PREHEADER]] ] -; CHECK-NEXT: [[SUB93_2]] = sub i32 [[UINT32_TVAR_177_2745_2]], [[UINT32_TVAR_177_2745_1_LCSSA]] +; CHECK-NEXT: [[IV_6:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_4]] ], [ 0, [[LOOP_4_PREHEADER]] ] +; CHECK-NEXT: [[SUB93_2]] = sub i32 [[IV_6]], [[IV_5_LCSSA]] ; CHECK-NEXT: br i1 false, label [[LOOP_4]], label [[LOOP_1_HEADER_LOOPEXIT]] ; entry: @@ -201,10 +201,10 @@ define void @test3_pr58811() { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = mul i32 195, [[TMP3]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[TMP3]] ; CHECK-NEXT: br i1 false, label [[LOOP_4_PREHEADER:%.*]], label 
[[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ] From 125262312f366bd776b668b24026dbbc8e6b4c75 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Thu, 10 Oct 2024 15:11:27 -0400 Subject: [PATCH 065/177] [Coroutines] Improve use of unique_ptr (#111870) * Replace usage of unique_ptr<>(new ...) -> make_unique<>(); --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 12 ++++-------- .../Transforms/Coroutines/ExtraRematTest.cpp | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 88ce331c8cfb64..0395ee62ae988b 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -2211,17 +2211,13 @@ CreateNewABI(Function &F, coro::Shape &S, switch (S.ABI) { case coro::ABI::Switch: - return std::unique_ptr( - new coro::SwitchABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); case coro::ABI::Async: - return std::unique_ptr( - new coro::AsyncABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); case coro::ABI::Retcon: - return std::unique_ptr( - new coro::AnyRetconABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); case coro::ABI::RetconOnce: - return std::unique_ptr( - new coro::AnyRetconABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); } llvm_unreachable("Unknown ABI"); } diff --git a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp index c3394fdaa940ba..68bf640334b5f2 100644 --- a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp +++ b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp @@ -247,7 +247,7 @@ TEST_F(ExtraRematTest, TestCoroRematWithCustomABI) { ASSERT_TRUE(M); CoroSplitPass::BaseABITy GenCustomABI = [](Function &F, coro::Shape &S) { - return 
std::unique_ptr(new ExtraCustomABI(F, S)); + return std::make_unique(F, S); }; CGSCCPassManager CGPM; From f6e93b8147a94a595293b47c39d20d2038c812d1 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 10 Oct 2024 12:13:36 -0700 Subject: [PATCH 066/177] AMDGPU: Minor improvement and cleanup for waterfall loop generation (#111886) First, ReadlanePieces should be in the scope of each MachineOperand. It is not correct if we declare in a outer scope without clearing after the use for a MachineOperand. Additionally, we do not need the OrigBB argyment for emitLoadScalarOpsFromVGPRLoop, since MachineFunction (the only use) can be obtained from LoopBB (or BodyBB). --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0c2ae382f53a19..d676d561d08180 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6302,11 +6302,14 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, // Emit the actual waterfall loop, executing the wrapped instruction for each // unique value of \p ScalarOps across all lanes. In the best case we execute 1 // iteration, in the worst case we execute 64 (once per lane). -static void emitLoadScalarOpsFromVGPRLoop( - const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, - MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, - ArrayRef ScalarOps) { - MachineFunction &MF = *OrigBB.getParent(); +static void +emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, + MachineRegisterInfo &MRI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &BodyBB, + const DebugLoc &DL, + ArrayRef ScalarOps) { + MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; @@ -6319,8 +6322,6 @@ static void emitLoadScalarOpsFromVGPRLoop( const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); - - SmallVector ReadlanePieces; Register CondReg; for (MachineOperand *ScalarOp : ScalarOps) { @@ -6355,6 +6356,7 @@ static void emitLoadScalarOpsFromVGPRLoop( ScalarOp->setReg(CurReg); ScalarOp->setIsKill(); } else { + SmallVector ReadlanePieces; unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); @@ -6535,7 +6537,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); + emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps); MachineBasicBlock::iterator First = RemainderBB->begin(); // Restore SCC From e34d614e7d8616f165f3f5d349db98d9924826f2 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 10 Oct 2024 15:28:46 -0400 Subject: [PATCH 067/177] [Passes] Remove -enable-infer-alignment-pass flag (#111873) This flag has been on for a while without any complaints. 
--- llvm/lib/Passes/PassBuilderPipelines.cpp | 8 ++----- .../InstCombineLoadStoreAlloca.cpp | 23 ------------------- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 8f151a99b11709..0167d1058c3ac1 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -300,8 +300,6 @@ extern cl::opt UseCtxProfile; namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; - -extern cl::opt EnableInferAlignmentPass; } // namespace llvm PipelineTuningOptions::PipelineTuningOptions() { @@ -1250,8 +1248,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - if (EnableInferAlignmentPass) - FPM.addPass(InferAlignmentPass()); + FPM.addPass(InferAlignmentPass()); if (IsFullLTO) { // The vectorizer may have significantly shortened a loop body; unroll // again. 
Unroll small loops to hide loop backedge latency and saturate any @@ -1369,8 +1366,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); } - if (EnableInferAlignmentPass) - FPM.addPass(InferAlignmentPass()); + FPM.addPass(InferAlignmentPass()); FPM.addPass(InstCombinePass()); // This is needed for two reasons: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0b51845ab5e257..93d183837d6f43 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -37,13 +37,6 @@ static cl::opt MaxCopiedFromConstantUsers( cl::desc("Maximum users to visit in copy from constant transform"), cl::Hidden); -namespace llvm { -cl::opt EnableInferAlignmentPass( - "enable-infer-alignment-pass", cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Enable the InferAlignment pass, disabling alignment inference in " - "InstCombine")); -} - /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived) /// pointer to an alloca. Ignore any reads of the pointer, return false if we /// see any stores or other unknown uses. If we see pointer arithmetic, keep @@ -1010,14 +1003,6 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { if (Instruction *Res = combineLoadToOperationType(*this, LI)) return Res; - if (!EnableInferAlignmentPass) { - // Attempt to improve the alignment. - Align KnownAlign = getOrEnforceKnownAlignment( - Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT); - if (KnownAlign > LI.getAlign()) - LI.setAlignment(KnownAlign); - } - // Replace GEP indices if possible. 
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) return replaceOperand(LI, 0, NewGEPI); @@ -1358,14 +1343,6 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { if (combineStoreToValueType(*this, SI)) return eraseInstFromFunction(SI); - if (!EnableInferAlignmentPass) { - // Attempt to improve the alignment. - const Align KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT); - if (KnownAlign > SI.getAlign()) - SI.setAlignment(KnownAlign); - } - // Try to canonicalize the stored type. if (unpackStoreToAggregate(*this, SI)) return eraseInstFromFunction(SI); From 07892aaf04032e7a18368bc8320f93f7d46ab20f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 9 Oct 2024 11:18:01 -0700 Subject: [PATCH 068/177] [NFC][sanitizer] Clang format sanitizer_thread_registry.cpp --- .../sanitizer_thread_registry.cpp | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp index 741e0731c41559..df04822b28851c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp @@ -18,9 +18,16 @@ namespace __sanitizer { ThreadContextBase::ThreadContextBase(u32 tid) - : tid(tid), unique_id(0), reuse_count(), os_id(0), user_id(0), - status(ThreadStatusInvalid), detached(false), - thread_type(ThreadType::Regular), parent_tid(0), next(0) { + : tid(tid), + unique_id(0), + reuse_count(), + os_id(0), + user_id(0), + status(ThreadStatusInvalid), + detached(false), + thread_type(ThreadType::Regular), + parent_tid(0), + next(0) { name[0] = '\0'; atomic_store(&thread_destroyed, 0, memory_order_release); } @@ -39,8 +46,7 @@ void ThreadContextBase::SetName(const char *new_name) { } void ThreadContextBase::SetDead() { - CHECK(status == ThreadStatusRunning || - status == 
ThreadStatusFinished); + CHECK(status == ThreadStatusRunning || status == ThreadStatusFinished); status = ThreadStatusDead; user_id = 0; OnDead(); @@ -68,7 +74,8 @@ void ThreadContextBase::SetFinished() { // for a thread that never actually started. In that case the thread // should go to ThreadStatusFinished regardless of whether it was created // as detached. - if (!detached || status == ThreadStatusCreated) status = ThreadStatusFinished; + if (!detached || status == ThreadStatusCreated) + status = ThreadStatusFinished; OnFinished(); } @@ -124,8 +131,10 @@ void ThreadRegistry::GetNumberOfThreads(uptr *total, uptr *running, ThreadRegistryLock l(this); if (total) *total = threads_.size(); - if (running) *running = running_threads_; - if (alive) *alive = alive_threads_; + if (running) + *running = running_threads_; + if (alive) + *alive = alive_threads_; } uptr ThreadRegistry::GetMaxAliveThreads() { @@ -150,8 +159,10 @@ u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid, Report("%s: Thread limit (%u threads) exceeded. Dying.\n", SanitizerToolName, max_threads_); #else - Printf("race: limit on %u simultaneously alive goroutines is exceeded," - " dying\n", max_threads_); + Printf( + "race: limit on %u simultaneously alive goroutines is exceeded," + " dying\n", + max_threads_); #endif Die(); } @@ -170,8 +181,7 @@ u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid, // positives later (e.g. if we join a wrong thread). 
CHECK(live_.try_emplace(user_id, tid).second); } - tctx->SetCreated(user_id, total_threads_++, detached, - parent_tid, arg); + tctx->SetCreated(user_id, total_threads_++, detached, parent_tid, arg); return tid; } @@ -196,8 +206,8 @@ u32 ThreadRegistry::FindThread(FindThreadCallback cb, void *arg) { return kInvalidTid; } -ThreadContextBase * -ThreadRegistry::FindThreadContextLocked(FindThreadCallback cb, void *arg) { +ThreadContextBase *ThreadRegistry::FindThreadContextLocked( + FindThreadCallback cb, void *arg) { CheckLocked(); for (u32 tid = 0; tid < threads_.size(); tid++) { ThreadContextBase *tctx = threads_[tid]; @@ -210,7 +220,7 @@ ThreadRegistry::FindThreadContextLocked(FindThreadCallback cb, void *arg) { static bool FindThreadContextByOsIdCallback(ThreadContextBase *tctx, void *arg) { return (tctx->os_id == (uptr)arg && tctx->status != ThreadStatusInvalid && - tctx->status != ThreadStatusDead); + tctx->status != ThreadStatusDead); } ThreadContextBase *ThreadRegistry::FindThreadContextByOsIDLocked(tid_t os_id) { From a4916d200518ac077be93995af18bd80fcb89cc2 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 12:42:28 -0700 Subject: [PATCH 069/177] [SandboxVec][DAG] Refactoring: Move MemPreds from DGNode to MemDGNode (#111897) --- .../SandboxVectorizer/DependencyGraph.h | 36 +++++---- .../SandboxVectorizer/DependencyGraph.cpp | 20 ++--- .../SandboxVectorizer/DependencyGraphTest.cpp | 73 +++++++++++-------- 3 files changed, 72 insertions(+), 57 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index eba6d7562e41de..da50e5326ea069 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -1,4 +1,4 @@ -//===- DependencyGraph.h ----------------------------------*- C++ -*-===// +//===- DependencyGraph.h 
----------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -96,9 +96,6 @@ class DGNode { // TODO: Use a PointerIntPair for SubclassID and I. /// For isa/dyn_cast etc. DGNodeID SubclassID; - // TODO: Move MemPreds to MemDGNode. - /// Memory predecessors. - DenseSet MemPreds; DGNode(Instruction *I, DGNodeID ID) : I(I), SubclassID(ID) {} friend class MemDGNode; // For constructor. @@ -170,17 +167,6 @@ class DGNode { } Instruction *getInstruction() const { return I; } - void addMemPred(MemDGNode *PredN) { MemPreds.insert(PredN); } - /// \Returns all memory dependency predecessors. - iterator_range::const_iterator> memPreds() const { - return make_range(MemPreds.begin(), MemPreds.end()); - } - /// \Returns true if there is a memory dependency N->this. - bool hasMemPred(DGNode *N) const { - if (auto *MN = dyn_cast(N)) - return MemPreds.count(MN); - return false; - } #ifndef NDEBUG virtual void print(raw_ostream &OS, bool PrintDeps = true) const; @@ -198,6 +184,9 @@ class DGNode { class MemDGNode final : public DGNode { MemDGNode *PrevMemN = nullptr; MemDGNode *NextMemN = nullptr; + /// Memory predecessors. + DenseSet MemPreds; + friend class PredIterator; // For MemPreds. void setNextNode(MemDGNode *N) { NextMemN = N; } void setPrevNode(MemDGNode *N) { PrevMemN = N; } @@ -222,6 +211,21 @@ class MemDGNode final : public DGNode { MemDGNode *getPrevNode() const { return PrevMemN; } /// \Returns the next Mem DGNode in instruction order. MemDGNode *getNextNode() const { return NextMemN; } + /// Adds the mem dependency edge PredN->this. + void addMemPred(MemDGNode *PredN) { MemPreds.insert(PredN); } + /// \Returns true if there is a memory dependency N->this. + bool hasMemPred(DGNode *N) const { + if (auto *MN = dyn_cast(N)) + return MemPreds.count(MN); + return false; + } + /// \Returns all memory dependency predecessors. 
Used by tests. + iterator_range::const_iterator> memPreds() const { + return make_range(MemPreds.begin(), MemPreds.end()); + } +#ifndef NDEBUG + virtual void print(raw_ostream &OS, bool PrintDeps = true) const override; +#endif // NDEBUG }; /// Convenience builders for a MemDGNode interval. @@ -266,7 +270,7 @@ class DependencyGraph { /// Go through all mem nodes in \p SrcScanRange and try to add dependencies to /// \p DstN. - void scanAndAddDeps(DGNode &DstN, const Interval &SrcScanRange); + void scanAndAddDeps(MemDGNode &DstN, const Interval &SrcScanRange); public: DependencyGraph(AAResults &AA) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 7aea466ed6d8db..70843812ff65bc 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -23,7 +23,8 @@ PredIterator::value_type PredIterator::operator*() { // or a mem predecessor. if (OpIt != OpItE) return DAG->getNode(cast((Value *)*OpIt)); - assert(MemIt != cast(N)->memPreds().end() && + // It's a MemDGNode with OpIt == end, so we need to use MemIt. + assert(MemIt != cast(N)->MemPreds.end() && "Cant' dereference end iterator!"); return *MemIt; } @@ -45,7 +46,8 @@ PredIterator &PredIterator::operator++() { OpIt = skipNonInstr(OpIt, OpItE); return *this; } - assert(MemIt != cast(N)->memPreds().end() && "Already at end!"); + // It's a MemDGNode with OpIt == end, so we need to increment MemIt. 
+ assert(MemIt != cast(N)->MemPreds.end() && "Already at end!"); ++MemIt; return *this; } @@ -57,10 +59,14 @@ bool PredIterator::operator==(const PredIterator &Other) const { } #ifndef NDEBUG -void DGNode::print(raw_ostream &OS, bool PrintDeps) const { +void DGNode::print(raw_ostream &OS, bool PrintDeps) const { I->dumpOS(OS); } +void DGNode::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { I->dumpOS(OS); if (PrintDeps) { - OS << "\n"; // Print memory preds. static constexpr const unsigned Indent = 4; for (auto *Pred : MemPreds) { @@ -70,10 +76,6 @@ void DGNode::print(raw_ostream &OS, bool PrintDeps) const { } } } -void DGNode::dump() const { - print(dbgs()); - dbgs() << "\n"; -} #endif // NDEBUG Interval @@ -179,7 +181,7 @@ bool DependencyGraph::hasDep(Instruction *SrcI, Instruction *DstI) { llvm_unreachable("Unknown DependencyType enum"); } -void DependencyGraph::scanAndAddDeps(DGNode &DstN, +void DependencyGraph::scanAndAddDeps(MemDGNode &DstN, const Interval &SrcScanRange) { assert(isa(DstN) && "DstN is the mem dep destination, so it must be mem"); diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 6b3d9cc77c9955..5a9c9815ca42fa 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -50,10 +50,10 @@ struct DependencyGraphTest : public testing::Test { return *AA; } /// \Returns true if there is a dependency: SrcN->DstN. 
- bool dependency(sandboxir::DGNode *SrcN, sandboxir::DGNode *DstN) { - const auto &Preds = DstN->memPreds(); - auto It = find(Preds, SrcN); - return It != Preds.end(); + bool memDependency(sandboxir::DGNode *SrcN, sandboxir::DGNode *DstN) { + if (auto *MemDstN = dyn_cast(DstN)) + return MemDstN->hasMemPred(SrcN); + return false; } }; @@ -230,9 +230,10 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { EXPECT_EQ(Span.top(), &*BB->begin()); EXPECT_EQ(Span.bottom(), BB->getTerminator()); - sandboxir::DGNode *N0 = DAG.getNode(S0); - sandboxir::DGNode *N1 = DAG.getNode(S1); - sandboxir::DGNode *N2 = DAG.getNode(Ret); + auto *N0 = cast(DAG.getNode(S0)); + auto *N1 = cast(DAG.getNode(S1)); + auto *N2 = DAG.getNode(Ret); + // Check getInstruction(). EXPECT_EQ(N0->getInstruction(), S0); EXPECT_EQ(N1->getInstruction(), S1); @@ -247,7 +248,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { // Check memPreds(). EXPECT_TRUE(N0->memPreds().empty()); EXPECT_THAT(N1->memPreds(), testing::ElementsAre(N0)); - EXPECT_TRUE(N2->memPreds().empty()); + EXPECT_TRUE(N2->preds(DAG).empty()); } TEST_F(DependencyGraphTest, Preds) { @@ -399,12 +400,14 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); - auto *Store0N = DAG.getNode(cast(&*It++)); - auto *Store1N = DAG.getNode(cast(&*It++)); + auto *Store0N = cast( + DAG.getNode(cast(&*It++))); + auto *Store1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); EXPECT_TRUE(Store0N->memPreds().empty()); EXPECT_THAT(Store1N->memPreds(), testing::ElementsAre(Store0N)); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, NonAliasingStores) { @@ -422,13 +425,15 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); 
- auto *Store0N = DAG.getNode(cast(&*It++)); - auto *Store1N = DAG.getNode(cast(&*It++)); + auto *Store0N = cast( + DAG.getNode(cast(&*It++))); + auto *Store1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); // We expect no dependencies because the stores don't alias. EXPECT_TRUE(Store0N->memPreds().empty()); EXPECT_TRUE(Store1N->memPreds().empty()); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, VolatileLoads) { @@ -446,12 +451,14 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); - auto *Ld0N = DAG.getNode(cast(&*It++)); - auto *Ld1N = DAG.getNode(cast(&*It++)); + auto *Ld0N = cast( + DAG.getNode(cast(&*It++))); + auto *Ld1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); EXPECT_TRUE(Ld0N->memPreds().empty()); EXPECT_THAT(Ld1N->memPreds(), testing::ElementsAre(Ld0N)); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, VolatileSotres) { @@ -469,12 +476,14 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); - auto *Store0N = DAG.getNode(cast(&*It++)); - auto *Store1N = DAG.getNode(cast(&*It++)); + auto *Store0N = cast( + DAG.getNode(cast(&*It++))); + auto *Store1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); EXPECT_TRUE(Store0N->memPreds().empty()); EXPECT_THAT(Store1N->memPreds(), testing::ElementsAre(Store0N)); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, Call) { @@ -498,12 +507,12 @@ define void @foo(float %v1, float %v2) { DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()}); auto It = BB->begin(); - auto 
*Call1N = DAG.getNode(&*It++); + auto *Call1N = cast(DAG.getNode(&*It++)); auto *AddN = DAG.getNode(&*It++); - auto *Call2N = DAG.getNode(&*It++); + auto *Call2N = cast(DAG.getNode(&*It++)); EXPECT_THAT(Call1N->memPreds(), testing::ElementsAre()); - EXPECT_THAT(AddN->memPreds(), testing::ElementsAre()); + EXPECT_THAT(AddN->preds(DAG), testing::ElementsAre()); EXPECT_THAT(Call2N->memPreds(), testing::ElementsAre(Call1N)); } @@ -534,8 +543,8 @@ define void @foo() { auto *AllocaN = DAG.getNode(&*It++); auto *StackRestoreN = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(AllocaN, StackRestoreN)); - EXPECT_TRUE(dependency(StackSaveN, AllocaN)); + EXPECT_TRUE(memDependency(AllocaN, StackRestoreN)); + EXPECT_TRUE(memDependency(StackSaveN, AllocaN)); } // Checks that stacksave and stackrestore depend on other mem instrs. @@ -567,9 +576,9 @@ define void @foo(i8 %v0, i8 %v1, ptr %ptr) { auto *StackRestoreN = DAG.getNode(&*It++); auto *Store1N = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(Store0N, StackSaveN)); - EXPECT_TRUE(dependency(StackSaveN, StackRestoreN)); - EXPECT_TRUE(dependency(StackRestoreN, Store1N)); + EXPECT_TRUE(memDependency(Store0N, StackSaveN)); + EXPECT_TRUE(memDependency(StackSaveN, StackRestoreN)); + EXPECT_TRUE(memDependency(StackRestoreN, Store1N)); } // Make sure there is a dependency between a stackrestore and an alloca. 
@@ -596,7 +605,7 @@ define void @foo(ptr %ptr) { auto *StackRestoreN = DAG.getNode(&*It++); auto *AllocaN = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(StackRestoreN, AllocaN)); + EXPECT_TRUE(memDependency(StackRestoreN, AllocaN)); } // Make sure there is a dependency between the alloca and stacksave @@ -623,7 +632,7 @@ define void @foo(ptr %ptr) { auto *AllocaN = DAG.getNode(&*It++); auto *StackSaveN = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(AllocaN, StackSaveN)); + EXPECT_TRUE(memDependency(AllocaN, StackSaveN)); } // A non-InAlloca in a stacksave-stackrestore region does not need extra @@ -655,6 +664,6 @@ define void @foo() { auto *AllocaN = DAG.getNode(&*It++); auto *StackRestoreN = DAG.getNode(&*It++); - EXPECT_FALSE(dependency(StackSaveN, AllocaN)); - EXPECT_FALSE(dependency(AllocaN, StackRestoreN)); + EXPECT_FALSE(memDependency(StackSaveN, AllocaN)); + EXPECT_FALSE(memDependency(AllocaN, StackRestoreN)); } From d832a1c744fddad93ec4d8d2739c2a49a3623e02 Mon Sep 17 00:00:00 2001 From: Justin Fargnoli Date: Thu, 10 Oct 2024 12:57:43 -0700 Subject: [PATCH 070/177] [NVPTX] Only run LowerUnreachable when necessary (#109868) Before CUDA 12.3 `ptxas` did not recognize that the trap instruction terminates a basic block. Instead, it would assume that control flow continued to the next instruction. The next instruction could be in the block that's lexically below it. This would lead to phantom CFG edges being created within ptxas. [NVPTX: Lower unreachable to exit to allow ptxas to accurately reconstruct the CFG.](https://github.com/llvm/llvm-project/commit/1ee4d880e8760256c606fe55b7af85a4f70d006d) added the LowerUnreachable pass to NVPTX to work around this. Several other WAR patches followed. 
This bug in `ptxas` was fixed in CUDA 12.3 and is thus impossible to encounter when targeting PTX ISA v8.3+ This commit reverts the WARs for the `ptxas` bug when targeting PTX ISA v8.3+ CC @maleadt --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 5 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 8 ++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 +- llvm/test/CodeGen/NVPTX/unreachable.ll | 103 +++++++++++++++---- 4 files changed, 100 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 8f4eddb5142740..8b34ce4f1001c1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,6 +139,8 @@ def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; +def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; +def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; @@ -3736,9 +3738,10 @@ def Callseq_End : [(callseq_end timm:$amt1, timm:$amt2)]>; // trap instruction +def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>; // Emit an `exit` as well to convey to ptxas that `trap` exits the CFG. // This won't be necessary in a future version of ptxas. 
-def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>; +def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>; // brkpt instruction def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 8b9059bd60cbd4..e785bbf830da62 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -95,6 +95,14 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction + // terminates a basic block. Instead, it would assume that control flow + // continued to the next instruction. The next instruction could be in the + // block that's lexically below it. This would lead to a phantom CFG edges + // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when + // PTX ISA versions 8.3+ we can confidently say that the bug will not be + // present. + bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8e6e4395efb559..2eb8b17f1b0f40 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -367,9 +367,13 @@ void NVPTXPassConfig::addIRPasses() { addPass(createSROAPass()); } - const auto &Options = getNVPTXTargetMachine().Options; - addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable, - Options.NoTrapAfterNoreturn)); + if (ST.hasPTXASUnreachableBug()) { + // Run LowerUnreachable to WAR a ptxas bug. 
See the commit description of + // 1ee4d880e8760256c606fe55b7af85a4f70d006d for more details. + const auto &Options = getNVPTXTargetMachine().Options; + addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable, + Options.NoTrapAfterNoreturn)); + } } bool NVPTXPassConfig::addInstSelector() { diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll index f9118900cb7372..6bd583c8d50d8a 100644 --- a/llvm/test/CodeGen/NVPTX/unreachable.ll +++ b/llvm/test/CodeGen/NVPTX/unreachable.ll @@ -1,48 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs -trap-unreachable=false \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-UNREACHABLE ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable=false \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-UNREACHABLE ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-AFTER-NORETURN ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-AFTER-NORETURN ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn=false \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-TRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,TRAP ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn=false \ -; RUN: | FileCheck %s --check-prefix=CHECK 
--check-prefix=CHECK-TRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,TRAP +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -mattr=+ptx83 \ +; RUN: | FileCheck %s --check-prefixes=BUG-FIXED ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} -; CHECK: .extern .func throw +target triple = "nvptx-unknown-cuda" + declare void @throw() #0 declare void @llvm.trap() #0 -; CHECK-LABEL: .entry kernel_func define void @kernel_func() { -; CHECK: call.uni -; CHECK: throw, +; NO-TRAP-UNREACHABLE-LABEL: kernel_func( +; NO-TRAP-UNREACHABLE: { +; NO-TRAP-UNREACHABLE-EMPTY: +; NO-TRAP-UNREACHABLE-EMPTY: +; NO-TRAP-UNREACHABLE-NEXT: // %bb.0: +; NO-TRAP-UNREACHABLE-NEXT: { // callseq 0, 0 +; NO-TRAP-UNREACHABLE-NEXT: call.uni +; NO-TRAP-UNREACHABLE-NEXT: throw, +; NO-TRAP-UNREACHABLE-NEXT: ( +; NO-TRAP-UNREACHABLE-NEXT: ); +; NO-TRAP-UNREACHABLE-NEXT: } // callseq 0 +; NO-TRAP-UNREACHABLE-NEXT: // begin inline asm +; NO-TRAP-UNREACHABLE-NEXT: exit; +; NO-TRAP-UNREACHABLE-NEXT: // end inline asm +; +; NO-TRAP-AFTER-NORETURN-LABEL: kernel_func( +; NO-TRAP-AFTER-NORETURN: { +; NO-TRAP-AFTER-NORETURN-EMPTY: +; NO-TRAP-AFTER-NORETURN-EMPTY: +; NO-TRAP-AFTER-NORETURN-NEXT: // %bb.0: +; NO-TRAP-AFTER-NORETURN-NEXT: { // callseq 0, 0 +; NO-TRAP-AFTER-NORETURN-NEXT: call.uni +; NO-TRAP-AFTER-NORETURN-NEXT: throw, +; NO-TRAP-AFTER-NORETURN-NEXT: ( +; NO-TRAP-AFTER-NORETURN-NEXT: ); +; NO-TRAP-AFTER-NORETURN-NEXT: } // callseq 0 +; NO-TRAP-AFTER-NORETURN-NEXT: // begin inline asm +; NO-TRAP-AFTER-NORETURN-NEXT: exit; +; NO-TRAP-AFTER-NORETURN-NEXT: // end inline asm +; NO-TRAP-AFTER-NORETURN-NEXT: trap; exit; +; +; TRAP-LABEL: kernel_func( +; TRAP: { +; TRAP-EMPTY: +; TRAP-EMPTY: +; TRAP-NEXT: // %bb.0: +; TRAP-NEXT: { // callseq 0, 0 +; TRAP-NEXT: call.uni +; TRAP-NEXT: throw, +; TRAP-NEXT: ( +; 
TRAP-NEXT: ); +; TRAP-NEXT: } // callseq 0 +; TRAP-NEXT: trap; exit; +; +; BUG-FIXED-LABEL: kernel_func( +; BUG-FIXED: { +; BUG-FIXED-EMPTY: +; BUG-FIXED-EMPTY: +; BUG-FIXED-NEXT: // %bb.0: +; BUG-FIXED-NEXT: { // callseq 0, 0 +; BUG-FIXED-NEXT: call.uni +; BUG-FIXED-NEXT: throw, +; BUG-FIXED-NEXT: ( +; BUG-FIXED-NEXT: ); +; BUG-FIXED-NEXT: } // callseq 0 +; BUG-FIXED-NEXT: trap; call void @throw() -; CHECK-TRAP-NOT: exit; -; CHECK-TRAP: trap; -; CHECK-NOTRAP-NOT: trap; -; CHECK: exit; unreachable } -; CHECK-LABEL: kernel_func_2 define void @kernel_func_2() { -; CHECK: trap; exit; +; CHECK-LABEL: kernel_func_2( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: trap; exit; +; +; BUG-FIXED-LABEL: kernel_func_2( +; BUG-FIXED: { +; BUG-FIXED-EMPTY: +; BUG-FIXED-EMPTY: +; BUG-FIXED-NEXT: // %bb.0: +; BUG-FIXED-NEXT: trap; call void @llvm.trap() - -;; Make sure we avoid emitting two trap instructions. -; CHECK-NOT: trap; -; CHECK-NOT: exit; +; Make sure we avoid emitting two trap instructions. unreachable } attributes #0 = { noreturn } - !nvvm.annotations = !{!1} - !1 = !{ptr @kernel_func, !"kernel", i32 1} From 29e192a0bfbc75fa66498d3b1c1d1329009f1dd2 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Thu, 10 Oct 2024 15:59:24 -0400 Subject: [PATCH 071/177] [Coroutines] Documentation for custom ABIs (#111781) Update the llvm/docs/Coroutines.rst docs to include a full description of Custom ABI objects. This documentation describes the how ABI objects allow users (plugin libraries) to create custom ABI objects for their needs. --- llvm/docs/Coroutines.rst | 90 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 5679aefcb421d8..8794df65504fa2 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -312,6 +312,7 @@ lowered to a constant representing the size required for the coroutine frame. 
The `coro.begin`_ intrinsic initializes the coroutine frame and returns the coroutine handle. The second parameter of `coro.begin` is given a block of memory to be used if the coroutine frame needs to be allocated dynamically. + The `coro.id`_ intrinsic serves as coroutine identity useful in cases when the `coro.begin`_ intrinsic get duplicated by optimization passes such as jump-threading. @@ -749,6 +750,65 @@ and python iterator `__next__` would look like: return *(int*)coro.promise(hdl, 4, false); } +Custom ABIs and Plugin Libraries +-------------------------------- + +Plugin libraries can extend coroutine lowering enabling a wide variety of users +to utilize the coroutine transformation passes. An existing coroutine lowering +is extended by: + +#. defining custom ABIs that inherit from the existing ABIs, +#. give a list of generators for the custom ABIs when constructing the `CoroSplit`_ pass, and +#. use `coro.begin.custom.abi`_ in place of `coro.begin`_ that has an additional parameter for the index of the generator/ABI to be used for the coroutine. + +A custom ABI overriding the SwitchABI's materialization looks like: + +.. code-block:: c++ + + class CustomSwitchABI : public coro::SwitchABI { + public: + CustomSwitchABI(Function &F, coro::Shape &S) + : coro::SwitchABI(F, S, ExtraMaterializable) {} + }; + +Giving a list of custom ABI generators while constructing the `CoroSplit` +pass looks like: + +.. code-block:: c++ + + CoroSplitPass::BaseABITy GenCustomABI = [](Function &F, coro::Shape &S) { + return std::make_unique(F, S); + }; + + CGSCCPassManager CGPM; + CGPM.addPass(CoroSplitPass({GenCustomABI})); + +The LLVM IR for a coroutine using a Coroutine with a custom ABI looks like: + +.. 
code-block:: llvm + + define ptr @f(i32 %n) presplitcoroutine_custom_abi { + entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %hdl = call noalias ptr @llvm.coro.begin.custom.abi(token %id, ptr %alloc, i32 0) + br label %loop + loop: + %n.val = phi i32 [ %n, %entry ], [ %inc, %loop ] + %inc = add nsw i32 %n.val, 1 + call void @print(i32 %n.val) + %0 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %0, label %suspend [i8 0, label %loop + i8 1, label %cleanup] + cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + suspend: + %unused = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + ret ptr %hdl + } Intrinsics ========== @@ -1007,6 +1067,36 @@ with small positive and negative offsets). A frontend should emit exactly one `coro.begin` intrinsic per coroutine. +.. _coro.begin.custom.abi: + +'llvm.coro.begin.custom.abi' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:: + + declare ptr @llvm.coro.begin.custom.abi(token , ptr , i32) + +Overview: +""""""""" + +The '``llvm.coro.begin.custom.abi``' intrinsic is used in place of the +`coro.begin` intrinsic that has an additional parameter to specify the custom +ABI for the coroutine. The return is identical to that of the `coro.begin` +intrinsic. + +Arguments: +"""""""""" + +The first and second arguments are identical to those of the `coro.begin` +intrinsic. + +The third argument is an i32 index of the generator list given to the +`CoroSplit` pass specifying the custom ABI generator lor this coroutine. + +Semantics: +"""""""""" + +The semantics are identical to those of the `coro.begin` intrinsic. + .. 
_coro.free: 'llvm.coro.free' Intrinsic From d5e1de6da96c1ab3b8cae68447e8ed3696a7006e Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Fri, 11 Oct 2024 09:01:47 +1300 Subject: [PATCH 072/177] [lldb] Implement basic support for reverse-continue (#99736) This commit only adds support for the `SBProcess::ReverseContinue()` API. A user-accessible command for this will follow in a later commit. This feature depends on a gdbserver implementation (e.g. `rr`) providing support for the `bc` and `bs` packets. `lldb-server` does not support those packets, and there is no plan to change that. So, for testing purposes, `lldbreverse.py` wraps `lldb-server` with a Python implementation of *very limited* record-and-replay functionality for use by *tests only*. The majority of this PR is test infrastructure (about 700 of the 950 lines added). --- lldb/include/lldb/API/SBProcess.h | 1 + lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 + lldb/include/lldb/lldb-enumerations.h | 6 + .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 ++++++++ .../Python/lldbsuite/test/lldbreverse.py | 418 ++++++++++++++++++ .../Python/lldbsuite/test/lldbtest.py | 2 + lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 + .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 + .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 + .../gdb-remote/GDBRemoteCommunicationClient.h | 6 + .../GDBRemoteCommunicationServerLLGS.cpp | 1 + .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +++- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- 
lldb/source/Target/StopInfo.cpp | 29 ++ lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 + .../TestReverseContinueBreakpoints.py | 115 +++++ .../TestReverseContinueNotSupported.py | 30 ++ .../functionalities/reverse-execution/main.c | 14 + lldb/tools/lldb-dap/JSONUtils.cpp | 3 + lldb/tools/lldb-dap/LLDBUtils.cpp | 1 + 32 files changed, 978 insertions(+), 44 deletions(-) create mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py create mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py create mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py create mode 100644 lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 1624e02070b1b2..8b8ed830b54cc0 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,6 +159,7 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); + lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index b8c53a474ba6b9..fe7fbc50fd5770 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(); + Status Resume(lldb::RunDirection direction = lldb::eRunForward); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream); + Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); /// Halts a running process. 
/// @@ -1104,9 +1104,14 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume() { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); + virtual Status DoResume(lldb::RunDirection direction) { + if (direction == lldb::RunDirection::eRunForward) { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } /// Called after resuming a process. @@ -2332,6 +2337,8 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; + lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } + DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2851,7 +2858,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(); + Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); // Called internally void CompleteAttach(); @@ -3127,6 +3134,8 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. + // The direction of execution from the last time this process was resumed. + lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. 
It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index fae90364deaf0a..072f71f6b1122f 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,6 +142,12 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); + // This creates a StopInfo indicating that execution stopped because + // it was replaying some recorded execution history, and execution reached + // the end of that recorded history. + static lldb::StopInfoSP + CreateStopReasonHistoryBoundary(Thread &thread, const char *description); + static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 938f6e3abe8f2a..232d1dfdb5c9d0 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,6 +135,9 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; +/// Execution directions +enum RunDirection { eRunForward, eRunReverse }; + /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -254,6 +257,9 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt + // Indicates that execution stopped because the debugger backend relies + // on recorded data and we reached the end of that data. + eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 1784487323ad6b..732d6171320680 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,8 +510,9 @@ def start(self): self._thread.start() def stop(self): - self._thread.join() - self._thread = None + if self._thread is not None: + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py new file mode 100644 index 00000000000000..2a9592bf4545a4 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py @@ -0,0 +1,175 @@ +import logging +import os +import os.path +import random + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +import lldbgdbserverutils +from lldbsuite.support import seven + + +class GDBProxyTestBase(TestBase): + """ + Base class for gdbserver proxy tests. + + This class will setup and start a mock GDB server for the test to use. + It pases through requests to a regular lldb-server/debugserver and + forwards replies back to the LLDB under test. 
+ """ + + """The gdbserver that we implement.""" + server = None + """The inner lldb-server/debugserver process that we proxy requests into.""" + monitor_server = None + monitor_sock = None + + server_socket_class = TCPServerSocket + + DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) + + _verbose_log_handler = None + _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") + + def setUpBaseLogging(self): + self.logger = logging.getLogger(__name__) + + if len(self.logger.handlers) > 0: + return # We have set up this handler already + + self.logger.propagate = False + self.logger.setLevel(logging.DEBUG) + + # log all warnings to stderr + handler = logging.StreamHandler() + handler.setLevel(logging.WARNING) + handler.setFormatter(self._log_formatter) + self.logger.addHandler(handler) + + def setUp(self): + TestBase.setUp(self) + + self.setUpBaseLogging() + + if self.isVerboseLoggingRequested(): + # If requested, full logs go to a log file + log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" + self._verbose_log_handler = logging.FileHandler( + log_file_name + ) + self._verbose_log_handler.setFormatter(self._log_formatter) + self._verbose_log_handler.setLevel(logging.DEBUG) + self.logger.addHandler(self._verbose_log_handler) + + lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() + if lldb_server_exe is None: + self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() + self.assertTrue(self.debug_monitor_exe is not None) + self.debug_monitor_extra_args = [] + else: + self.debug_monitor_exe = lldb_server_exe + self.debug_monitor_extra_args = ["gdbserver"] + + self.server = MockGDBServer(self.server_socket_class()) + self.server.responder = self + + def tearDown(self): + # TestBase.tearDown will kill the process, but we need to kill it early + # so its client connection closes and we can stop the server before + # finally calling the base tearDown. 
+ if self.process() is not None: + self.process().Kill() + self.server.stop() + + self.logger.removeHandler(self._verbose_log_handler) + self._verbose_log_handler = None + + TestBase.tearDown(self) + + def isVerboseLoggingRequested(self): + # We will report our detailed logs if the user requested that the "gdb-remote" channel is + # logged. + return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) + + def connect(self, target): + """ + Create a process by connecting to the mock GDB server. + """ + self.prep_debug_monitor_and_inferior() + self.server.start() + + listener = self.dbg.GetListener() + error = lldb.SBError() + process = target.ConnectRemote( + listener, self.server.get_connect_url(), "gdb-remote", error + ) + self.assertTrue(error.Success(), error.description) + self.assertTrue(process, PROCESS_IS_VALID) + return process + + def get_next_port(self): + return 12000 + random.randint(0, 3999) + + def prep_debug_monitor_and_inferior(self): + inferior_exe_path = self.getBuildArtifact("a.out") + self.connect_to_debug_monitor([inferior_exe_path]) + self.assertIsNotNone(self.monitor_server) + self.initial_handshake() + + def initial_handshake(self): + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "OK") + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + + def get_debug_monitor_command_line_args(self, connect_address, launch_args): + return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args + + 
def launch_debug_monitor(self, launch_args): + family, type, proto, _, addr = socket.getaddrinfo( + "localhost", 0, proto=socket.IPPROTO_TCP + )[0] + sock = socket.socket(family, type, proto) + sock.settimeout(self.DEFAULT_TIMEOUT) + sock.bind(addr) + sock.listen(1) + addr = sock.getsockname() + connect_address = "[{}]:{}".format(*addr) + + commandline_args = self.get_debug_monitor_command_line_args( + connect_address, launch_args + ) + + # Start the server. + self.logger.info(f"Spawning monitor {commandline_args}") + monitor_process = self.spawnSubprocess( + self.debug_monitor_exe, commandline_args, install_remote=False + ) + self.assertIsNotNone(monitor_process) + + self.monitor_sock = sock.accept()[0] + self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) + return monitor_process + + def connect_to_debug_monitor(self, launch_args): + monitor_process = self.launch_debug_monitor(launch_args) + self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) + + def respond(self, packet): + """Subclasses can override this to change how packets are handled.""" + return self.pass_through(packet) + + def pass_through(self, packet): + self.logger.info(f"Sending packet {packet}") + self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.logger.info(f"Received reply {reply}") + return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py new file mode 100644 index 00000000000000..0f02fdffbdeada --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbreverse.py @@ -0,0 +1,418 @@ +import os +import os.path +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbproxy import * +import lldbgdbserverutils +import re + + +class ThreadSnapshot: + def __init__(self, thread_id, registers): + self.thread_id = thread_id + self.registers 
= registers + + +class MemoryBlockSnapshot: + def __init__(self, address, data): + self.address = address + self.data = data + + +class StateSnapshot: + def __init__(self, thread_snapshots, memory): + self.thread_snapshots = thread_snapshots + self.memory = memory + self.thread_id = None + + +class RegisterInfo: + def __init__(self, lldb_index, bitsize, little_endian): + self.lldb_index = lldb_index + self.bitsize = bitsize + self.little_endian = little_endian + + +BELOW_STACK_POINTER = 16384 +ABOVE_STACK_POINTER = 4096 + +BLOCK_SIZE = 1024 + +SOFTWARE_BREAKPOINTS = 0 +HARDWARE_BREAKPOINTS = 1 +WRITE_WATCHPOINTS = 2 + + +class ReverseTestBase(GDBProxyTestBase): + """ + Base class for tests that need reverse execution. + + This class uses a gdbserver proxy to add very limited reverse- + execution capability to lldb-server/debugserver for testing + purposes only. + + To use this class, run the inferior forward until some stopping point. + Then call `start_recording()` and execute forward again until reaching + a software breakpoint; this class records the state before each execution executes. + At that point, the server will accept "bc" and "bs" packets to step + backwards through the state. + When executing during recording, we only allow single-step and continue without + delivering a signal, and only software breakpoint stops are allowed. + + We assume that while recording is enabled, the only effects of instructions + are on general-purpose registers (read/written by the 'g' and 'G' packets) + and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER). + """ + + """ + A list of StateSnapshots in time order. + + There is one snapshot per single-stepped instruction, + representing the state before that instruction was + executed. The last snapshot in the list is the + snapshot before the last instruction was executed. + This is an undo log; we snapshot a superset of the state that may have + been changed by the instruction's execution. 
+ """ + snapshots = None + recording_enabled = False + + breakpoints = None + + pid = None + + pc_register_info = None + sp_register_info = None + general_purpose_register_info = None + + def __init__(self, *args, **kwargs): + GDBProxyTestBase.__init__(self, *args, **kwargs) + self.breakpoints = [set(), set(), set(), set(), set()] + + def respond(self, packet): + if not packet: + raise ValueError("Invalid empty packet") + if packet == self.server.PACKET_INTERRUPT: + # Don't send a response. We'll just run to completion. + return [] + if self.is_command(packet, "qSupported", ":"): + reply = self.pass_through(packet) + return reply + ";ReverseStep+;ReverseContinue+" + if self.is_command(packet, "vCont", ";"): + if self.recording_enabled: + return self.continue_with_recording(packet) + snapshots = [] + if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S": + raise ValueError("LLDB should not be sending old-style continuation packets") + if packet == "bc": + return self.reverse_continue() + if packet == "bs": + return self.reverse_step() + if packet == 'jThreadsInfo': + # Suppress this because it contains thread stop reasons which we might + # need to modify, and we don't want to have to implement that. + return "" + if packet[0] == "z" or packet[0] == "Z": + reply = self.pass_through(packet) + if reply == "OK": + self.update_breakpoints(packet) + return reply + return GDBProxyTestBase.respond(self, packet) + + def start_recording(self): + self.recording_enabled = True + self.snapshots = [] + + def stop_recording(self): + """ + Don't record when executing foward. + + Reverse execution is still supported until the next forward continue. 
+ """ + self.recording_enabled = False + + def is_command(self, packet, cmd, follow_token): + return packet == cmd or packet[0:len(cmd) + 1] == cmd + follow_token + + def update_breakpoints(self, packet): + m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet) + if m is None: + raise ValueError("Invalid breakpoint packet: " + packet) + t = int(m.group(2)) + addr = int(m.group(3), 16) + kind = int(m.group(4), 16) + if m.group(1) == 'Z': + self.breakpoints[t].add((addr, kind)) + else: + self.breakpoints[t].discard((addr, kind)) + + def breakpoint_triggered_at(self, pc): + if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]): + return True + if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]): + return True + return False + + def watchpoint_triggered(self, new_value_block, current_contents): + """Returns the address or None.""" + for watch_addr, kind in breakpoints[WRITE_WATCHPOINTS]: + for offset in range(0, kind): + addr = watch_addr + offset + if (addr >= new_value_block.address and + addr < new_value_block.address + len(new_value_block.data)): + index = addr - new_value_block.address + if new_value_block.data[index*2:(index + 1)*2] != current_contents[index*2:(index + 1)*2]: + return watch_addr + return None + + def continue_with_recording(self, packet): + self.logger.debug("Continue with recording enabled") + + step_packet = "vCont;s" + if packet == "vCont": + requested_step = False + else: + m = re.match("vCont;(c|s)(.*)", packet) + if m is None: + raise ValueError("Unsupported vCont packet: " + packet) + requested_step = m.group(1) == 's' + step_packet += m.group(2) + + while True: + snapshot = self.capture_snapshot() + reply = self.pass_through(step_packet) + (stop_signal, stop_pairs) = self.parse_stop(reply) + if stop_signal != 5: + raise ValueError("Unexpected stop signal: " + reply) + is_swbreak = False + thread_id = None + for key, value in stop_pairs.items(): + if key == "thread": + thread_id = 
self.parse_thread_id(value) + continue + if re.match('[0-9a-f]+', key): + continue + if key == "swbreak" or (key == "reason" and value == "breakpoint"): + is_swbreak = True + continue + if key in ["name", "threads", "thread-pcs", "reason"]: + continue + raise ValueError(f"Unknown stop key '{key}' in {reply}") + if is_swbreak: + self.logger.debug("Recording stopped") + return reply + if thread_id is None: + return ValueError("Expected thread ID: " + reply) + snapshot.thread_id = thread_id + self.snapshots.append(snapshot) + if requested_step: + self.logger.debug("Recording stopped for step") + return reply + + def parse_stop(self, reply): + result = {} + if not reply: + raise ValueError("Invalid empty packet") + if reply[0] == "T" and len(reply) >= 3: + result = {k:v for k, v in self.parse_pairs(reply[3:])} + return (int(reply[1:3], 16), result) + raise "Unsupported stop reply: " + reply + + def parse_pairs(self, text): + for pair in text.split(";"): + if not pair: + continue + m = re.match("([^:]+):(.*)", pair) + if m is None: + raise ValueError("Invalid pair text: " + text) + yield (m.group(1), m.group(2)) + + def capture_snapshot(self): + """Snapshot all threads and their stack memories.""" + self.ensure_register_info() + current_thread = self.get_current_thread() + thread_snapshots = [] + memory = [] + for thread_id in self.get_thread_list(): + registers = {} + for index in sorted(self.general_purpose_register_info.keys()): + reply = self.pass_through(f"p{index:x};thread:{thread_id:x};") + if reply == "" or reply[0] == 'E': + raise ValueError("Can't read register") + registers[index] = reply + thread_snapshot = ThreadSnapshot(thread_id, registers) + thread_sp = self.get_register(self.sp_register_info, thread_snapshot.registers) + memory += self.read_memory(thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER) + thread_snapshots.append(thread_snapshot) + self.set_current_thread(current_thread) + return StateSnapshot(thread_snapshots, memory) + + def 
restore_snapshot(self, snapshot): + """ + Restore the snapshot during reverse execution. + + If this triggers a breakpoint or watchpoint, return the stop reply, + otherwise None. + """ + current_thread = self.get_current_thread() + stop_reasons = [] + for thread_snapshot in snapshot.thread_snapshots: + thread_id = thread_snapshot.thread_id + for lldb_index in sorted(thread_snapshot.registers.keys()): + data = thread_snapshot.registers[lldb_index] + reply = self.pass_through(f"P{lldb_index:x}={data};thread:{thread_id:x};") + if reply != "OK": + raise ValueError("Can't restore thread register") + if thread_id == snapshot.thread_id: + new_pc = self.get_register(self.pc_register_info, thread_snapshot.registers) + if self.breakpoint_triggered_at(new_pc): + stop_reasons.append([("reason", "breakpoint")]) + self.set_current_thread(current_thread) + for block in snapshot.memory: + current_memory = self.pass_through(f"m{block.address:x},{(len(block.data)/2):x}") + if not current_memory or current_memory[0] == 'E': + raise ValueError("Can't read back memory") + reply = self.pass_through(f"M{block.address:x},{len(block.data)/2:x}:" + block.data) + if reply != "OK": + raise ValueError("Can't restore memory") + watch_addr = self.watchpoint_triggered(block, current_memory[1:]) + if watch_addr is not None: + stop_reasons.append([("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]) + if stop_reasons: + pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0]) + return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};" + return None + + def reverse_step(self): + if not self.snapshots: + self.logger.debug("Reverse-step at history boundary") + return self.history_boundary_reply(self.get_current_thread()) + self.logger.debug("Reverse-step started") + snapshot = self.snapshots.pop() + stop_reply = self.restore_snapshot(snapshot) + self.set_current_thread(snapshot.thread_id) + self.logger.debug("Reverse-step stopped") + if stop_reply is None: + return 
self.singlestep_stop_reply(snapshot.thread_id) + return stop_reply + + def reverse_continue(self): + self.logger.debug("Reverse-continue started") + thread_id = None + while self.snapshots: + snapshot = self.snapshots.pop() + stop_reply = self.restore_snapshot(snapshot) + thread_id = snapshot.thread_id + if stop_reply is not None: + self.set_current_thread(thread_id) + self.logger.debug("Reverse-continue stopped") + return stop_reply + if thread_id is None: + thread_id = self.get_current_thread() + else: + self.set_current_thread(snapshot.thread_id) + self.logger.debug("Reverse-continue stopped at history boundary") + return self.history_boundary_reply(thread_id) + + def get_current_thread(self): + reply = self.pass_through("qC") + return self.parse_thread_id(reply[2:]) + + def parse_thread_id(self, thread_id): + m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id) + if m is None: + raise ValueError("Invalid thread ID: " + thread_id) + if self.pid is None: + self.pid = int(m.group(2), 16) + return int(m.group(3), 16) + + def history_boundary_reply(self, thread_id): + return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;" + + def singlestep_stop_reply(self, thread_id): + return f"T05thread:{self.pid:x}.{thread_id:x};" + + def set_current_thread(self, thread_id): + """ + Set current thread in inner gdbserver. 
+ """ + if thread_id >= 0: + self.pass_through(f"Hg{self.pid:x}.{thread_id:x}") + self.pass_through(f"Hc{self.pid:x}.{thread_id:x}") + else: + self.pass_through(f"Hc-1.-1") + self.pass_through(f"Hg-1.-1") + + def get_register(self, register_info, registers): + if register_info.bitsize % 8 != 0: + raise ValueError("Register size must be a multiple of 8 bits") + if register_info.lldb_index not in registers: + raise ValueError("Register value not captured") + data = registers[register_info.lldb_index] + num_bytes = register_info.bitsize//8 + bytes = [] + for i in range(0, num_bytes): + bytes.append(int(data[i*2:(i + 1)*2], 16)) + if register_info.little_endian: + bytes.reverse() + result = 0 + for byte in bytes: + result = (result << 8) + byte + return result + + def read_memory(self, start_addr, end_addr): + """ + Read a region of memory from the target. + + Some of the addresses may extend into invalid virtual memory; + skip those areas. + Return a list of blocks containing the valid area(s) in the + requested range. 
+ """ + regions = [] + start_addr = start_addr & (BLOCK_SIZE - 1) + end_addr = (end_addr + BLOCK_SIZE - 1) & (BLOCK_SIZE - 1) + for addr in range(start_addr, end_addr, BLOCK_SIZE): + reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}") + if reply and reply[0] != 'E': + block = MemoryBlockSnapshot(addr, reply[1:]) + regions.append(block) + return regions + + def ensure_register_info(self): + if self.general_purpose_register_info is not None: + return + reply = self.pass_through("qHostInfo") + little_endian = any(kv == ("endian", "little") for kv in self.parse_pairs(reply)) + self.general_purpose_register_info = {} + lldb_index = 0 + while True: + reply = self.pass_through(f"qRegisterInfo{lldb_index:x}") + if not reply or reply[0] == 'E': + break + info = {k:v for k, v in self.parse_pairs(reply)} + reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian) + if info["set"] == "General Purpose Registers" and not "container-regs" in info: + self.general_purpose_register_info[lldb_index] = reg_info + if "generic" in info: + if info["generic"] == "pc": + self.pc_register_info = reg_info + elif info["generic"] == "sp": + self.sp_register_info = reg_info + lldb_index += 1 + if self.pc_register_info is None or self.sp_register_info is None: + raise ValueError("Can't find generic pc or sp register") + + def get_thread_list(self): + threads = [] + reply = self.pass_through("qfThreadInfo") + while True: + if not reply: + raise ValueError("Missing reply packet") + if reply[0] == 'm': + for id in reply[1:].split(","): + threads.append(self.parse_thread_id(id)) + elif reply[0] == 'l': + return threads + reply = self.pass_through("qsThreadInfo") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 8884ef5933ada8..7cc1ac9749ec93 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -143,6 +143,8 @@ STOPPED_DUE_TO_WATCHPOINT = "Process 
should be stopped due to watchpoint" +STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" + DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 9773144723c34c..07780f9f9c8393 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,6 +564,10 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { + return Continue(RunDirection::eRunForward); +} + +SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -574,9 +578,9 @@ SBError SBProcess::Continue() { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(); + sb_error.ref() = process_sp->Resume(direction); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index a99456e06d0329..aca8a039952960 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,6 +172,7 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -233,6 +234,7 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 8d3a82ef6c990a..ea60492ac46a10 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,7 +2553,8 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || + reason == eStopReasonHistoryBoundary) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index de047ee214c11e..b0aa664775b463 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,6 +82,9 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; + case eStopReasonHistoryBoundary: + log.Printf("%s: %s history boundary", __FUNCTION__, header); + return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 9b2907c6809965..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,9 +402,16 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume() { +Status ProcessKDP::DoResume(RunDirection direction) { Status error; Log *log = GetLog(KDPLog::Process); + + if 
(direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index e5ec5914f9600d..1b71d83f70b087 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume() override; + lldb_private::Status DoResume(lldb::RunDirection direction) override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 703aa082f0476f..76b7095deaa503 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,11 +204,17 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume() { +Status ProcessWindows::DoResume(RunDirection direction) { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; + if (direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index e97cfb790248be..97284b7cd1436e 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index e42526c8fd7266..fc792a4409410b 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,6 +199,20 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } +bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { + if (m_supports_reverse_continue == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_continue == eLazyBoolYes; +} + +bool GDBRemoteCommunicationClient::GetReverseStepSupported() { + if (m_supports_reverse_step == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_step == eLazyBoolYes; +} + bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -295,6 +309,8 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; + m_supports_reverse_continue = 
eLazyBoolCalculate; + m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -348,6 +364,8 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; + m_supports_reverse_continue = eLazyBoolNo; + m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -401,6 +419,10 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; + else if (x == "ReverseContinue+") + m_supports_reverse_continue = eLazyBoolYes; + else if (x == "ReverseStep+") + m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 898d176abc3465..116b47c1edf033 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,6 +331,10 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); + bool GetReverseContinueSupported(); + + bool GetReverseStepSupported(); + LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -561,6 +565,8 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; + LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; + LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 35fa93e53bc66f..4016cde74ebea8 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,6 +716,7 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3e09c316d74f44..3fc03bd05d5df0 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,6 +169,10 @@ class PluginProperties : public Properties { } }; +std::chrono::seconds ResumeTimeout() { + return std::chrono::seconds(5); +} + } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1180,10 +1184,11 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume() { +Status ProcessGDBRemote::DoResume(RunDirection direction) { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", + direction == RunDirection::eRunForward ? 
"" : "reverse"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1197,12 +1202,21 @@ Status ProcessGDBRemote::DoResume() { StreamString continue_packet; bool continue_packet_error = false; - if (m_gdb_comm.HasAnyVContSupport()) { + // Number of threads continuing with "c", i.e. continuing without a signal to deliver. + const size_t num_continue_c_tids = m_continue_c_tids.size(); + // Number of threads continuing with "C", i.e. continuing with a signal to deliver. + const size_t num_continue_C_tids = m_continue_C_tids.size(); + // Number of threads continuing with "s", i.e. single-stepping. + const size_t num_continue_s_tids = m_continue_s_tids.size(); + // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. + const size_t num_continue_S_tids = m_continue_S_tids.size(); + if (direction == RunDirection::eRunForward && + m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (m_continue_c_tids.size() == num_threads || + if (num_continue_c_tids == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1265,14 +1279,11 @@ Status ProcessGDBRemote::DoResume() { } else continue_packet_error = true; - if (continue_packet_error) { + if (direction == RunDirection::eRunForward && continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server. 
We need to try and - // make a simple packet that can do our continue - const size_t num_continue_c_tids = m_continue_c_tids.size(); - const size_t num_continue_C_tids = m_continue_C_tids.size(); - const size_t num_continue_s_tids = m_continue_s_tids.size(); - const size_t num_continue_S_tids = m_continue_S_tids.size(); + // that wasn't supported by the remote GDB server, or it's the reverse + // direction. We need to try and make a simple packet that can do our + // continue. if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1363,9 +1374,41 @@ Status ProcessGDBRemote::DoResume() { } } + if (direction == RunDirection::eRunReverse && continue_packet_error) { + if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); + return Status::FromErrorString("can't deliver signals while running in reverse"); + } + + if (num_continue_s_tids > 0) { + if (num_continue_s_tids > 1) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); + return Status::FromErrorString("can't step multiple threads while reverse-stepping"); + } + + if (!m_gdb_comm.GetReverseStepSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); + return Status::FromErrorString("target does not support reverse-stepping"); + } + + m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); + continue_packet.PutCString("bs"); + } else { + if (!m_gdb_comm.GetReverseContinueSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); + return Status::FromErrorString("target does not support reverse-continue"); + } + + // All threads continue whether requested or not --- + // we can't change how threads ran in the past. 
+ continue_packet.PutCString("bc"); + } + + continue_packet_error = false; + } + if (continue_packet_error) { - error = - Status::FromErrorString("can't make continue packet for this resume"); + return Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1380,7 +1423,7 @@ Status ProcessGDBRemote::DoResume() { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { + if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1863,6 +1906,10 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; + } else if (reason == "replaylog") { + thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( + *thread_sp, description.c_str())); + handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2318,6 +2365,8 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; + } else if (key.compare("replaylog") == 0) { + reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 2492795851388a..fa3e1cec76e2b3 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote : public Process, // 
Process Control Status WillResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index d2111ce877ce55..304c12173dd35d 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,10 +182,15 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume() { +Status ScriptedProcess::DoResume(RunDirection direction) { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - return GetInterface().Resume(); + if (direction == RunDirection::eRunForward) { + return GetInterface().Resume(); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 0335364b4010b2..8ebe4ca5f3d449 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index aca08972811470..ff6a2f59eba35f 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,7 +446,8 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), m_allocated_memory_cache(*this), 
m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_last_run_direction(eRunForward), + m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -845,6 +846,7 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: + case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1352,7 +1354,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume() { +Status Process::Resume(RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1361,7 +1363,7 @@ Status Process::Resume() { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1369,7 +1371,7 @@ Status Process::Resume() { return error; } -Status Process::ResumeSynchronous(Stream *stream) { +Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1382,7 +1384,7 @@ Status Process::ResumeSynchronous(Stream *stream) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (error.Success()) { StateType state = 
WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3239,7 +3241,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume() { +Status Process::PrivateResume(RunDirection direction) { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3255,6 +3257,15 @@ Status Process::PrivateResume() { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); + if (m_last_run_direction != direction) { + // In the future we might want to support mixed-direction plans, + // e.g. a forward step-over stops at a breakpoint, the user does + // a reverse-step, then disables the breakpoint and continues forward. + // This code will need to be changed to support that. + m_thread_list.DiscardThreadPlans(); + m_last_run_direction = direction; + } + Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3272,7 +3283,7 @@ Status Process::PrivateResume() { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(); + error = DoResume(direction); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3735,7 +3746,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(); + PrivateResume(m_last_run_direction); } } else { return_value = true; @@ -4346,7 +4357,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(); + process_sp->PrivateResume(process_sp->m_last_run_direction); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index bd7032b803df90..08e9a7c099bad2 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,6 +1212,30 @@ class StopInfoProcessorTrace : public StopInfo { } }; +// StopInfoHistoryBoundary + +class StopInfoHistoryBoundary : public StopInfo { +public: + StopInfoHistoryBoundary(Thread &thread, const char *description) + : StopInfo(thread, LLDB_INVALID_UID) { + if (description) + SetDescription(description); + } + + ~StopInfoHistoryBoundary() override = default; + + StopReason GetStopReason() const override { + return eStopReasonHistoryBoundary; + } + + const char *GetDescription() override { + if (m_description.empty()) + return "history boundary"; + else + return m_description.c_str(); + } +}; + // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1439,6 +1463,11 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } +StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, + const char *description) { + return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); +} + StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 902fbb2b519ef7..bbb586f033b746 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,10 +624,12 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - if (reg_ctx_sp) { + ProcessSP process_sp(GetProcess()); + if (reg_ctx_sp && process_sp && + process_sp->GetLastRunDirection() == eRunForward) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); + process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1732,6 +1734,8 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: + return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py new file mode 100644 index 00000000000000..b37578fbd82468 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -0,0 +1,115 @@ +import lldb +import time +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbreverse import ReverseTestBase +from lldbsuite.test import lldbutil + + +class TestReverseContinueBreakpoints(ReverseTestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue(self): + self.reverse_continue_internal(async_mode=False) + + def 
test_reverse_continue_async(self): + self.reverse_continue_internal(async_mode=True) + + def reverse_continue_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue. We'll stop at the point where we started recording. + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + # Continue forward normally until the target exits. + status = process.Continue() + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) + + def test_reverse_continue_breakpoint(self): + self.reverse_continue_breakpoint_internal(async_mode=False) + + def test_reverse_continue_breakpoint_async(self): + self.reverse_continue_breakpoint_internal(async_mode=True) + + def reverse_continue_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue to the function "trigger_breakpoint". 
+ trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) + self.assertEqual(threads_now, initial_threads) + + def test_reverse_continue_skip_breakpoint(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=False) + + def test_reverse_continue_skip_breakpoint_async(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=True) + + def reverse_continue_skip_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue over a breakpoint at "trigger_breakpoint" whose + # condition is false. + # This tests that we continue in the correct direction after hitting + # the breakpoint. + trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + trigger_bkpt.SetCondition("false_condition") + status = process.Continue(lldb.eRunReverse) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.assertSuccess(status) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + def setup_recording(self, async_mode): + """ + Record execution of code between "start_recording" and "stop_recording" breakpoints. + + Returns with the target stopped at "stop_recording", with recording disabled, + ready to reverse-execute. + """ + self.build() + target = self.dbg.CreateTarget("") + process = self.connect(target) + + # Record execution from the start of the function "start_recording" + # to the start of the function "stop_recording". We want to keep the + # interval that we record as small as possible to minimize the run-time + # of our single-stepping recorder. 
+ start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) + initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) + self.assertEqual(len(initial_threads), 1) + target.BreakpointDelete(start_recording_bkpt.GetID()) + self.start_recording() + stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) + lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) + target.BreakpointDelete(stop_recording_bkpt.GetID()) + self.stop_recording() + + self.dbg.SetAsync(async_mode) + self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) + + return target, process, initial_threads + + def expect_async_state_changes(self, async_mode, process, states): + if not async_mode: + return + listener = self.dbg.GetListener() + lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py new file mode 100644 index 00000000000000..d610761b8cb0bc --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -0,0 +1,30 @@ +import lldb +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestReverseContinueNotSupported(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue_not_supported(self): + self.build() + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + main_bkpt = target.BreakpointCreateByName("main", None) + self.assertTrue(main_bkpt, VALID_BREAKPOINT) + + process = target.LaunchSimple(None, None, self.get_process_working_directory()) + self.assertTrue(process, PROCESS_IS_VALID) + + # This will fail gracefully. 
+ status = process.Continue(lldb.eRunReverse) + self.assertFailure(status, "target does not support reverse-continue") + + status = process.Continue() + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c new file mode 100644 index 00000000000000..40e45dc9f5c317 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/main.c @@ -0,0 +1,14 @@ +volatile int false_condition = 0; + +static void start_recording() {} + +static void trigger_breakpoint() {} + +static void stop_recording() {} + +int main() { + start_recording(); + trigger_breakpoint(); + stop_recording(); + return 0; +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 558f889c4b7f23..211fd34957f496 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,6 +1045,9 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; + case lldb::eStopReasonHistoryBoundary: + body.try_emplace("reason", "history boundary"); + break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index b38833c0fdb6b6..1c5e3ac7008727 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,6 +111,7 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: + case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From f0ed31ce4b63a5530fd1de875c0d1467d4d2c6ea Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Thu, 10 Oct 2024 
16:02:13 -0400 Subject: [PATCH 073/177] [llvm][PGOCtxProfLowering] Avoid Type::getPointerTo() (NFC) (#111857) `Type::getPointerTo()` is to be deprecated & removed soon. --- .../Instrumentation/PGOCtxProfLowering.cpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index b620306628729b..e7b7c26c493e50 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -154,15 +154,15 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, StartCtx = cast( M.getOrInsertFunction( CompilerRtAPINames::StartCtx, - FunctionType::get(ContextNodeTy->getPointerTo(), - {ContextRootTy->getPointerTo(), /*ContextRoot*/ + FunctionType::get(PointerTy, + {PointerTy, /*ContextRoot*/ I64Ty, /*Guid*/ I32Ty, /*NumCounters*/ I32Ty /*NumCallsites*/}, false)) .getCallee()); GetCtx = cast( M.getOrInsertFunction(CompilerRtAPINames::GetCtx, - FunctionType::get(ContextNodeTy->getPointerTo(), + FunctionType::get(PointerTy, {PointerTy, /*Callee*/ I64Ty, /*Guid*/ I32Ty, /*NumCounters*/ @@ -170,13 +170,12 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, false)) .getCallee()); ReleaseCtx = cast( - M.getOrInsertFunction( - CompilerRtAPINames::ReleaseCtx, - FunctionType::get(Type::getVoidTy(M.getContext()), - { - ContextRootTy->getPointerTo(), /*ContextRoot*/ - }, - false)) + M.getOrInsertFunction(CompilerRtAPINames::ReleaseCtx, + FunctionType::get(Type::getVoidTy(M.getContext()), + { + PointerTy, /*ContextRoot*/ + }, + false)) .getCallee()); // Declare the TLSes we will need to use. @@ -264,7 +263,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { auto *Index = Builder.CreateAnd(CtxAsInt, Builder.getInt64(1)); // The GEPs corresponding to that index, in the respective TLS. 
ExpectedCalleeTLSAddr = Builder.CreateGEP( - Builder.getInt8Ty()->getPointerTo(), + PointerType::getUnqual(F.getContext()), Builder.CreateThreadLocalAddress(ExpectedCalleeTLS), {Index}); CallsiteInfoTLSAddr = Builder.CreateGEP( Builder.getInt32Ty(), @@ -277,7 +276,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { // with counters) stays the same. RealContext = Builder.CreateIntToPtr( Builder.CreateAnd(CtxAsInt, Builder.getInt64(-2)), - ThisContextType->getPointerTo()); + PointerType::getUnqual(F.getContext())); I.eraseFromParent(); break; } From 942fefe74112acb68fa43dde44abe3ae125457e1 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 11:23:11 -0700 Subject: [PATCH 074/177] [NFC][sanitizer] Reopen '/proc/%d/task' instead of seek NFC because I am not aware of any particular issue from seek, but reopen looks less error prone. Pull Request: https://github.com/llvm/llvm-project/pull/111899 --- .../lib/sanitizer_common/sanitizer_linux.cpp | 30 +++++++------------ .../lib/sanitizer_common/sanitizer_linux.h | 5 ++-- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index d421d117e67274..70fd9405e5454f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1025,21 +1025,19 @@ bool internal_sigismember(__sanitizer_sigset_t *set, int signum) { # if !SANITIZER_NETBSD // ThreadLister implementation. 
-ThreadLister::ThreadLister(pid_t pid) : pid_(pid), buffer_(4096) { - char task_directory_path[80]; - internal_snprintf(task_directory_path, sizeof(task_directory_path), - "/proc/%d/task/", pid); - descriptor_ = internal_open(task_directory_path, O_RDONLY | O_DIRECTORY); - if (internal_iserror(descriptor_)) { - Report("Can't open /proc/%d/task for reading.\n", pid); - } +ThreadLister::ThreadLister(pid_t pid) : buffer_(4096) { + task_path_.AppendF("/proc/%d/task", pid); + status_path_.AppendF("%s/status", task_path_.data()); } ThreadLister::Result ThreadLister::ListThreads( InternalMmapVector *threads) { - if (internal_iserror(descriptor_)) + int descriptor = internal_open(task_path_.data(), O_RDONLY | O_DIRECTORY); + if (internal_iserror(descriptor)) { + Report("Can't open %s for reading.\n", task_path_.data()); return Error; - internal_lseek(descriptor_, 0, SEEK_SET); + } + auto acts_cleanup = at_scope_exit([&] { internal_close(descriptor); }); threads->clear(); Result result = Ok; @@ -1048,11 +1046,11 @@ ThreadLister::Result ThreadLister::ListThreads( buffer_.resize(buffer_.capacity()); CHECK_GE(buffer_.size(), 4096); uptr read = internal_getdents( - descriptor_, (struct linux_dirent *)buffer_.data(), buffer_.size()); + descriptor, (struct linux_dirent *)buffer_.data(), buffer_.size()); if (!read) return result; if (internal_iserror(read)) { - Report("Can't read directory entries from /proc/%d/task.\n", pid_); + Report("Can't read directory entries from %s.\n", task_path_.data()); return Error; } @@ -1093,9 +1091,7 @@ ThreadLister::Result ThreadLister::ListThreads( bool ThreadLister::IsAlive(int tid) { // /proc/%d/task/%d/status uses same call to detect alive threads as // proc_task_readdir. See task_state implementation in Linux. 
- char path[80]; - internal_snprintf(path, sizeof(path), "/proc/%d/task/%d/status", pid_, tid); - if (!ReadFileToVector(path, &buffer_) || buffer_.empty()) + if (!ReadFileToVector(status_path_.data(), &buffer_) || buffer_.empty()) return false; buffer_.push_back(0); static const char kPrefix[] = "\nPPid:"; @@ -1106,10 +1102,6 @@ bool ThreadLister::IsAlive(int tid) { return (int)internal_atoll(field) != 0; } -ThreadLister::~ThreadLister() { - if (!internal_iserror(descriptor_)) - internal_close(descriptor_); -} # endif # if SANITIZER_WORDSIZE == 32 diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index c30f0326793d5a..96c617822b5b27 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -97,7 +97,6 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg); class ThreadLister { public: explicit ThreadLister(pid_t pid); - ~ThreadLister(); enum Result { Error, Incomplete, @@ -108,8 +107,8 @@ class ThreadLister { private: bool IsAlive(int tid); - pid_t pid_; - int descriptor_ = -1; + InternalScopedString task_path_; + InternalScopedString status_path_; InternalMmapVector buffer_; }; From 69b0b7e7ac3adc42df517c25ed7017b5af9be9f1 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 13:11:08 -0700 Subject: [PATCH 075/177] [lldb] Return an llvm::Error from GetFrameBaseValue (#111882) This fixes the following assertion: "Cannot create Expected from Error success value." The problem was that GetFrameBaseValue return false without updating the Status argument. This patch eliminates the opportunity for mistakes by returning an llvm:Error. 
--- lldb/include/lldb/Target/StackFrame.h | 10 +++------- lldb/source/Expression/DWARFExpression.cpp | 14 ++++++-------- lldb/source/Target/StackFrame.cpp | 13 ++++++------- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index 5cc0fccee03b8f..fdbe1f567eabfa 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -195,14 +195,10 @@ class StackFrame : public ExecutionContextScope, /// \param [out] value /// The address of the CFA for this frame, if available. /// - /// \param [out] error_ptr - /// If there is an error determining the CFA address, this may contain a - /// string explaining the failure. - /// /// \return - /// Returns true if the CFA value was successfully set in value. Some - /// frames may be unable to provide this value; they will return false. - bool GetFrameBaseValue(Scalar &value, Status *error_ptr); + /// If there is an error determining the CFA address, return an error + /// explaining the failure. Success otherwise. + llvm::Error GetFrameBaseValue(Scalar &value); /// Get the DWARFExpressionList corresponding to the Canonical Frame Address. 
/// diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 22d899f799d0fd..97bcd4f7eec26f 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -1780,14 +1780,12 @@ llvm::Expected DWARFExpression::Evaluate( if (exe_ctx) { if (frame) { Scalar value; - Status fb_err; - if (frame->GetFrameBaseValue(value, &fb_err)) { - int64_t fbreg_offset = opcodes.GetSLEB128(&offset); - value += fbreg_offset; - stack.push_back(value); - stack.back().SetValueType(Value::ValueType::LoadAddress); - } else - return fb_err.ToError(); + if (llvm::Error err = frame->GetFrameBaseValue(value)) + return err; + int64_t fbreg_offset = opcodes.GetSLEB128(&offset); + value += fbreg_offset; + stack.push_back(value); + stack.back().SetValueType(Value::ValueType::LoadAddress); } else { return llvm::createStringError( "invalid stack frame in context for DW_OP_fbreg opcode"); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index fe0d4c93c50627..ed493e35316137 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1079,12 +1079,12 @@ ValueObjectSP StackFrame::GetValueForVariableExpressionPath( return valobj_sp; } -bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { +llvm::Error StackFrame::GetFrameBaseValue(Scalar &frame_base) { std::lock_guard guard(m_mutex); if (!m_cfa_is_valid) { m_frame_base_error = Status::FromErrorString( "No frame base available for this historical stack frame."); - return false; + return m_frame_base_error.ToError(); } if (m_flags.IsClear(GOT_FRAME_BASE)) { @@ -1113,12 +1113,11 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { } } - if (m_frame_base_error.Success()) - frame_base = m_frame_base; + if (m_frame_base_error.Fail()) + return m_frame_base_error.ToError(); - if (error_ptr) - *error_ptr = m_frame_base_error.Clone(); - return 
m_frame_base_error.Success(); + frame_base = m_frame_base; + return llvm::Error::success(); } DWARFExpressionList *StackFrame::GetFrameBaseExpression(Status *error_ptr) { From b3554265f24aa570bbc8693af8420a306c459f94 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Thu, 10 Oct 2024 13:11:46 -0700 Subject: [PATCH 076/177] [lldb] Add include for SBLanguages in lldb-enumerations (#111907) This adds an include for SBLanguages.h in lldb-enumerations.h so that files that need this enum do not have to explicitly include SBLanguages. --- lldb/include/lldb/lldb-enumerations.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 232d1dfdb5c9d0..217cd7f65cc1c4 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -12,6 +12,8 @@ #include #include +#include + #ifndef SWIG // Macro to enable bitmask operations on an enum. Without this, Enum | Enum // gets promoted to an int, so you have to say Enum a = Enum(eFoo | eBar). If From 36bd9aebc428413a94f77e8daa679d1937dc2b63 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 13:12:36 -0700 Subject: [PATCH 077/177] [sanitizer] VReport BeforeFork/AfterFork (#111900) Forks are common suspects for unusual sanitizer behavior. It can be handy to see them without rebuild. 
--- compiler-rt/lib/asan/asan_posix.cpp | 2 ++ compiler-rt/lib/dfsan/dfsan_custom.cpp | 2 ++ compiler-rt/lib/hwasan/hwasan_linux.cpp | 2 ++ compiler-rt/lib/lsan/lsan_posix.cpp | 2 ++ compiler-rt/lib/msan/msan_linux.cpp | 2 ++ compiler-rt/lib/tsan/rtl/tsan_rtl.cpp | 2 ++ 6 files changed, 12 insertions(+) diff --git a/compiler-rt/lib/asan/asan_posix.cpp b/compiler-rt/lib/asan/asan_posix.cpp index 4ee8d7d399e95c..39685696a0d0dd 100644 --- a/compiler-rt/lib/asan/asan_posix.cpp +++ b/compiler-rt/lib/asan/asan_posix.cpp @@ -149,6 +149,7 @@ void PlatformTSDDtor(void *tsd) { # endif static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); if (CAN_SANITIZE_LEAKS) { __lsan::LockGlobal(); } @@ -168,6 +169,7 @@ static void AfterFork(bool fork_child) { if (CAN_SANITIZE_LEAKS) { __lsan::UnlockGlobal(); } + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void InstallAtForkHandler() { diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index 03147a79ed6543..dbc00d7ac3ea39 100644 --- a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -2859,6 +2859,7 @@ WRAPPER_ALIAS(__isoc99_sscanf, sscanf) WRAPPER_ALIAS(__isoc23_sscanf, sscanf) static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); StackDepotLockBeforeFork(); ChainedOriginDepotLockBeforeFork(); } @@ -2866,6 +2867,7 @@ static void BeforeFork() { static void AfterFork(bool fork_child) { ChainedOriginDepotUnlockAfterFork(fork_child); StackDepotUnlockAfterFork(fork_child); + VReport(2, "AfterFork tid: %llu\n", GetTid()); } SANITIZER_INTERFACE_ATTRIBUTE diff --git a/compiler-rt/lib/hwasan/hwasan_linux.cpp b/compiler-rt/lib/hwasan/hwasan_linux.cpp index d174fb882ca483..68651d3d39d03e 100644 --- a/compiler-rt/lib/hwasan/hwasan_linux.cpp +++ b/compiler-rt/lib/hwasan/hwasan_linux.cpp @@ -528,6 +528,7 @@ uptr TagMemoryAligned(uptr p, uptr size, tag_t tag) { } static void BeforeFork() { + VReport(2, "BeforeFork tid: 
%llu\n", GetTid()); if (CAN_SANITIZE_LEAKS) { __lsan::LockGlobal(); } @@ -547,6 +548,7 @@ static void AfterFork(bool fork_child) { if (CAN_SANITIZE_LEAKS) { __lsan::UnlockGlobal(); } + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void HwasanInstallAtForkHandler() { diff --git a/compiler-rt/lib/lsan/lsan_posix.cpp b/compiler-rt/lib/lsan/lsan_posix.cpp index ddd9fee07e89d2..593000b9eef991 100644 --- a/compiler-rt/lib/lsan/lsan_posix.cpp +++ b/compiler-rt/lib/lsan/lsan_posix.cpp @@ -97,6 +97,7 @@ void InstallAtExitCheckLeaks() { } static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); LockGlobal(); LockThreads(); LockAllocator(); @@ -108,6 +109,7 @@ static void AfterFork(bool fork_child) { UnlockAllocator(); UnlockThreads(); UnlockGlobal(); + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void InstallAtForkHandler() { diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index 894cf17002bbc0..7140de7e9c5432 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -302,6 +302,7 @@ void MsanTSDDtor(void *tsd) { # endif static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); // Usually we lock ThreadRegistry, but msan does not have one. LockAllocator(); StackDepotLockBeforeFork(); @@ -313,6 +314,7 @@ static void AfterFork(bool fork_child) { StackDepotUnlockAfterFork(fork_child); UnlockAllocator(); // Usually we unlock ThreadRegistry, but msan does not have one. 
+ VReport(2, "AfterFork tid: %llu\n", GetTid()); } void InstallAtForkHandler() { diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp index bf29aa316f6809..5a2d39cd30607f 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp @@ -806,6 +806,7 @@ int Finalize(ThreadState *thr) { #if !SANITIZER_GO void ForkBefore(ThreadState* thr, uptr pc) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); GlobalProcessorLock(); // Detaching from the slot makes OnUserFree skip writing to the shadow. // The slot will be locked so any attempts to use it will deadlock anyway. @@ -847,6 +848,7 @@ static void ForkAfter(ThreadState* thr, SlotAttachAndLock(thr); SlotUnlock(thr); GlobalProcessorUnlock(); + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr, false); } From 86f78c0093100016bcb0299d1b7828c2d30e3a56 Mon Sep 17 00:00:00 2001 From: Alexis Perry-Holby Date: Thu, 10 Oct 2024 14:21:21 -0600 Subject: [PATCH 078/177] [flang] Add a link to the ICS file for the Biweekly Flang Community Call --- flang/docs/GettingInvolved.md | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md index f583d934ff2bfb..a8bd93517709dd 100644 --- a/flang/docs/GettingInvolved.md +++ b/flang/docs/GettingInvolved.md @@ -49,6 +49,7 @@ To understand the status of various developments in Flang please join the respec - If you prefer to join using a meeting number and password, those can be found in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/). Alternative methods of joining, such as call-in numbers, are also available. - Time: Wednesdays, 8:30 a.m. Pacific Time, on the weeks alternating with regular Flang Community Technical Biweekly Call. 
+- Calendar invite: https://drive.google.com/file/d/1rkfWCtIvQFcxN0Uz8YVwQGoX_BbzT8oc/view?usp=drive_link - Meeting minutes are available in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/edit). - Minutes from older meetings were posted on the [Flang forum](https://discourse.llvm.org/c/subprojects/flang). Search for `Flang Biweekly Sync - Notes`. From b77fdf5799be6b29869f2f7969851709e03938ba Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 13:22:56 -0700 Subject: [PATCH 079/177] [lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC) --- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..367fce442bb866 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,8 +407,9 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); + error.FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", + GetPluginName()); return error; } From 69c0067927293bff1401a9a050081e83dbefd282 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 13:25:03 -0700 Subject: [PATCH 080/177] [SandboxVec][DAG] Refactoring: Outline code that looks for mem nodes (#111750) --- .../SandboxVectorizer/DependencyGraph.h | 8 ++++ .../SandboxVectorizer/DependencyGraph.cpp | 42 ++++++++++++++----- .../SandboxVectorizer/DependencyGraphTest.cpp | 14 +++++++ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h 
b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index da50e5326ea069..7d300ea2b60d2d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -231,6 +231,14 @@ class MemDGNode final : public DGNode { /// Convenience builders for a MemDGNode interval. class MemDGNodeIntervalBuilder { public: + /// Scans the instruction chain in \p Intvl top-down, returning the top-most + /// MemDGNode, or nullptr. + static MemDGNode *getTopMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG); + /// Scans the instruction chain in \p Intvl bottom-up, returning the + /// bottom-most MemDGNode, or nullptr. + static MemDGNode *getBotMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG); /// Given \p Instrs it finds their closest mem nodes in the interval and /// returns the corresponding mem range. Note: BotN (or its neighboring mem /// node) is included in the range. diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 70843812ff65bc..0cd2240e7ff1b3 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -78,23 +78,43 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { } #endif // NDEBUG +MemDGNode * +MemDGNodeIntervalBuilder::getTopMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG) { + Instruction *I = Intvl.top(); + Instruction *BeforeI = Intvl.bottom(); + // Walk down the chain looking for a mem-dep candidate instruction. 
+ while (!DGNode::isMemDepNodeCandidate(I) && I != BeforeI) + I = I->getNextNode(); + if (!DGNode::isMemDepNodeCandidate(I)) + return nullptr; + return cast(DAG.getNode(I)); +} + +MemDGNode * +MemDGNodeIntervalBuilder::getBotMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG) { + Instruction *I = Intvl.bottom(); + Instruction *AfterI = Intvl.top(); + // Walk up the chain looking for a mem-dep candidate instruction. + while (!DGNode::isMemDepNodeCandidate(I) && I != AfterI) + I = I->getPrevNode(); + if (!DGNode::isMemDepNodeCandidate(I)) + return nullptr; + return cast(DAG.getNode(I)); +} + Interval MemDGNodeIntervalBuilder::make(const Interval &Instrs, DependencyGraph &DAG) { - // If top or bottom instructions are not mem-dep candidate nodes we need to - // walk down/up the chain and find the mem-dep ones. - Instruction *MemTopI = Instrs.top(); - Instruction *MemBotI = Instrs.bottom(); - while (!DGNode::isMemDepNodeCandidate(MemTopI) && MemTopI != MemBotI) - MemTopI = MemTopI->getNextNode(); - while (!DGNode::isMemDepNodeCandidate(MemBotI) && MemBotI != MemTopI) - MemBotI = MemBotI->getPrevNode(); + auto *TopMemN = getTopMemDGNode(Instrs, DAG); // If we couldn't find a mem node in range TopN - BotN then it's empty. - if (!DGNode::isMemDepNodeCandidate(MemTopI)) + if (TopMemN == nullptr) return {}; + auto *BotMemN = getBotMemDGNode(Instrs, DAG); + assert(BotMemN != nullptr && "TopMemN should be null too!"); // Now that we have the mem-dep nodes, create and return the range. 
- return Interval(cast(DAG.getNode(MemTopI)), - cast(DAG.getNode(MemBotI))); + return Interval(TopMemN, BotMemN); } DependencyGraph::DependencyType diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 5a9c9815ca42fa..7e2be25fa25ae6 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -347,6 +347,20 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { auto *S0N = cast(DAG.getNode(S0)); auto *S1N = cast(DAG.getNode(S1)); + // Check getTopMemDGNode(). + using B = sandboxir::MemDGNodeIntervalBuilder; + using InstrInterval = sandboxir::Interval; + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(S0, S0), DAG), S0N); + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(S0, Ret), DAG), S0N); + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(Add0, Add1), DAG), S0N); + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(Add0, Add0), DAG), nullptr); + + // Check getBotMemDGNode(). + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(S1, S1), DAG), S1N); + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(Add0, S1), DAG), S1N); + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(Add0, Ret), DAG), S1N); + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(Ret, Ret), DAG), nullptr); + // Check empty range. EXPECT_THAT(sandboxir::MemDGNodeIntervalBuilder::makeEmpty(), testing::ElementsAre()); From 195486950fa64938e62f6d85d31222fa41d0ee09 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 13:25:46 -0700 Subject: [PATCH 081/177] [NFC][sanitizer] Fix at_scope_exit name. 
--- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 70fd9405e5454f..e5d6d0a6e71649 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1037,7 +1037,7 @@ ThreadLister::Result ThreadLister::ListThreads( Report("Can't open %s for reading.\n", task_path_.data()); return Error; } - auto acts_cleanup = at_scope_exit([&] { internal_close(descriptor); }); + auto cleanup = at_scope_exit([&] { internal_close(descriptor); }); threads->clear(); Result result = Ok; From 4b5018d2311596778cade4db5177e2ab879cc218 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 10 Oct 2024 13:40:22 -0700 Subject: [PATCH 082/177] [SLP]Track repeated reduced value as it might be vectorized Need to track changes with the repeated reduced value, since it might be vectorized in the next attempt for reduction vectorization, to correctly generate the code and avoid compiler crash. 
Fixes #111887 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 22 ++++++----- .../reduced-value-repeated-and-vectorized.ll | 37 +++++++++++++++++++ 2 files changed, 49 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 94de520a2715ff..e2958c49b8ca9f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1293,8 +1293,7 @@ class BoUpSLP { using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; - using ExtraValueToDebugLocsMap = - MapVector>; + using ExtraValueToDebugLocsMap = SmallDenseSet; using OrdersType = SmallVector; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, @@ -6322,7 +6321,7 @@ void BoUpSLP::buildExternalUses( continue; // Check if the scalar is externally used as an extra arg. - const auto *ExtI = ExternallyUsedValues.find(Scalar); + const auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { int FoundLane = Entry->findLaneForValue(Scalar); LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " @@ -18820,7 +18819,7 @@ class HorizontalReduction { // List of the values that were reduced in other trees as part of gather // nodes and thus requiring extract if fully vectorized in other trees. SmallPtrSet RequiredExtract; - Value *VectorizedTree = nullptr; + WeakTrackingVH VectorizedTree = nullptr; bool CheckForReusedReductionOps = false; // Try to vectorize elements based on their type. 
SmallVector States; @@ -18916,6 +18915,7 @@ class HorizontalReduction { bool SameScaleFactor = false; bool OptReusedScalars = IsSupportedHorRdxIdentityOp && SameValuesCounter.size() != Candidates.size(); + BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; if (OptReusedScalars) { SameScaleFactor = (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || @@ -18936,6 +18936,7 @@ class HorizontalReduction { emitScaleForReusedOps(Candidates.front(), Builder, Cnt); VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); VectorizedVals.try_emplace(OrigV, Cnt); + ExternallyUsedValues.insert(OrigV); continue; } } @@ -19015,17 +19016,18 @@ class HorizontalReduction { V.reorderBottomToTop(/*IgnoreReorder=*/true); // Keep extracted other reduction values, if they are used in the // vectorization trees. - BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues; + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); // The reduction root is used as the insertion point for new // instructions, so set it as externally used to prevent it from being // deleted. - LocalExternallyUsedValues[ReductionRoot]; + LocalExternallyUsedValues.insert(ReductionRoot); for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) continue; for (Value *V : ReducedVals[Cnt]) if (isa(V)) - LocalExternallyUsedValues[TrackedVals[V]]; + LocalExternallyUsedValues.insert(TrackedVals[V]); } if (!IsSupportedHorRdxIdentityOp) { // Number of uses of the candidates in the vector of values. @@ -19054,21 +19056,21 @@ class HorizontalReduction { // Check if the scalar was vectorized as part of the vectorization // tree but not the top node. 
if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { - LocalExternallyUsedValues[RdxVal]; + LocalExternallyUsedValues.insert(RdxVal); continue; } Value *OrigV = TrackedToOrig.at(RdxVal); unsigned NumOps = VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV); if (NumOps != ReducedValsToOps.at(OrigV).size()) - LocalExternallyUsedValues[RdxVal]; + LocalExternallyUsedValues.insert(RdxVal); } // Do not need the list of reused scalars in regular mode anymore. if (!IsSupportedHorRdxIdentityOp) SameValuesCounter.clear(); for (Value *RdxVal : VL) if (RequiredExtract.contains(RdxVal)) - LocalExternallyUsedValues[RdxVal]; + LocalExternallyUsedValues.insert(RdxVal); V.buildExternalUses(LocalExternallyUsedValues); V.computeMinimumValueSizes(); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll new file mode 100644 index 00000000000000..d5e1a110c6277c --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 null, i64 6, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr null, align 2 +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i16> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP1]], i16 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP4]], i16 0) +; CHECK-NEXT: [[TMP6:%.*]] = tail call i16 
@llvm.smax.i16(i16 [[TMP5]], i16 0) +; CHECK-NEXT: ret void +; +entry: + %0 = load i16, ptr null, align 2 + %1 = xor i16 %0, 0 + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 %0) + %3 = tail call i16 @llvm.smax.i16(i16 0, i16 %2) + %4 = load i16, ptr getelementptr inbounds (i8, ptr null, i64 6), align 2 + %5 = xor i16 %4, 0 + %6 = tail call i16 @llvm.smax.i16(i16 %5, i16 %0) + %7 = tail call i16 @llvm.smax.i16(i16 %3, i16 %6) + %8 = load i16, ptr getelementptr (i8, ptr null, i64 12), align 2 + %9 = xor i16 %8, 0 + %10 = tail call i16 @llvm.smax.i16(i16 %9, i16 %0) + %11 = tail call i16 @llvm.smax.i16(i16 %7, i16 %10) + %12 = load i16, ptr getelementptr (i8, ptr null, i64 18), align 2 + %13 = xor i16 %12, 0 + %14 = tail call i16 @llvm.smax.i16(i16 %13, i16 %0) + %15 = tail call i16 @llvm.smax.i16(i16 %11, i16 %14) + %16 = tail call i16 @llvm.smax.i16(i16 %15, i16 0) + ret void +} + From 16ef893e9fdec2b08dafc82f5450b41834e09039 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Wed, 9 Oct 2024 18:06:56 +0000 Subject: [PATCH 083/177] [test] env -u is not supported on AIX, use `unset` instead --- compiler-rt/test/profile/instrprof-tmpdir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/profile/instrprof-tmpdir.c b/compiler-rt/test/profile/instrprof-tmpdir.c index 6f323e7e6a01a5..7206df3c2eb0c6 100644 --- a/compiler-rt/test/profile/instrprof-tmpdir.c +++ b/compiler-rt/test/profile/instrprof-tmpdir.c @@ -12,7 +12,8 @@ // RUN: llvm-profdata show ./raw2.profraw | FileCheck %s -check-prefix TMPDIR // // Check that we fall back to the default path if TMPDIR is missing. 
-// RUN: env -u TMPDIR LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING +// RUN: %if system-aix %{ unset TMPDIR %} +// RUN: env %if !system-aix %{ -u TMPDIR %} LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING // RUN: llvm-profdata show ./default.profraw | FileCheck %s -check-prefix TMPDIR // TMPDIR: Maximum function count: 1 From c99b36554745837c549e1b46cd60db70588affcf Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 14:50:34 -0700 Subject: [PATCH 084/177] Revert "[lldb] Add include for SBLanguages in lldb-enumerations (#111907)" Temporarily Revert until Chelsea can look at this. With a clean build, SBLanguages.h won't be generated in the build directory at the point when it is included by lldb-enumerations when compiling e.g. Broadcaster.cpp. On a clean build (no pre-existing build directory), the dependency ordering is not explicitly stated so the build will fail. An incremental build will succeed. This reverts commit b3554265f24aa570bbc8693af8420a306c459f94. --- lldb/include/lldb/lldb-enumerations.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 217cd7f65cc1c4..232d1dfdb5c9d0 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -12,8 +12,6 @@ #include #include -#include - #ifndef SWIG // Macro to enable bitmask operations on an enum. Without this, Enum | Enum // gets promoted to an int, so you have to say Enum a = Enum(eFoo | eBar). 
If From 5deadc6eaede3d32ccdd68529f371092d4d218da Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 14:52:50 -0700 Subject: [PATCH 085/177] [NFC][sanitizer] Extract `LoadStatus` (#111909) For #111901 --- .../lib/sanitizer_common/sanitizer_linux.cpp | 21 +++++++++++++------ .../lib/sanitizer_common/sanitizer_linux.h | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index e5d6d0a6e71649..a4e58133c79f08 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1042,8 +1042,6 @@ ThreadLister::Result ThreadLister::ListThreads( Result result = Ok; for (bool first_read = true;; first_read = false) { - // Resize to max capacity if it was downsized by IsAlive. - buffer_.resize(buffer_.capacity()); CHECK_GE(buffer_.size(), 4096); uptr read = internal_getdents( descriptor, (struct linux_dirent *)buffer_.data(), buffer_.size()); @@ -1088,14 +1086,25 @@ ThreadLister::Result ThreadLister::ListThreads( } } +const char *ThreadLister::LoadStatus(int tid) { + auto cleanup = at_scope_exit([&] { + // Resize back to capacity if it is downsized by `ReadFileToVector`. + buffer_.resize(buffer_.capacity()); + }); + if (!ReadFileToVector(status_path_.data(), &buffer_) || buffer_.empty()) + return nullptr; + buffer_.push_back('\0'); + return buffer_.data(); +} + bool ThreadLister::IsAlive(int tid) { // /proc/%d/task/%d/status uses same call to detect alive threads as // proc_task_readdir. See task_state implementation in Linux. 
- if (!ReadFileToVector(status_path_.data(), &buffer_) || buffer_.empty()) - return false; - buffer_.push_back(0); static const char kPrefix[] = "\nPPid:"; - const char *field = internal_strstr(buffer_.data(), kPrefix); + const char *status = LoadStatus(tid); + if (!status) + return false; + const char *field = internal_strstr(status, kPrefix); if (!field) return false; field += internal_strlen(kPrefix); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index 96c617822b5b27..07d9528813b3fe 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -103,6 +103,7 @@ class ThreadLister { Ok, }; Result ListThreads(InternalMmapVector *threads); + const char *LoadStatus(int tid); private: bool IsAlive(int tid); From af7fa2710c998811dd72799799798f2bd4d9bff4 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 14:53:16 -0700 Subject: [PATCH 086/177] [sanitizer] VReport thread status for failed PTRACE_ATTACH (#111901) Such threads can cause false leak reports, but often it's hard to diagnose the reason of failed PTRACE_ATTACH. 
Maybe we can find a clue from `/proc/*/task/*/status` --- .../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index d9f803a276dadc..6ebca965f6a334 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -228,6 +228,8 @@ bool ThreadSuspender::SuspendAllThreads() { for (tid_t tid : threads) { if (SuspendThread(tid)) retry = true; + else + VReport(2, "%llu/status: %s\n", tid, thread_lister.LoadStatus(tid)); } if (retry) VReport(1, "SuspendAllThreads retry: %d\n", i); From 48545a955c4e61f42833af7417032d816482bdfc Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Thu, 10 Oct 2024 17:59:44 -0400 Subject: [PATCH 087/177] [ThinLTO] Do not duplicate import a function that is actually defined in the current module (#110064) Doing so could cause a bug where the linker tries to remap a function "reimported" from the current module when materializing it, causing a lookup assert in the type mappings. --- llvm/lib/Linker/IRMover.cpp | 6 +- .../Inputs/ditemplatevalueparameter-remap.ll | 29 +++++++ .../X86/ditemplatevalueparameter-remap.ll | 87 +++++++++++++++++++ 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll create mode 100644 llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 3a6c2678cd157f..5067fbff2e277b 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -595,11 +595,15 @@ Value *IRLinker::materialize(Value *V, bool ForIndirectSymbol) { if (!SGV) return nullptr; + // If SGV is from dest, it was already materialized when dest was loaded. 
+ if (SGV->getParent() == &DstM) + return nullptr; + // When linking a global from other modules than source & dest, skip // materializing it because it would be mapped later when its containing // module is linked. Linking it now would potentially pull in many types that // may not be mapped properly. - if (SGV->getParent() != &DstM && SGV->getParent() != SrcM.get()) + if (SGV->getParent() != SrcM.get()) return nullptr; Expected NewProto = linkGlobalValueProto(SGV, ForIndirectSymbol); diff --git a/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll new file mode 100644 index 00000000000000..be93160b943397 --- /dev/null +++ b/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll @@ -0,0 +1,29 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @_Z8thinlto1v() unnamed_addr { + %3 = alloca i64, align 4 + #dbg_declare(ptr %3, !14, !DIExpression(), !15) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "B.cpp", directory: ".") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!10 = distinct !DISubprogram(name: "thinlto1", linkageName: "_Z8thinlto1v", scope: !11, file: !11, line: 8, type: !12, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!11 = !DIFile(filename: "b.cpp", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{null} +!14 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !11, line: 18, type: !16) +!15 = 
!DILocation(line: 18, column: 19, scope: !10) +!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !17, templateParams: !18, identifier: "_ZTS1SIXadL_Z5func1vEEE") +!17 = !{} +!18 = !{!19} +!19 = !DITemplateValueParameter(name: "Func", type: !20, value: ptr undef) +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) diff --git a/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll new file mode 100644 index 00000000000000..0651705ccba8b8 --- /dev/null +++ b/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll @@ -0,0 +1,87 @@ +; https://github.com/llvm/llvm-project/pull/110064 +; This test case checks if thinLTO correctly links metadata values in a specific +; situation. Assume we are linking module B into module A, where an extern +; function used in A is defined in B, but the function body has a +; DITemplateValueParameter referring to another function back in A. The +; compiler must check this other function is actually coming from A, thus +; already materialized and does not require remapping. The IR here is modified +; from the following source code. +; +; // A.h +; template +; struct S { +; void Impl() { +; Func(); +; } +; }; +; +; void func1(); +; +; // A.cpp +; #include "A.h" +; __attribute__((weak)) void func1() {} +; extern void thinlto1(); +; void bar() { +; S s; // Force instantiation of S in this compilation unit. 
+; s.Impl(); +; thinlto1(); +; } +; +; // B.cpp +; #include "A.h" +; void thinlto1() { +; S s; +; } +; +; RUN: opt -module-summary -o %t1.bc %s +; RUN: opt -module-summary -o %t2.bc %S/Inputs/ditemplatevalueparameter-remap.ll +; RUN: ld.lld --plugin-opt=thinlto-index-only -shared %t1.bc %t2.bc +; RUN: clang -O3 -fthinlto-index=%t1.bc.thinlto.bc -x ir %t1.bc -S -emit-llvm -o - | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +$_Z5func1v = comdat any + +define linkonce_odr dso_local void @_Z5func1v() unnamed_addr !dbg !10 { + ret void +} + +; Dummy function to use _Z5func1v so that it is not treated as dead symbol. +define void @_Z3bazv() { + tail call void @_Z5func1v() + ret void +} + +declare void @_Z8thinlto1v() unnamed_addr + +; CHECK: void @_Z3barv() +; CHECK-NOT: call void @_Z8thinlto1v() +; CHECK-NEXT: ret void +define void @_Z3barv() unnamed_addr !dbg !14 { + tail call void @_Z8thinlto1v(), !dbg !25 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "A.cpp", directory: ".") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!10 = distinct !DISubprogram(name: "func1", linkageName: "_Z5func1v", scope: !11, file: !11, line: 6, type: !12, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!11 = !DIFile(filename: "a.h", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{null} +!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !11, file: !11, line: 15, type: !12, scopeLine: 15, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16) +!16 = !{!17} +!17 = !DILocalVariable(name: "s", scope: !14, file: !11, line: 10, type: !18) +!18 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !19, templateParams: !20, identifier: "_ZTS1SIXadL_Z5func1vEEE") +!19 = !{} +!20 = !{!21} +!21 = !DITemplateValueParameter(name: "Func", type: !22, value: ptr @_Z5func1v) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) +!25 = !DILocation(line: 16, column: 5, scope: !14) From f02252e1fd2965db007cf7be74c448b7a119c321 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Thu, 10 Oct 2024 15:01:13 -0700 Subject: [PATCH 088/177] Revert "[lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC)" This reverts commit b77fdf5799be6b29869f2f7969851709e03938ba. --- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 367fce442bb866..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,9 +407,8 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", - GetPluginName()); + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); return error; } From 2ff4c25b7efff64b3b662d0bedcfe7edebcf20b9 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Thu, 10 Oct 2024 15:01:20 -0700 Subject: [PATCH 089/177] Revert "[lldb] Implement basic support for reverse-continue 
(#99736)" This reverts commit d5e1de6da96c1ab3b8cae68447e8ed3696a7006e. --- lldb/include/lldb/API/SBProcess.h | 1 - lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 - lldb/include/lldb/lldb-enumerations.h | 6 - .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 -------- .../Python/lldbsuite/test/lldbreverse.py | 418 ------------------ .../Python/lldbsuite/test/lldbtest.py | 2 - lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 - .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 - .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 - .../gdb-remote/GDBRemoteCommunicationClient.h | 6 - .../GDBRemoteCommunicationServerLLGS.cpp | 1 - .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +--- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- lldb/source/Target/StopInfo.cpp | 29 -- lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 - .../TestReverseContinueBreakpoints.py | 115 ----- .../TestReverseContinueNotSupported.py | 30 -- .../functionalities/reverse-execution/main.c | 14 - lldb/tools/lldb-dap/JSONUtils.cpp | 3 - lldb/tools/lldb-dap/LLDBUtils.cpp | 1 - 32 files changed, 44 insertions(+), 978 deletions(-) delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile delete mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py delete mode 100644 
lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 8b8ed830b54cc0..1624e02070b1b2 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,7 +159,6 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); - lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index fe7fbc50fd5770..b8c53a474ba6b9 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(lldb::RunDirection direction = lldb::eRunForward); + Status Resume(); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); + Status ResumeSynchronous(Stream *stream); /// Halts a running process. /// @@ -1104,14 +1104,9 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume(lldb::RunDirection direction) { - if (direction == lldb::RunDirection::eRunForward) { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + virtual Status DoResume() { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); } /// Called after resuming a process. 
@@ -2337,8 +2332,6 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; - lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } - DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2858,7 +2851,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); + Status PrivateResume(); // Called internally void CompleteAttach(); @@ -3134,8 +3127,6 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. - // The direction of execution from the last time this process was resumed. - lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index 072f71f6b1122f..fae90364deaf0a 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,12 +142,6 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); - // This creates a StopInfo indicating that execution stopped because - // it was replaying some recorded execution history, and execution reached - // the end of that recorded history. 
- static lldb::StopInfoSP - CreateStopReasonHistoryBoundary(Thread &thread, const char *description); - static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 232d1dfdb5c9d0..938f6e3abe8f2a 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,9 +135,6 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; -/// Execution directions -enum RunDirection { eRunForward, eRunReverse }; - /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -257,9 +254,6 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt - // Indicates that execution stopped because the debugger backend relies - // on recorded data and we reached the end of that data. - eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 732d6171320680..1784487323ad6b 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,9 +510,8 @@ def start(self): self._thread.start() def stop(self): - if self._thread is not None: - self._thread.join() - self._thread = None + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py deleted file mode 100644 index 2a9592bf4545a4..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py +++ /dev/null @@ -1,175 +0,0 @@ -import logging -import os -import os.path -import random - -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -import lldbgdbserverutils -from lldbsuite.support import seven - - -class GDBProxyTestBase(TestBase): - """ - Base class for gdbserver proxy tests. - - This class will setup and start a mock GDB server for the test to use. - It pases through requests to a regular lldb-server/debugserver and - forwards replies back to the LLDB under test. 
- """ - - """The gdbserver that we implement.""" - server = None - """The inner lldb-server/debugserver process that we proxy requests into.""" - monitor_server = None - monitor_sock = None - - server_socket_class = TCPServerSocket - - DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) - - _verbose_log_handler = None - _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") - - def setUpBaseLogging(self): - self.logger = logging.getLogger(__name__) - - if len(self.logger.handlers) > 0: - return # We have set up this handler already - - self.logger.propagate = False - self.logger.setLevel(logging.DEBUG) - - # log all warnings to stderr - handler = logging.StreamHandler() - handler.setLevel(logging.WARNING) - handler.setFormatter(self._log_formatter) - self.logger.addHandler(handler) - - def setUp(self): - TestBase.setUp(self) - - self.setUpBaseLogging() - - if self.isVerboseLoggingRequested(): - # If requested, full logs go to a log file - log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" - self._verbose_log_handler = logging.FileHandler( - log_file_name - ) - self._verbose_log_handler.setFormatter(self._log_formatter) - self._verbose_log_handler.setLevel(logging.DEBUG) - self.logger.addHandler(self._verbose_log_handler) - - lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() - if lldb_server_exe is None: - self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() - self.assertTrue(self.debug_monitor_exe is not None) - self.debug_monitor_extra_args = [] - else: - self.debug_monitor_exe = lldb_server_exe - self.debug_monitor_extra_args = ["gdbserver"] - - self.server = MockGDBServer(self.server_socket_class()) - self.server.responder = self - - def tearDown(self): - # TestBase.tearDown will kill the process, but we need to kill it early - # so its client connection closes and we can stop the server before - # finally calling the base tearDown. 
- if self.process() is not None: - self.process().Kill() - self.server.stop() - - self.logger.removeHandler(self._verbose_log_handler) - self._verbose_log_handler = None - - TestBase.tearDown(self) - - def isVerboseLoggingRequested(self): - # We will report our detailed logs if the user requested that the "gdb-remote" channel is - # logged. - return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) - - def connect(self, target): - """ - Create a process by connecting to the mock GDB server. - """ - self.prep_debug_monitor_and_inferior() - self.server.start() - - listener = self.dbg.GetListener() - error = lldb.SBError() - process = target.ConnectRemote( - listener, self.server.get_connect_url(), "gdb-remote", error - ) - self.assertTrue(error.Success(), error.description) - self.assertTrue(process, PROCESS_IS_VALID) - return process - - def get_next_port(self): - return 12000 + random.randint(0, 3999) - - def prep_debug_monitor_and_inferior(self): - inferior_exe_path = self.getBuildArtifact("a.out") - self.connect_to_debug_monitor([inferior_exe_path]) - self.assertIsNotNone(self.monitor_server) - self.initial_handshake() - - def initial_handshake(self): - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "OK") - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - - def get_debug_monitor_command_line_args(self, connect_address, launch_args): - return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args - - 
def launch_debug_monitor(self, launch_args): - family, type, proto, _, addr = socket.getaddrinfo( - "localhost", 0, proto=socket.IPPROTO_TCP - )[0] - sock = socket.socket(family, type, proto) - sock.settimeout(self.DEFAULT_TIMEOUT) - sock.bind(addr) - sock.listen(1) - addr = sock.getsockname() - connect_address = "[{}]:{}".format(*addr) - - commandline_args = self.get_debug_monitor_command_line_args( - connect_address, launch_args - ) - - # Start the server. - self.logger.info(f"Spawning monitor {commandline_args}") - monitor_process = self.spawnSubprocess( - self.debug_monitor_exe, commandline_args, install_remote=False - ) - self.assertIsNotNone(monitor_process) - - self.monitor_sock = sock.accept()[0] - self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) - return monitor_process - - def connect_to_debug_monitor(self, launch_args): - monitor_process = self.launch_debug_monitor(launch_args) - self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) - - def respond(self, packet): - """Subclasses can override this to change how packets are handled.""" - return self.pass_through(packet) - - def pass_through(self, packet): - self.logger.info(f"Sending packet {packet}") - self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.logger.info(f"Received reply {reply}") - return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py deleted file mode 100644 index 0f02fdffbdeada..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbreverse.py +++ /dev/null @@ -1,418 +0,0 @@ -import os -import os.path -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbgdbproxy import * -import lldbgdbserverutils -import re - - -class ThreadSnapshot: - def __init__(self, thread_id, registers): - self.thread_id = thread_id - 
self.registers = registers - - -class MemoryBlockSnapshot: - def __init__(self, address, data): - self.address = address - self.data = data - - -class StateSnapshot: - def __init__(self, thread_snapshots, memory): - self.thread_snapshots = thread_snapshots - self.memory = memory - self.thread_id = None - - -class RegisterInfo: - def __init__(self, lldb_index, bitsize, little_endian): - self.lldb_index = lldb_index - self.bitsize = bitsize - self.little_endian = little_endian - - -BELOW_STACK_POINTER = 16384 -ABOVE_STACK_POINTER = 4096 - -BLOCK_SIZE = 1024 - -SOFTWARE_BREAKPOINTS = 0 -HARDWARE_BREAKPOINTS = 1 -WRITE_WATCHPOINTS = 2 - - -class ReverseTestBase(GDBProxyTestBase): - """ - Base class for tests that need reverse execution. - - This class uses a gdbserver proxy to add very limited reverse- - execution capability to lldb-server/debugserver for testing - purposes only. - - To use this class, run the inferior forward until some stopping point. - Then call `start_recording()` and execute forward again until reaching - a software breakpoint; this class records the state before each execution executes. - At that point, the server will accept "bc" and "bs" packets to step - backwards through the state. - When executing during recording, we only allow single-step and continue without - delivering a signal, and only software breakpoint stops are allowed. - - We assume that while recording is enabled, the only effects of instructions - are on general-purpose registers (read/written by the 'g' and 'G' packets) - and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER). - """ - - """ - A list of StateSnapshots in time order. - - There is one snapshot per single-stepped instruction, - representing the state before that instruction was - executed. The last snapshot in the list is the - snapshot before the last instruction was executed. 
- This is an undo log; we snapshot a superset of the state that may have - been changed by the instruction's execution. - """ - snapshots = None - recording_enabled = False - - breakpoints = None - - pid = None - - pc_register_info = None - sp_register_info = None - general_purpose_register_info = None - - def __init__(self, *args, **kwargs): - GDBProxyTestBase.__init__(self, *args, **kwargs) - self.breakpoints = [set(), set(), set(), set(), set()] - - def respond(self, packet): - if not packet: - raise ValueError("Invalid empty packet") - if packet == self.server.PACKET_INTERRUPT: - # Don't send a response. We'll just run to completion. - return [] - if self.is_command(packet, "qSupported", ":"): - reply = self.pass_through(packet) - return reply + ";ReverseStep+;ReverseContinue+" - if self.is_command(packet, "vCont", ";"): - if self.recording_enabled: - return self.continue_with_recording(packet) - snapshots = [] - if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S": - raise ValueError("LLDB should not be sending old-style continuation packets") - if packet == "bc": - return self.reverse_continue() - if packet == "bs": - return self.reverse_step() - if packet == 'jThreadsInfo': - # Suppress this because it contains thread stop reasons which we might - # need to modify, and we don't want to have to implement that. - return "" - if packet[0] == "z" or packet[0] == "Z": - reply = self.pass_through(packet) - if reply == "OK": - self.update_breakpoints(packet) - return reply - return GDBProxyTestBase.respond(self, packet) - - def start_recording(self): - self.recording_enabled = True - self.snapshots = [] - - def stop_recording(self): - """ - Don't record when executing foward. - - Reverse execution is still supported until the next forward continue. 
- """ - self.recording_enabled = False - - def is_command(self, packet, cmd, follow_token): - return packet == cmd or packet[0:len(cmd) + 1] == cmd + follow_token - - def update_breakpoints(self, packet): - m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet) - if m is None: - raise ValueError("Invalid breakpoint packet: " + packet) - t = int(m.group(2)) - addr = int(m.group(3), 16) - kind = int(m.group(4), 16) - if m.group(1) == 'Z': - self.breakpoints[t].add((addr, kind)) - else: - self.breakpoints[t].discard((addr, kind)) - - def breakpoint_triggered_at(self, pc): - if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]): - return True - if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]): - return True - return False - - def watchpoint_triggered(self, new_value_block, current_contents): - """Returns the address or None.""" - for watch_addr, kind in breakpoints[WRITE_WATCHPOINTS]: - for offset in range(0, kind): - addr = watch_addr + offset - if (addr >= new_value_block.address and - addr < new_value_block.address + len(new_value_block.data)): - index = addr - new_value_block.address - if new_value_block.data[index*2:(index + 1)*2] != current_contents[index*2:(index + 1)*2]: - return watch_addr - return None - - def continue_with_recording(self, packet): - self.logger.debug("Continue with recording enabled") - - step_packet = "vCont;s" - if packet == "vCont": - requested_step = False - else: - m = re.match("vCont;(c|s)(.*)", packet) - if m is None: - raise ValueError("Unsupported vCont packet: " + packet) - requested_step = m.group(1) == 's' - step_packet += m.group(2) - - while True: - snapshot = self.capture_snapshot() - reply = self.pass_through(step_packet) - (stop_signal, stop_pairs) = self.parse_stop(reply) - if stop_signal != 5: - raise ValueError("Unexpected stop signal: " + reply) - is_swbreak = False - thread_id = None - for key, value in stop_pairs.items(): - if key == "thread": - thread_id = 
self.parse_thread_id(value) - continue - if re.match('[0-9a-f]+', key): - continue - if key == "swbreak" or (key == "reason" and value == "breakpoint"): - is_swbreak = True - continue - if key in ["name", "threads", "thread-pcs", "reason"]: - continue - raise ValueError(f"Unknown stop key '{key}' in {reply}") - if is_swbreak: - self.logger.debug("Recording stopped") - return reply - if thread_id is None: - return ValueError("Expected thread ID: " + reply) - snapshot.thread_id = thread_id - self.snapshots.append(snapshot) - if requested_step: - self.logger.debug("Recording stopped for step") - return reply - - def parse_stop(self, reply): - result = {} - if not reply: - raise ValueError("Invalid empty packet") - if reply[0] == "T" and len(reply) >= 3: - result = {k:v for k, v in self.parse_pairs(reply[3:])} - return (int(reply[1:3], 16), result) - raise "Unsupported stop reply: " + reply - - def parse_pairs(self, text): - for pair in text.split(";"): - if not pair: - continue - m = re.match("([^:]+):(.*)", pair) - if m is None: - raise ValueError("Invalid pair text: " + text) - yield (m.group(1), m.group(2)) - - def capture_snapshot(self): - """Snapshot all threads and their stack memories.""" - self.ensure_register_info() - current_thread = self.get_current_thread() - thread_snapshots = [] - memory = [] - for thread_id in self.get_thread_list(): - registers = {} - for index in sorted(self.general_purpose_register_info.keys()): - reply = self.pass_through(f"p{index:x};thread:{thread_id:x};") - if reply == "" or reply[0] == 'E': - raise ValueError("Can't read register") - registers[index] = reply - thread_snapshot = ThreadSnapshot(thread_id, registers) - thread_sp = self.get_register(self.sp_register_info, thread_snapshot.registers) - memory += self.read_memory(thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER) - thread_snapshots.append(thread_snapshot) - self.set_current_thread(current_thread) - return StateSnapshot(thread_snapshots, memory) - - def 
restore_snapshot(self, snapshot): - """ - Restore the snapshot during reverse execution. - - If this triggers a breakpoint or watchpoint, return the stop reply, - otherwise None. - """ - current_thread = self.get_current_thread() - stop_reasons = [] - for thread_snapshot in snapshot.thread_snapshots: - thread_id = thread_snapshot.thread_id - for lldb_index in sorted(thread_snapshot.registers.keys()): - data = thread_snapshot.registers[lldb_index] - reply = self.pass_through(f"P{lldb_index:x}={data};thread:{thread_id:x};") - if reply != "OK": - raise ValueError("Can't restore thread register") - if thread_id == snapshot.thread_id: - new_pc = self.get_register(self.pc_register_info, thread_snapshot.registers) - if self.breakpoint_triggered_at(new_pc): - stop_reasons.append([("reason", "breakpoint")]) - self.set_current_thread(current_thread) - for block in snapshot.memory: - current_memory = self.pass_through(f"m{block.address:x},{(len(block.data)/2):x}") - if not current_memory or current_memory[0] == 'E': - raise ValueError("Can't read back memory") - reply = self.pass_through(f"M{block.address:x},{len(block.data)/2:x}:" + block.data) - if reply != "OK": - raise ValueError("Can't restore memory") - watch_addr = self.watchpoint_triggered(block, current_memory[1:]) - if watch_addr is not None: - stop_reasons.append([("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]) - if stop_reasons: - pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0]) - return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};" - return None - - def reverse_step(self): - if not self.snapshots: - self.logger.debug("Reverse-step at history boundary") - return self.history_boundary_reply(self.get_current_thread()) - self.logger.debug("Reverse-step started") - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-step stopped") - if stop_reply is None: - return 
self.singlestep_stop_reply(snapshot.thread_id) - return stop_reply - - def reverse_continue(self): - self.logger.debug("Reverse-continue started") - thread_id = None - while self.snapshots: - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - thread_id = snapshot.thread_id - if stop_reply is not None: - self.set_current_thread(thread_id) - self.logger.debug("Reverse-continue stopped") - return stop_reply - if thread_id is None: - thread_id = self.get_current_thread() - else: - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-continue stopped at history boundary") - return self.history_boundary_reply(thread_id) - - def get_current_thread(self): - reply = self.pass_through("qC") - return self.parse_thread_id(reply[2:]) - - def parse_thread_id(self, thread_id): - m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id) - if m is None: - raise ValueError("Invalid thread ID: " + thread_id) - if self.pid is None: - self.pid = int(m.group(2), 16) - return int(m.group(3), 16) - - def history_boundary_reply(self, thread_id): - return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;" - - def singlestep_stop_reply(self, thread_id): - return f"T05thread:{self.pid:x}.{thread_id:x};" - - def set_current_thread(self, thread_id): - """ - Set current thread in inner gdbserver. 
- """ - if thread_id >= 0: - self.pass_through(f"Hg{self.pid:x}.{thread_id:x}") - self.pass_through(f"Hc{self.pid:x}.{thread_id:x}") - else: - self.pass_through(f"Hc-1.-1") - self.pass_through(f"Hg-1.-1") - - def get_register(self, register_info, registers): - if register_info.bitsize % 8 != 0: - raise ValueError("Register size must be a multiple of 8 bits") - if register_info.lldb_index not in registers: - raise ValueError("Register value not captured") - data = registers[register_info.lldb_index] - num_bytes = register_info.bitsize//8 - bytes = [] - for i in range(0, num_bytes): - bytes.append(int(data[i*2:(i + 1)*2], 16)) - if register_info.little_endian: - bytes.reverse() - result = 0 - for byte in bytes: - result = (result << 8) + byte - return result - - def read_memory(self, start_addr, end_addr): - """ - Read a region of memory from the target. - - Some of the addresses may extend into invalid virtual memory; - skip those areas. - Return a list of blocks containing the valid area(s) in the - requested range. 
- """ - regions = [] - start_addr = start_addr & (BLOCK_SIZE - 1) - end_addr = (end_addr + BLOCK_SIZE - 1) & (BLOCK_SIZE - 1) - for addr in range(start_addr, end_addr, BLOCK_SIZE): - reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}") - if reply and reply[0] != 'E': - block = MemoryBlockSnapshot(addr, reply[1:]) - regions.append(block) - return regions - - def ensure_register_info(self): - if self.general_purpose_register_info is not None: - return - reply = self.pass_through("qHostInfo") - little_endian = any(kv == ("endian", "little") for kv in self.parse_pairs(reply)) - self.general_purpose_register_info = {} - lldb_index = 0 - while True: - reply = self.pass_through(f"qRegisterInfo{lldb_index:x}") - if not reply or reply[0] == 'E': - break - info = {k:v for k, v in self.parse_pairs(reply)} - reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian) - if info["set"] == "General Purpose Registers" and not "container-regs" in info: - self.general_purpose_register_info[lldb_index] = reg_info - if "generic" in info: - if info["generic"] == "pc": - self.pc_register_info = reg_info - elif info["generic"] == "sp": - self.sp_register_info = reg_info - lldb_index += 1 - if self.pc_register_info is None or self.sp_register_info is None: - raise ValueError("Can't find generic pc or sp register") - - def get_thread_list(self): - threads = [] - reply = self.pass_through("qfThreadInfo") - while True: - if not reply: - raise ValueError("Missing reply packet") - if reply[0] == 'm': - for id in reply[1:].split(","): - threads.append(self.parse_thread_id(id)) - elif reply[0] == 'l': - return threads - reply = self.pass_through("qsThreadInfo") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 7cc1ac9749ec93..8884ef5933ada8 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -143,8 +143,6 @@ STOPPED_DUE_TO_WATCHPOINT = "Process 
should be stopped due to watchpoint" -STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" - DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 07780f9f9c8393..9773144723c34c 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,10 +564,6 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { - return Continue(RunDirection::eRunForward); -} - -SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -578,9 +574,9 @@ SBError SBProcess::Continue(RunDirection direction) { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(direction); + sb_error.ref() = process_sp->Resume(); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index aca8a039952960..a99456e06d0329 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,7 +172,6 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -234,7 +233,6 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index ea60492ac46a10..8d3a82ef6c990a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,8 +2553,7 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || - reason == eStopReasonHistoryBoundary) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index b0aa664775b463..de047ee214c11e 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,9 +82,6 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; - case eStopReasonHistoryBoundary: - log.Printf("%s: %s history boundary", __FUNCTION__, header); - return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..9b2907c6809965 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,16 +402,9 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume(RunDirection direction) { +Status ProcessKDP::DoResume() { Status error; Log *log = GetLog(KDPLog::Process); - - if 
(direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index 1b71d83f70b087..e5ec5914f9600d 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume(lldb::RunDirection direction) override; + lldb_private::Status DoResume() override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 76b7095deaa503..703aa082f0476f 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,17 +204,11 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume(RunDirection direction) { +Status ProcessWindows::DoResume() { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; - if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index 97284b7cd1436e..e97cfb790248be 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index fc792a4409410b..e42526c8fd7266 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,20 +199,6 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } -bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { - if (m_supports_reverse_continue == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_continue == eLazyBoolYes; -} - -bool GDBRemoteCommunicationClient::GetReverseStepSupported() { - if (m_supports_reverse_step == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_step == eLazyBoolYes; -} - bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -309,8 +295,6 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; - m_supports_reverse_continue = 
eLazyBoolCalculate; - m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -364,8 +348,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; - m_supports_reverse_continue = eLazyBoolNo; - m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -419,10 +401,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; - else if (x == "ReverseContinue+") - m_supports_reverse_continue = eLazyBoolYes; - else if (x == "ReverseStep+") - m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 116b47c1edf033..898d176abc3465 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,10 +331,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); - bool GetReverseContinueSupported(); - - bool GetReverseStepSupported(); - LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -565,8 +561,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; - LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; - LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 4016cde74ebea8..35fa93e53bc66f 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,7 +716,6 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3fc03bd05d5df0..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,10 +169,6 @@ class PluginProperties : public Properties { } }; -std::chrono::seconds ResumeTimeout() { - return std::chrono::seconds(5); -} - } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1184,11 +1180,10 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume(RunDirection direction) { +Status ProcessGDBRemote::DoResume() { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", - direction == RunDirection::eRunForward ? 
"" : "reverse"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1202,21 +1197,12 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { StreamString continue_packet; bool continue_packet_error = false; - // Number of threads continuing with "c", i.e. continuing without a signal to deliver. - const size_t num_continue_c_tids = m_continue_c_tids.size(); - // Number of threads continuing with "C", i.e. continuing with a signal to deliver. - const size_t num_continue_C_tids = m_continue_C_tids.size(); - // Number of threads continuing with "s", i.e. single-stepping. - const size_t num_continue_s_tids = m_continue_s_tids.size(); - // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. - const size_t num_continue_S_tids = m_continue_S_tids.size(); - if (direction == RunDirection::eRunForward && - m_gdb_comm.HasAnyVContSupport()) { + if (m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (num_continue_c_tids == num_threads || + if (m_continue_c_tids.size() == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1279,11 +1265,14 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } else continue_packet_error = true; - if (direction == RunDirection::eRunForward && continue_packet_error) { + if (continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server, or it's the reverse - // direction. We need to try and make a simple packet that can do our - // continue. + // that wasn't supported by the remote GDB server. 
We need to try and + // make a simple packet that can do our continue + const size_t num_continue_c_tids = m_continue_c_tids.size(); + const size_t num_continue_C_tids = m_continue_C_tids.size(); + const size_t num_continue_s_tids = m_continue_s_tids.size(); + const size_t num_continue_S_tids = m_continue_S_tids.size(); if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1374,41 +1363,9 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } } - if (direction == RunDirection::eRunReverse && continue_packet_error) { - if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); - return Status::FromErrorString("can't deliver signals while running in reverse"); - } - - if (num_continue_s_tids > 0) { - if (num_continue_s_tids > 1) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); - return Status::FromErrorString("can't step multiple threads while reverse-stepping"); - } - - if (!m_gdb_comm.GetReverseStepSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); - return Status::FromErrorString("target does not support reverse-stepping"); - } - - m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); - continue_packet.PutCString("bs"); - } else { - if (!m_gdb_comm.GetReverseContinueSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); - return Status::FromErrorString("target does not support reverse-continue"); - } - - // All threads continue whether requested or not --- - // we can't change how threads ran in the past. 
- continue_packet.PutCString("bc"); - } - - continue_packet_error = false; - } - if (continue_packet_error) { - return Status::FromErrorString("can't make continue packet for this resume"); + error = + Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1423,7 +1380,7 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { + if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1906,10 +1863,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; - } else if (reason == "replaylog") { - thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( - *thread_sp, description.c_str())); - handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2365,8 +2318,6 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; - } else if (key.compare("replaylog") == 0) { - reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index fa3e1cec76e2b3..2492795851388a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote 
: public Process, // Process Control Status WillResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index 304c12173dd35d..d2111ce877ce55 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,15 +182,10 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume(RunDirection direction) { +Status ScriptedProcess::DoResume() { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - if (direction == RunDirection::eRunForward) { - return GetInterface().Resume(); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + return GetInterface().Resume(); } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 8ebe4ca5f3d449..0335364b4010b2 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index ff6a2f59eba35f..aca08972811470 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,8 +446,7 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), 
m_allocated_memory_cache(*this), m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_last_run_direction(eRunForward), - m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -846,7 +845,6 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: - case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1354,7 +1352,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume(RunDirection direction) { +Status Process::Resume() { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1363,7 +1361,7 @@ Status Process::Resume(RunDirection direction) { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(direction); + Status error = PrivateResume(); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1371,7 +1369,7 @@ Status Process::Resume(RunDirection direction) { return error; } -Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { +Status Process::ResumeSynchronous(Stream *stream) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1384,7 +1382,7 @@ Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = 
PrivateResume(direction); + Status error = PrivateResume(); if (error.Success()) { StateType state = WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3241,7 +3239,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume(RunDirection direction) { +Status Process::PrivateResume() { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3257,15 +3255,6 @@ Status Process::PrivateResume(RunDirection direction) { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); - if (m_last_run_direction != direction) { - // In the future we might want to support mixed-direction plans, - // e.g. a forward step-over stops at a breakpoint, the user does - // a reverse-step, then disables the breakpoint and continues forward. - // This code will need to be changed to support that. - m_thread_list.DiscardThreadPlans(); - m_last_run_direction = direction; - } - Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3283,7 +3272,7 @@ Status Process::PrivateResume(RunDirection direction) { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(direction); + error = DoResume(); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3746,7 +3735,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(m_last_run_direction); + PrivateResume(); } } else { return_value = true; @@ -4357,7 +4346,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(process_sp->m_last_run_direction); + process_sp->PrivateResume(); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 08e9a7c099bad2..bd7032b803df90 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,30 +1212,6 @@ class StopInfoProcessorTrace : public StopInfo { } }; -// StopInfoHistoryBoundary - -class StopInfoHistoryBoundary : public StopInfo { -public: - StopInfoHistoryBoundary(Thread &thread, const char *description) - : StopInfo(thread, LLDB_INVALID_UID) { - if (description) - SetDescription(description); - } - - ~StopInfoHistoryBoundary() override = default; - - StopReason GetStopReason() const override { - return eStopReasonHistoryBoundary; - } - - const char *GetDescription() override { - if (m_description.empty()) - return "history boundary"; - else - return m_description.c_str(); - } -}; - // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1463,11 +1439,6 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } -StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, - const char *description) { - return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); -} - StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index bbb586f033b746..902fbb2b519ef7 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,12 +624,10 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - ProcessSP process_sp(GetProcess()); - if (reg_ctx_sp && process_sp && - process_sp->GetLastRunDirection() == eRunForward) { + if (reg_ctx_sp) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); + GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1734,8 +1732,6 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: - return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile deleted file mode 100644 index 10495940055b63..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -C_SOURCES := main.c - -include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py deleted file mode 100644 index b37578fbd82468..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ /dev/null @@ -1,115 +0,0 @@ -import lldb -import time -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbreverse import ReverseTestBase -from lldbsuite.test import lldbutil - - -class TestReverseContinueBreakpoints(ReverseTestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue(self): - self.reverse_continue_internal(async_mode=False) - - def 
test_reverse_continue_async(self): - self.reverse_continue_internal(async_mode=True) - - def reverse_continue_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue. We'll stop at the point where we started recording. - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - # Continue forward normally until the target exits. - status = process.Continue() - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) - - def test_reverse_continue_breakpoint(self): - self.reverse_continue_breakpoint_internal(async_mode=False) - - def test_reverse_continue_breakpoint_async(self): - self.reverse_continue_breakpoint_internal(async_mode=True) - - def reverse_continue_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue to the function "trigger_breakpoint". 
- trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) - self.assertEqual(threads_now, initial_threads) - - def test_reverse_continue_skip_breakpoint(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=False) - - def test_reverse_continue_skip_breakpoint_async(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=True) - - def reverse_continue_skip_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue over a breakpoint at "trigger_breakpoint" whose - # condition is false. - # This tests that we continue in the correct direction after hitting - # the breakpoint. - trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - trigger_bkpt.SetCondition("false_condition") - status = process.Continue(lldb.eRunReverse) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.assertSuccess(status) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - def setup_recording(self, async_mode): - """ - Record execution of code between "start_recording" and "stop_recording" breakpoints. - - Returns with the target stopped at "stop_recording", with recording disabled, - ready to reverse-execute. - """ - self.build() - target = self.dbg.CreateTarget("") - process = self.connect(target) - - # Record execution from the start of the function "start_recording" - # to the start of the function "stop_recording". We want to keep the - # interval that we record as small as possible to minimize the run-time - # of our single-stepping recorder. 
- start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) - initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) - self.assertEqual(len(initial_threads), 1) - target.BreakpointDelete(start_recording_bkpt.GetID()) - self.start_recording() - stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) - lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) - target.BreakpointDelete(stop_recording_bkpt.GetID()) - self.stop_recording() - - self.dbg.SetAsync(async_mode) - self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) - - return target, process, initial_threads - - def expect_async_state_changes(self, async_mode, process, states): - if not async_mode: - return - listener = self.dbg.GetListener() - lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py deleted file mode 100644 index d610761b8cb0bc..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ /dev/null @@ -1,30 +0,0 @@ -import lldb -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test import lldbutil - - -class TestReverseContinueNotSupported(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue_not_supported(self): - self.build() - exe = self.getBuildArtifact("a.out") - target = self.dbg.CreateTarget(exe) - self.assertTrue(target, VALID_TARGET) - - main_bkpt = target.BreakpointCreateByName("main", None) - self.assertTrue(main_bkpt, VALID_BREAKPOINT) - - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, PROCESS_IS_VALID) - - # This will fail gracefully. 
- status = process.Continue(lldb.eRunReverse) - self.assertFailure(status, "target does not support reverse-continue") - - status = process.Continue() - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c deleted file mode 100644 index 40e45dc9f5c317..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/main.c +++ /dev/null @@ -1,14 +0,0 @@ -volatile int false_condition = 0; - -static void start_recording() {} - -static void trigger_breakpoint() {} - -static void stop_recording() {} - -int main() { - start_recording(); - trigger_breakpoint(); - stop_recording(); - return 0; -} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 211fd34957f496..558f889c4b7f23 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,9 +1045,6 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; - case lldb::eStopReasonHistoryBoundary: - body.try_emplace("reason", "history boundary"); - break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 1c5e3ac7008727..b38833c0fdb6b6 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,7 +111,6 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: - case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From e9c8f75d45ababe7f805078bbf7bda2e7425f1b7 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Thu, 10 Oct 
2024 15:59:51 -0700 Subject: [PATCH 090/177] [LLDB][Minidump] Have Minidumps save off and properly read TLS data (#109477) This patch adds the support to `Process.cpp` to automatically save off TLS sections, either via loading the memory region for the module, or via reading `fs_base` via generic register. Then when Minidumps are loaded, we now specify we want the dynamic loader to be the `POSIXDYLD` so we can leverage the same TLS accessor code as `ProcessELFCore`. Being able to access TLS Data is an important step for LLDB generated minidumps to have feature parity with ELF Core dumps. --- lldb/include/lldb/Target/DynamicLoader.h | 12 +++ lldb/source/Core/DynamicLoader.cpp | 7 +- .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp | 80 +++++++++++++++++++ .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.h | 6 ++ .../Process/minidump/ProcessMinidump.cpp | 20 ++++- .../Process/minidump/ProcessMinidump.h | 5 +- .../RegisterContextMinidump_x86_64.cpp | 20 ++++- lldb/source/Target/Process.cpp | 36 ++++++++- .../TestProcessSaveCoreMinidump.py | 77 +++++++++++++++++- .../process_save_core_minidump/main.cpp | 1 + 10 files changed, 248 insertions(+), 16 deletions(-) diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h index 0629e2faae7e9e..75bb6cb6bb9074 100644 --- a/lldb/include/lldb/Target/DynamicLoader.h +++ b/lldb/include/lldb/Target/DynamicLoader.h @@ -11,6 +11,7 @@ #include "lldb/Core/Address.h" #include "lldb/Core/PluginInterface.h" +#include "lldb/Target/CoreFileMemoryRanges.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Status.h" #include "lldb/Utility/UUID.h" @@ -337,6 +338,17 @@ class DynamicLoader : public PluginInterface { return std::nullopt; } + /// Returns a list of memory ranges that should be saved in the core file, + /// specific for this dynamic loader. + /// + /// For example, an implementation of this function can save the thread + /// local data of a given thread. 
+ virtual void CalculateDynamicSaveCoreRanges( + lldb_private::Process &process, + std::vector &ranges, + llvm::function_ref + save_thread_predicate) {}; + protected: // Utility methods for derived classes diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 7758a87403b5a3..68d6ab0850853f 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -83,7 +83,11 @@ ModuleSP DynamicLoader::GetTargetExecutable() { ModuleSpec module_spec(executable->GetFileSpec(), executable->GetArchitecture()); auto module_sp = std::make_shared(module_spec); - + // If we're a coredump and we already have a main executable, we don't + // need to reload the module list that target already has + if (!m_process->IsLiveDebugSession()) { + return executable; + } // Check if the executable has changed and set it to the target // executable if they differ. if (module_sp && module_sp->GetUUID().IsValid() && @@ -369,4 +373,3 @@ void DynamicLoader::LoadOperatingSystemPlugin(bool flush) if (m_process) m_process->LoadOperatingSystemPlugin(flush); } - diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index b9c0e174c3be68..34aca50df0ac4b 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -18,6 +18,7 @@ #include "lldb/Symbol/ObjectFile.h" #include "lldb/Target/MemoryRegionInfo.h" #include "lldb/Target/Platform.h" +#include "lldb/Target/RegisterContext.h" #include "lldb/Target/Target.h" #include "lldb/Target/Thread.h" #include "lldb/Target/ThreadPlanRunToAddress.h" @@ -866,3 +867,82 @@ bool DynamicLoaderPOSIXDYLD::AlwaysRelyOnEHUnwindInfo( bool DynamicLoaderPOSIXDYLD::IsCoreFile() const { return !m_process->IsLiveDebugSession(); } + +// For our ELF/POSIX builds save off the fs_base/gs_base regions +static void 
AddThreadLocalMemoryRegions(Process &process, ThreadSP &thread_sp, + std::vector &ranges) { + lldb::RegisterContextSP reg_ctx = thread_sp->GetRegisterContext(); + if (!reg_ctx) + return; + + const RegisterInfo *reg_info = reg_ctx->GetRegisterInfo( + lldb::RegisterKind::eRegisterKindGeneric, LLDB_REGNUM_GENERIC_TP); + if (!reg_info) + return; + + lldb_private::RegisterValue thread_local_register_value; + bool success = reg_ctx->ReadRegister(reg_info, thread_local_register_value); + if (!success) + return; + + const uint64_t fail_value = UINT64_MAX; + bool readSuccess = false; + const lldb::addr_t reg_value_addr = + thread_local_register_value.GetAsUInt64(fail_value, &readSuccess); + if (!readSuccess || reg_value_addr == fail_value) + return; + + MemoryRegionInfo thread_local_region; + Status err = process.GetMemoryRegionInfo(reg_value_addr, thread_local_region); + if (err.Fail()) + return; + + ranges.push_back(thread_local_region); +} + +// Save off the link map for core files. +static void AddLinkMapSections(Process &process, + std::vector &ranges) { + ModuleList &module_list = process.GetTarget().GetImages(); + Target *target = &process.GetTarget(); + for (size_t idx = 0; idx < module_list.GetSize(); idx++) { + ModuleSP module_sp = module_list.GetModuleAtIndex(idx); + if (!module_sp) + continue; + + ObjectFile *obj = module_sp->GetObjectFile(); + if (!obj) + continue; + Address addr = obj->GetImageInfoAddress(target); + addr_t load_addr = addr.GetLoadAddress(target); + if (load_addr == LLDB_INVALID_ADDRESS) + continue; + + MemoryRegionInfo link_map_section; + Status err = process.GetMemoryRegionInfo(load_addr, link_map_section); + if (err.Fail()) + continue; + + ranges.push_back(link_map_section); + } +} + +void DynamicLoaderPOSIXDYLD::CalculateDynamicSaveCoreRanges( + lldb_private::Process &process, + std::vector &ranges, + llvm::function_ref + save_thread_predicate) { + ThreadList &thread_list = process.GetThreadList(); + for (size_t idx = 0; idx < 
thread_list.GetSize(); idx++) { + ThreadSP thread_sp = thread_list.GetThreadAtIndex(idx); + if (!thread_sp) + continue; + + if (!save_thread_predicate(*thread_sp)) + continue; + + AddThreadLocalMemoryRegions(process, thread_sp, ranges); + } + + AddLinkMapSections(process, ranges); +} diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h index 4c92335602cdf4..bde334aaca40b4 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h @@ -60,6 +60,12 @@ class DynamicLoaderPOSIXDYLD : public lldb_private::DynamicLoader { lldb::addr_t base_addr, bool base_addr_is_offset) override; + void CalculateDynamicSaveCoreRanges( + lldb_private::Process &process, + std::vector &ranges, + llvm::function_ref + save_thread_predicate) override; + protected: /// Runtime linker rendezvous structure. DYLDRendezvous m_rendezvous; diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 32ffba763c08e3..5ea3db23f114c4 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -21,11 +21,13 @@ #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/Interpreter/OptionArgParser.h" #include "lldb/Interpreter/OptionGroupBoolean.h" +#include "lldb/Target/DynamicLoader.h" #include "lldb/Target/JITLoaderList.h" #include "lldb/Target/MemoryRegionInfo.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/Target.h" #include "lldb/Target/UnixSignals.h" +#include "lldb/Utility/DataBufferHeap.h" #include "lldb/Utility/LLDBAssert.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" @@ -34,6 +36,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Threading.h" +#include 
"Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h" #include "Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.h" #include "Plugins/Process/Utility/StopInfoMachException.h" @@ -333,6 +336,16 @@ ArchSpec ProcessMinidump::GetArchitecture() { return ArchSpec(triple); } +DataExtractor ProcessMinidump::GetAuxvData() { + std::optional> auxv = + m_minidump_parser->GetStream(StreamType::LinuxAuxv); + if (!auxv) + return DataExtractor(); + + return DataExtractor(auxv->data(), auxv->size(), GetByteOrder(), + GetAddressByteSize(), GetAddressByteSize()); +} + void ProcessMinidump::BuildMemoryRegions() { if (m_memory_regions) return; @@ -534,7 +547,12 @@ void ProcessMinidump::ReadModuleList() { module_sp = Module::CreateModuleFromObjectFile( module_spec, load_addr, load_size); - GetTarget().GetImages().Append(module_sp, true /* notify */); + // If we haven't loaded a main executable yet, set the first module to be + // main executable + if (!GetTarget().GetExecutableModule()) + GetTarget().SetExecutableModule(module_sp); + else + GetTarget().GetImages().Append(module_sp, true /* notify */); } bool load_addr_changed = false; diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h index f2ea0a2b61d14e..3d235670a33abc 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h @@ -53,12 +53,11 @@ class ProcessMinidump : public PostMortemProcess { Status DoLoadCore() override; - DynamicLoader *GetDynamicLoader() override { return nullptr; } + // Returns AUXV structure found in the core file + lldb_private::DataExtractor GetAuxvData() override; llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } - SystemRuntime *GetSystemRuntime() override { return nullptr; } - Status DoDestroy() override; void RefreshStateAfterStop() override; diff --git a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp 
b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp index e879c493156593..f305d1b7031d82 100644 --- a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp +++ b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp @@ -44,6 +44,17 @@ static void writeRegister(const void *reg_src, uint8_t *context, memcpy(reg_dest.data(), reg_src, reg_dest.size()); } +// TODO: Fix the registers in this file! +// writeRegister checks x86_64 registers without base registers. This causes +// an overlap in the register enum values. So we were truncating fs_base. +// We should standardize to the x86_64_with_base registers. +static void writeBaseRegister(const void *reg_src, uint8_t *context, + const RegisterInfo ®) { + auto bytes = reg.mutable_data(context); + llvm::MutableArrayRef reg_dest = bytes.take_front(8); + memcpy(reg_dest.data(), reg_src, reg_dest.size()); +} + lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( llvm::ArrayRef source_data, RegisterInfoInterface *target_reg_interface) { @@ -105,11 +116,12 @@ lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( writeRegister(&context->r15, result_base, reg_info[lldb_r15_x86_64]); } + // See comment on base regsiter if ((context_flags & LLDBSpecificFlag) == LLDBSpecificFlag) { - writeRegister(&context->fs_base, result_base, - reg_info[x86_64_with_base::lldb_fs_base]); - writeRegister(&context->gs_base, result_base, - reg_info[x86_64_with_base::lldb_gs_base]); + writeBaseRegister(&context->fs_base, result_base, + reg_info[x86_64_with_base::lldb_fs_base]); + writeBaseRegister(&context->gs_base, result_base, + reg_info[x86_64_with_base::lldb_gs_base]); } // TODO parse the floating point registers diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index aca08972811470..c009d17d3ba507 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -6528,6 +6528,29 @@ static void 
AddRegion(const MemoryRegionInfo ®ion, bool try_dirty_pages, CreateCoreFileMemoryRange(region)); } +static void SaveDynamicLoaderSections(Process &process, + const SaveCoreOptions &options, + CoreFileMemoryRanges &ranges, + std::set &stack_ends) { + DynamicLoader *dyld = process.GetDynamicLoader(); + if (!dyld) + return; + + std::vector dynamic_loader_mem_regions; + std::function save_thread_predicate = + [&](const lldb_private::Thread &t) -> bool { + return options.ShouldThreadBeSaved(t.GetID()); + }; + dyld->CalculateDynamicSaveCoreRanges(process, dynamic_loader_mem_regions, + save_thread_predicate); + for (const auto ®ion : dynamic_loader_mem_regions) { + // The Dynamic Loader can give us regions that could include a truncated + // stack + if (stack_ends.count(region.GetRange().GetRangeEnd()) == 0) + AddRegion(region, true, ranges); + } +} + static void SaveOffRegionsWithStackPointers(Process &process, const SaveCoreOptions &core_options, const MemoryRegionInfos ®ions, @@ -6559,11 +6582,13 @@ static void SaveOffRegionsWithStackPointers(Process &process, // off in other calls sp_region.GetRange().SetRangeBase(stack_head); sp_region.GetRange().SetByteSize(stack_size); - stack_ends.insert(sp_region.GetRange().GetRangeEnd()); + const addr_t range_end = sp_region.GetRange().GetRangeEnd(); + stack_ends.insert(range_end); // This will return true if the threadlist the user specified is empty, // or contains the thread id from thread_sp. - if (core_options.ShouldThreadBeSaved(thread_sp->GetID())) + if (core_options.ShouldThreadBeSaved(thread_sp->GetID())) { AddRegion(sp_region, try_dirty_pages, ranges); + } } } } @@ -6672,9 +6697,14 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, std::set stack_ends; // For fully custom set ups, we don't want to even look at threads if there // are no threads specified. 
- if (core_style != lldb::eSaveCoreCustomOnly || options.HasSpecifiedThreads()) + if (core_style != lldb::eSaveCoreCustomOnly || + options.HasSpecifiedThreads()) { SaveOffRegionsWithStackPointers(*this, options, regions, ranges, stack_ends); + // Save off the dynamic loader sections, so if we are on an architecture + // that supports Thread Locals, that we include those as well. + SaveDynamicLoaderSections(*this, options, ranges, stack_ends); + } switch (core_style) { case eSaveCoreUnspecified: diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 03cc415924e0bb..4818dde4f3b838 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -523,8 +523,10 @@ def minidump_deleted_on_save_failure(self): finally: self.assertTrue(self.dbg.DeleteTarget(target)) - def minidump_deterministic_difference(self): - """Test that verifies that two minidumps produced are identical.""" + @skipUnlessPlatform(["linux"]) + @skipUnlessArch("x86_64") + def minidump_saves_fs_base_region(self): + """Test that verifies the minidump file saves region for fs_base""" self.build() exe = self.getBuildArtifact("a.out") @@ -534,6 +536,45 @@ def minidump_deterministic_difference(self): None, None, self.get_process_working_directory() ) self.assertState(process.GetState(), lldb.eStateStopped) + thread = process.GetThreadAtIndex(0) + custom_file = self.getBuildArtifact("core.reg_region.dmp") + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + options.AddThread(thread) + error = process.SaveCore(options) + self.assertTrue(error.Success()) + + registers = thread.GetFrameAtIndex(0).GetRegisters() + fs_base = 
registers.GetFirstValueByName("fs_base").GetValueAsUnsigned() + self.assertTrue(fs_base != 0) + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(one_region_file) + core_region_list = core_proc.GetMemoryRegions() + live_region_list = process.GetMemoryRegions() + live_region = lldb.SBMemoryRegionInfo() + live_region_list.GetMemoryRegionForAddress(fs_base, live_region) + core_region = lldb.SBMemoryRegionInfo() + error = core_region_list.GetMemoryRegionForAddress(fs_base, core_region) + self.assertTrue(error.Success()) + self.assertEqual(live_region, core_region) + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + self.assertTrue(self.dbg.DeleteTarget(core_target)) + if os.path.isfile(custom_file): + os.unlink(custom_file) + + def minidump_deterministic_difference(self): + """Test that verifies that two minidumps produced are identical.""" + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) core_styles = [ lldb.eSaveCoreStackOnly, @@ -562,6 +603,36 @@ def minidump_deterministic_difference(self): self.assertEqual(file_one, file_two) self.assertTrue(os.unlink(spec_one.GetFileName())) self.assertTrue(os.unlink(spec_two.GetFileName())) - finally: self.assertTrue(self.dbg.DeleteTarget(target)) + + @skipUnlessPlatform(["linux"]) + @skipUnlessArch("x86_64") + def minidump_saves_fs_base_region(self): + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + thread = process.GetThreadAtIndex(0) + tls_file = self.getBuildArtifact("core.tls.dmp") + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(tls_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + 
options.AddThread(thread) + error = process.SaveCore(options) + self.assertTrue(error.Success()) + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(tls_file) + frame = core_proc.GetThreadAtIndex(0).GetFrameAtIndex(0) + tls_val = frame.FindValue("lf") + self.assertEqual(tls_val.GetValueAsUnsigned(), 42) + + except: + self.assertTrue(self.dbg.DeleteTarget(target)) + if os.path.isfile(tls_file): + os.unlink(tls_file) diff --git a/lldb/test/API/functionalities/process_save_core_minidump/main.cpp b/lldb/test/API/functionalities/process_save_core_minidump/main.cpp index fa34a371f20647..15daa68e9a648c 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/main.cpp +++ b/lldb/test/API/functionalities/process_save_core_minidump/main.cpp @@ -1,6 +1,7 @@ #include #include #include +thread_local size_t lf = 42; void g() { assert(false); } From 4f297566b3150097de26c6a23a987d2bd5fc19c5 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Fri, 11 Oct 2024 09:01:47 +1300 Subject: [PATCH 091/177] [lldb] Implement basic support for reverse-continue (#99736) This commit only adds support for the `SBProcess::ReverseContinue()` API. A user-accessible command for this will follow in a later commit. This feature depends on a gdbserver implementation (e.g. `rr`) providing support for the `bc` and `bs` packets. `lldb-server` does not support those packets, and there is no plan to change that. So, for testing purposes, `lldbreverse.py` wraps `lldb-server` with a Python implementation of *very limited* record-and-replay functionality for use by *tests only*. The majority of this PR is test infrastructure (about 700 of the 950 lines added). 
--- lldb/include/lldb/API/SBProcess.h | 1 + lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 + lldb/include/lldb/lldb-enumerations.h | 6 + .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 ++++++++ .../Python/lldbsuite/test/lldbreverse.py | 418 ++++++++++++++++++ .../Python/lldbsuite/test/lldbtest.py | 2 + lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 + .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 + .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 + .../gdb-remote/GDBRemoteCommunicationClient.h | 6 + .../GDBRemoteCommunicationServerLLGS.cpp | 1 + .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +++- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- lldb/source/Target/StopInfo.cpp | 29 ++ lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 + .../TestReverseContinueBreakpoints.py | 115 +++++ .../TestReverseContinueNotSupported.py | 30 ++ .../functionalities/reverse-execution/main.c | 14 + lldb/tools/lldb-dap/JSONUtils.cpp | 3 + lldb/tools/lldb-dap/LLDBUtils.cpp | 1 + 32 files changed, 978 insertions(+), 44 deletions(-) create mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py create mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py create mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py create mode 100644 
lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 1624e02070b1b2..8b8ed830b54cc0 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,6 +159,7 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); + lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index b8c53a474ba6b9..fe7fbc50fd5770 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(); + Status Resume(lldb::RunDirection direction = lldb::eRunForward); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream); + Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); /// Halts a running process. /// @@ -1104,9 +1104,14 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume() { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); + virtual Status DoResume(lldb::RunDirection direction) { + if (direction == lldb::RunDirection::eRunForward) { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } /// Called after resuming a process. 
@@ -2332,6 +2337,8 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; + lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } + DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2851,7 +2858,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(); + Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); // Called internally void CompleteAttach(); @@ -3127,6 +3134,8 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. + // The direction of execution from the last time this process was resumed. + lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index fae90364deaf0a..072f71f6b1122f 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,6 +142,12 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); + // This creates a StopInfo indicating that execution stopped because + // it was replaying some recorded execution history, and execution reached + // the end of that recorded history. 
+ static lldb::StopInfoSP + CreateStopReasonHistoryBoundary(Thread &thread, const char *description); + static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 938f6e3abe8f2a..232d1dfdb5c9d0 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,6 +135,9 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; +/// Execution directions +enum RunDirection { eRunForward, eRunReverse }; + /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -254,6 +257,9 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt + // Indicates that execution stopped because the debugger backend relies + // on recorded data and we reached the end of that data. + eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 1784487323ad6b..732d6171320680 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,8 +510,9 @@ def start(self): self._thread.start() def stop(self): - self._thread.join() - self._thread = None + if self._thread is not None: + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py new file mode 100644 index 00000000000000..2a9592bf4545a4 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py @@ -0,0 +1,175 @@ +import logging +import os +import os.path +import random + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +import lldbgdbserverutils +from lldbsuite.support import seven + + +class GDBProxyTestBase(TestBase): + """ + Base class for gdbserver proxy tests. + + This class will setup and start a mock GDB server for the test to use. + It pases through requests to a regular lldb-server/debugserver and + forwards replies back to the LLDB under test. 
+ """ + + """The gdbserver that we implement.""" + server = None + """The inner lldb-server/debugserver process that we proxy requests into.""" + monitor_server = None + monitor_sock = None + + server_socket_class = TCPServerSocket + + DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) + + _verbose_log_handler = None + _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") + + def setUpBaseLogging(self): + self.logger = logging.getLogger(__name__) + + if len(self.logger.handlers) > 0: + return # We have set up this handler already + + self.logger.propagate = False + self.logger.setLevel(logging.DEBUG) + + # log all warnings to stderr + handler = logging.StreamHandler() + handler.setLevel(logging.WARNING) + handler.setFormatter(self._log_formatter) + self.logger.addHandler(handler) + + def setUp(self): + TestBase.setUp(self) + + self.setUpBaseLogging() + + if self.isVerboseLoggingRequested(): + # If requested, full logs go to a log file + log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" + self._verbose_log_handler = logging.FileHandler( + log_file_name + ) + self._verbose_log_handler.setFormatter(self._log_formatter) + self._verbose_log_handler.setLevel(logging.DEBUG) + self.logger.addHandler(self._verbose_log_handler) + + lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() + if lldb_server_exe is None: + self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() + self.assertTrue(self.debug_monitor_exe is not None) + self.debug_monitor_extra_args = [] + else: + self.debug_monitor_exe = lldb_server_exe + self.debug_monitor_extra_args = ["gdbserver"] + + self.server = MockGDBServer(self.server_socket_class()) + self.server.responder = self + + def tearDown(self): + # TestBase.tearDown will kill the process, but we need to kill it early + # so its client connection closes and we can stop the server before + # finally calling the base tearDown. 
+ if self.process() is not None: + self.process().Kill() + self.server.stop() + + self.logger.removeHandler(self._verbose_log_handler) + self._verbose_log_handler = None + + TestBase.tearDown(self) + + def isVerboseLoggingRequested(self): + # We will report our detailed logs if the user requested that the "gdb-remote" channel is + # logged. + return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) + + def connect(self, target): + """ + Create a process by connecting to the mock GDB server. + """ + self.prep_debug_monitor_and_inferior() + self.server.start() + + listener = self.dbg.GetListener() + error = lldb.SBError() + process = target.ConnectRemote( + listener, self.server.get_connect_url(), "gdb-remote", error + ) + self.assertTrue(error.Success(), error.description) + self.assertTrue(process, PROCESS_IS_VALID) + return process + + def get_next_port(self): + return 12000 + random.randint(0, 3999) + + def prep_debug_monitor_and_inferior(self): + inferior_exe_path = self.getBuildArtifact("a.out") + self.connect_to_debug_monitor([inferior_exe_path]) + self.assertIsNotNone(self.monitor_server) + self.initial_handshake() + + def initial_handshake(self): + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "OK") + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + + def get_debug_monitor_command_line_args(self, connect_address, launch_args): + return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args + + 
def launch_debug_monitor(self, launch_args): + family, type, proto, _, addr = socket.getaddrinfo( + "localhost", 0, proto=socket.IPPROTO_TCP + )[0] + sock = socket.socket(family, type, proto) + sock.settimeout(self.DEFAULT_TIMEOUT) + sock.bind(addr) + sock.listen(1) + addr = sock.getsockname() + connect_address = "[{}]:{}".format(*addr) + + commandline_args = self.get_debug_monitor_command_line_args( + connect_address, launch_args + ) + + # Start the server. + self.logger.info(f"Spawning monitor {commandline_args}") + monitor_process = self.spawnSubprocess( + self.debug_monitor_exe, commandline_args, install_remote=False + ) + self.assertIsNotNone(monitor_process) + + self.monitor_sock = sock.accept()[0] + self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) + return monitor_process + + def connect_to_debug_monitor(self, launch_args): + monitor_process = self.launch_debug_monitor(launch_args) + self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) + + def respond(self, packet): + """Subclasses can override this to change how packets are handled.""" + return self.pass_through(packet) + + def pass_through(self, packet): + self.logger.info(f"Sending packet {packet}") + self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.logger.info(f"Received reply {reply}") + return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py new file mode 100644 index 00000000000000..0f02fdffbdeada --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbreverse.py @@ -0,0 +1,418 @@ +import os +import os.path +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbproxy import * +import lldbgdbserverutils +import re + + +class ThreadSnapshot: + def __init__(self, thread_id, registers): + self.thread_id = thread_id + self.registers 
class ThreadSnapshot:
    """Saved register state of one thread at a single point in time."""

    def __init__(self, thread_id, registers):
        self.thread_id = thread_id
        # Map of lldb register index -> hex-encoded value string, exactly as
        # returned by the gdb-remote 'p' packet.
        self.registers = registers


class MemoryBlockSnapshot:
    """Saved contents of one contiguous block of target memory."""

    def __init__(self, address, data):
        self.address = address
        # Hex-encoded byte string as captured from an 'm' packet reply.
        self.data = data


class StateSnapshot:
    """All thread registers plus stack memory, captured before one instruction."""

    def __init__(self, thread_snapshots, memory):
        self.thread_snapshots = thread_snapshots
        self.memory = memory
        # Filled in later with the ID of the thread that was stepped.
        self.thread_id = None


class RegisterInfo:
    """Describes one general-purpose register reported by qRegisterInfo."""

    def __init__(self, lldb_index, bitsize, little_endian):
        self.lldb_index = lldb_index
        self.bitsize = bitsize
        self.little_endian = little_endian


# Stack bytes saved around each thread's SP when snapshotting.
BELOW_STACK_POINTER = 16384
ABOVE_STACK_POINTER = 4096

# Granularity (bytes) of memory reads/writes used for snapshots.
BLOCK_SIZE = 1024

# Indices into ReverseTestBase.breakpoints, matching the type field of the
# gdb-remote 'z'/'Z' packets.
SOFTWARE_BREAKPOINTS = 0
HARDWARE_BREAKPOINTS = 1
WRITE_WATCHPOINTS = 2


class ReverseTestBase(GDBProxyTestBase):
    """
    Base class for tests that need reverse execution.

    This class uses a gdbserver proxy to add very limited reverse-
    execution capability to lldb-server/debugserver for testing
    purposes only.

    To use this class, run the inferior forward until some stopping point.
    Then call `start_recording()` and execute forward again until reaching
    a software breakpoint; this class records the state before each execution executes.
    At that point, the server will accept "bc" and "bs" packets to step
    backwards through the state.
    When executing during recording, we only allow single-step and continue without
    delivering a signal, and only software breakpoint stops are allowed.

    We assume that while recording is enabled, the only effects of instructions
    are on general-purpose registers (read/written by the 'g' and 'G' packets)
    and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER).
    """

    # A list of StateSnapshots in time order.
    #
    # There is one snapshot per single-stepped instruction, representing the
    # state before that instruction was executed. The last snapshot in the
    # list is the snapshot before the last instruction was executed.
    # This is an undo log; we snapshot a superset of the state that may have
    # been changed by the instruction's execution.
    snapshots = None
    recording_enabled = False

    # Five sets indexed by z/Z breakpoint type 0-4; only SOFTWARE_BREAKPOINTS,
    # HARDWARE_BREAKPOINTS and WRITE_WATCHPOINTS are consulted.
    breakpoints = None

    pid = None

    pc_register_info = None
    sp_register_info = None
    general_purpose_register_info = None

    def __init__(self, *args, **kwargs):
        GDBProxyTestBase.__init__(self, *args, **kwargs)
        self.breakpoints = [set(), set(), set(), set(), set()]

    def respond(self, packet):
        """Intercept proxied packets and implement "bs"/"bc" on top of them."""
        if not packet:
            raise ValueError("Invalid empty packet")
        if packet == self.server.PACKET_INTERRUPT:
            # Don't send a response. We'll just run to completion.
            return []
        if self.is_command(packet, "qSupported", ":"):
            # Advertise our reverse-execution extensions on top of whatever
            # the real server supports.
            reply = self.pass_through(packet)
            return reply + ";ReverseStep+;ReverseContinue+"
        if self.is_command(packet, "vCont", ";"):
            if self.recording_enabled:
                return self.continue_with_recording(packet)
            # A forward continue invalidates any recorded history (see
            # stop_recording()).
            # BUG FIX: the original assigned a dead local `snapshots = []`;
            # the history lives in self.snapshots.
            self.snapshots = []
        if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S":
            raise ValueError("LLDB should not be sending old-style continuation packets")
        if packet == "bc":
            return self.reverse_continue()
        if packet == "bs":
            return self.reverse_step()
        if packet == "jThreadsInfo":
            # Suppress this because it contains thread stop reasons which we might
            # need to modify, and we don't want to have to implement that.
            return ""
        if packet[0] == "z" or packet[0] == "Z":
            reply = self.pass_through(packet)
            if reply == "OK":
                self.update_breakpoints(packet)
            return reply
        return GDBProxyTestBase.respond(self, packet)

    def start_recording(self):
        """Start snapshotting the target before each forward step/continue."""
        self.recording_enabled = True
        self.snapshots = []

    def stop_recording(self):
        """
        Don't record when executing forward.

        Reverse execution is still supported until the next forward continue.
        """
        self.recording_enabled = False

    def is_command(self, packet, cmd, follow_token):
        # True when packet is exactly `cmd`, or starts with `cmd` + follow_token.
        return packet == cmd or packet[0 : len(cmd) + 1] == cmd + follow_token

    def update_breakpoints(self, packet):
        # Track z/Z (remove/insert breakpoint) packets so we can recognize
        # breakpoint and watchpoint hits while reverse-executing.
        m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet)
        if m is None:
            raise ValueError("Invalid breakpoint packet: " + packet)
        t = int(m.group(2))
        addr = int(m.group(3), 16)
        kind = int(m.group(4), 16)
        if m.group(1) == "Z":
            self.breakpoints[t].add((addr, kind))
        else:
            self.breakpoints[t].discard((addr, kind))

    def breakpoint_triggered_at(self, pc):
        # True if a software or hardware breakpoint is set at exactly `pc`.
        if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]):
            return True
        if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]):
            return True
        return False

    def watchpoint_triggered(self, new_value_block, current_contents):
        """Returns the address or None."""
        # BUG FIX: the original iterated over the bare name `breakpoints`
        # (NameError at runtime); the breakpoint sets live on self.
        for watch_addr, kind in self.breakpoints[WRITE_WATCHPOINTS]:
            for offset in range(0, kind):
                addr = watch_addr + offset
                # NOTE(review): block.data is hex text (2 chars per byte), so
                # comparing addr against address + len(data) overshoots the
                # byte range; out-of-range indices compare empty slices and
                # are harmless, but confirm the intended bound.
                if addr >= new_value_block.address and addr < new_value_block.address + len(
                    new_value_block.data
                ):
                    index = addr - new_value_block.address
                    if (
                        new_value_block.data[index * 2 : (index + 1) * 2]
                        != current_contents[index * 2 : (index + 1) * 2]
                    ):
                        return watch_addr
        return None

    def continue_with_recording(self, packet):
        """Single-step the target, snapshotting before each step, until a
        software breakpoint is hit (or one step completes, for vCont;s)."""
        self.logger.debug("Continue with recording enabled")

        step_packet = "vCont;s"
        if packet == "vCont":
            requested_step = False
        else:
            m = re.match("vCont;(c|s)(.*)", packet)
            if m is None:
                raise ValueError("Unsupported vCont packet: " + packet)
            requested_step = m.group(1) == "s"
            step_packet += m.group(2)

        while True:
            snapshot = self.capture_snapshot()
            reply = self.pass_through(step_packet)
            (stop_signal, stop_pairs) = self.parse_stop(reply)
            if stop_signal != 5:
                raise ValueError("Unexpected stop signal: " + reply)
            is_swbreak = False
            thread_id = None
            for key, value in stop_pairs.items():
                if key == "thread":
                    thread_id = self.parse_thread_id(value)
                    continue
                if re.match("[0-9a-f]+", key):
                    # Register-number/value pair; not needed here.
                    continue
                if key == "swbreak" or (key == "reason" and value == "breakpoint"):
                    is_swbreak = True
                    continue
                if key in ["name", "threads", "thread-pcs", "reason"]:
                    continue
                raise ValueError(f"Unknown stop key '{key}' in {reply}")
            if is_swbreak:
                self.logger.debug("Recording stopped")
                return reply
            if thread_id is None:
                # BUG FIX: the original *returned* the ValueError instance
                # instead of raising it.
                raise ValueError("Expected thread ID: " + reply)
            snapshot.thread_id = thread_id
            self.snapshots.append(snapshot)
            if requested_step:
                self.logger.debug("Recording stopped for step")
                return reply

    def parse_stop(self, reply):
        """Parse a 'Txx...' stop reply into (signal_number, {key: value})."""
        result = {}
        if not reply:
            raise ValueError("Invalid empty packet")
        if reply[0] == "T" and len(reply) >= 3:
            result = {k: v for k, v in self.parse_pairs(reply[3:])}
            return (int(reply[1:3], 16), result)
        # BUG FIX: raising a plain string is a TypeError in Python 3.
        raise ValueError("Unsupported stop reply: " + reply)

    def parse_pairs(self, text):
        # Yield (key, value) tuples from a "k1:v1;k2:v2;..." string.
        for pair in text.split(";"):
            if not pair:
                continue
            m = re.match("([^:]+):(.*)", pair)
            if m is None:
                raise ValueError("Invalid pair text: " + text)
            yield (m.group(1), m.group(2))

    def capture_snapshot(self):
        """Snapshot all threads and their stack memories."""
        self.ensure_register_info()
        current_thread = self.get_current_thread()
        thread_snapshots = []
        memory = []
        for thread_id in self.get_thread_list():
            registers = {}
            for index in sorted(self.general_purpose_register_info.keys()):
                reply = self.pass_through(f"p{index:x};thread:{thread_id:x};")
                if reply == "" or reply[0] == "E":
                    raise ValueError("Can't read register")
                registers[index] = reply
            thread_snapshot = ThreadSnapshot(thread_id, registers)
            thread_sp = self.get_register(
                self.sp_register_info, thread_snapshot.registers
            )
            memory += self.read_memory(
                thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER
            )
            thread_snapshots.append(thread_snapshot)
        self.set_current_thread(current_thread)
        return StateSnapshot(thread_snapshots, memory)

    def restore_snapshot(self, snapshot):
        """
        Restore the snapshot during reverse execution.

        If this triggers a breakpoint or watchpoint, return the stop reply,
        otherwise None.
        """
        current_thread = self.get_current_thread()
        stop_reasons = []
        for thread_snapshot in snapshot.thread_snapshots:
            thread_id = thread_snapshot.thread_id
            for lldb_index in sorted(thread_snapshot.registers.keys()):
                data = thread_snapshot.registers[lldb_index]
                reply = self.pass_through(
                    f"P{lldb_index:x}={data};thread:{thread_id:x};"
                )
                if reply != "OK":
                    raise ValueError("Can't restore thread register")
            if thread_id == snapshot.thread_id:
                new_pc = self.get_register(
                    self.pc_register_info, thread_snapshot.registers
                )
                if self.breakpoint_triggered_at(new_pc):
                    stop_reasons.append([("reason", "breakpoint")])
        self.set_current_thread(current_thread)
        for block in snapshot.memory:
            # BUG FIX: len(...)/2 yields a float and "{:x}" on a float raises
            # ValueError; the packet length must be an integer byte count.
            num_bytes = len(block.data) // 2
            current_memory = self.pass_through(f"m{block.address:x},{num_bytes:x}")
            if not current_memory or current_memory[0] == "E":
                raise ValueError("Can't read back memory")
            reply = self.pass_through(
                f"M{block.address:x},{num_bytes:x}:" + block.data
            )
            if reply != "OK":
                raise ValueError("Can't restore memory")
            # NOTE(review): current_memory[1:] drops the first hex digit of
            # the 'm' reply — confirm the leading character really is framing
            # and not data (read_memory does the same).
            watch_addr = self.watchpoint_triggered(block, current_memory[1:])
            if watch_addr is not None:
                stop_reasons.append(
                    [("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]
                )
        if stop_reasons:
            pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0])
            return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};"
        return None

    def reverse_step(self):
        # Undo one instruction, or report the history boundary if there is
        # nothing left to undo.
        if not self.snapshots:
            self.logger.debug("Reverse-step at history boundary")
            return self.history_boundary_reply(self.get_current_thread())
        self.logger.debug("Reverse-step started")
        snapshot = self.snapshots.pop()
        stop_reply = self.restore_snapshot(snapshot)
        self.set_current_thread(snapshot.thread_id)
        self.logger.debug("Reverse-step stopped")
        if stop_reply is None:
            return self.singlestep_stop_reply(snapshot.thread_id)
        return stop_reply

    def reverse_continue(self):
        # Undo instructions until a breakpoint/watchpoint fires or the
        # recorded history is exhausted.
        self.logger.debug("Reverse-continue started")
        thread_id = None
        while self.snapshots:
            snapshot = self.snapshots.pop()
            stop_reply = self.restore_snapshot(snapshot)
            thread_id = snapshot.thread_id
            if stop_reply is not None:
                self.set_current_thread(thread_id)
                self.logger.debug("Reverse-continue stopped")
                return stop_reply
        if thread_id is None:
            thread_id = self.get_current_thread()
        else:
            self.set_current_thread(snapshot.thread_id)
        self.logger.debug("Reverse-continue stopped at history boundary")
        return self.history_boundary_reply(thread_id)

    def get_current_thread(self):
        reply = self.pass_through("qC")
        return self.parse_thread_id(reply[2:])

    def parse_thread_id(self, thread_id):
        """Parse an optionally pid-qualified "[pNN.]NN" thread ID string."""
        m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id)
        if m is None:
            raise ValueError("Invalid thread ID: " + thread_id)
        # BUG FIX guard: without a "pNN." prefix m.group(2) is None, and
        # int(None, 16) raises TypeError; only learn the PID when present.
        if self.pid is None and m.group(2) is not None:
            self.pid = int(m.group(2), 16)
        return int(m.group(3), 16)

    def history_boundary_reply(self, thread_id):
        return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;"

    def singlestep_stop_reply(self, thread_id):
        return f"T05thread:{self.pid:x}.{thread_id:x};"

    def set_current_thread(self, thread_id):
        """
        Set current thread in inner gdbserver.
        """
        if thread_id >= 0:
            self.pass_through(f"Hg{self.pid:x}.{thread_id:x}")
            self.pass_through(f"Hc{self.pid:x}.{thread_id:x}")
        else:
            self.pass_through(f"Hc-1.-1")
            self.pass_through(f"Hg-1.-1")

    def get_register(self, register_info, registers):
        """Decode one captured register's hex string into an integer value."""
        if register_info.bitsize % 8 != 0:
            raise ValueError("Register size must be a multiple of 8 bits")
        if register_info.lldb_index not in registers:
            raise ValueError("Register value not captured")
        data = registers[register_info.lldb_index]
        num_bytes = register_info.bitsize // 8
        bytes = []
        for i in range(0, num_bytes):
            bytes.append(int(data[i * 2 : (i + 1) * 2], 16))
        if register_info.little_endian:
            bytes.reverse()
        result = 0
        for byte in bytes:
            result = (result << 8) + byte
        return result

    def read_memory(self, start_addr, end_addr):
        """
        Read a region of memory from the target.

        Some of the addresses may extend into invalid virtual memory;
        skip those areas.
        Return a list of blocks containing the valid area(s) in the
        requested range.
        """
        regions = []
        # BUG FIX: the original masked with (BLOCK_SIZE - 1), which keeps
        # only the low bits (mapping every address into [0, BLOCK_SIZE));
        # block alignment requires the complement mask.
        start_addr = start_addr & ~(BLOCK_SIZE - 1)
        end_addr = (end_addr + BLOCK_SIZE - 1) & ~(BLOCK_SIZE - 1)
        for addr in range(start_addr, end_addr, BLOCK_SIZE):
            reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}")
            if reply and reply[0] != "E":
                # NOTE(review): reply[1:] drops the first hex digit of the
                # 'm' reply; confirm the leading character is protocol
                # framing rather than data (restore_snapshot matches this).
                block = MemoryBlockSnapshot(addr, reply[1:])
                regions.append(block)
        return regions

    def ensure_register_info(self):
        # Lazily query qRegisterInfo once, caching the general-purpose
        # register layout plus the generic pc/sp registers.
        if self.general_purpose_register_info is not None:
            return
        reply = self.pass_through("qHostInfo")
        little_endian = any(
            kv == ("endian", "little") for kv in self.parse_pairs(reply)
        )
        self.general_purpose_register_info = {}
        lldb_index = 0
        while True:
            reply = self.pass_through(f"qRegisterInfo{lldb_index:x}")
            if not reply or reply[0] == "E":
                break
            info = {k: v for k, v in self.parse_pairs(reply)}
            reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian)
            if info["set"] == "General Purpose Registers" and not "container-regs" in info:
                self.general_purpose_register_info[lldb_index] = reg_info
            if "generic" in info:
                if info["generic"] == "pc":
                    self.pc_register_info = reg_info
                elif info["generic"] == "sp":
                    self.sp_register_info = reg_info
            lldb_index += 1
        if self.pc_register_info is None or self.sp_register_info is None:
            raise ValueError("Can't find generic pc or sp register")

    def get_thread_list(self):
        # Collect thread IDs via qfThreadInfo/qsThreadInfo until the 'l'
        # (end-of-list) reply.
        threads = []
        reply = self.pass_through("qfThreadInfo")
        while True:
            if not reply:
                raise ValueError("Missing reply packet")
            if reply[0] == "m":
                for id in reply[1:].split(","):
                    threads.append(self.parse_thread_id(id))
            elif reply[0] == "l":
                return threads
            reply = self.pass_through("qsThreadInfo")
should be stopped due to watchpoint" +STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" + DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 9773144723c34c..07780f9f9c8393 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,6 +564,10 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { + return Continue(RunDirection::eRunForward); +} + +SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -574,9 +578,9 @@ SBError SBProcess::Continue() { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(); + sb_error.ref() = process_sp->Resume(direction); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index a99456e06d0329..aca8a039952960 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,6 +172,7 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -233,6 +234,7 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 8d3a82ef6c990a..ea60492ac46a10 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,7 +2553,8 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || + reason == eStopReasonHistoryBoundary) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index de047ee214c11e..b0aa664775b463 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,6 +82,9 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; + case eStopReasonHistoryBoundary: + log.Printf("%s: %s history boundary", __FUNCTION__, header); + return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 9b2907c6809965..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,9 +402,16 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume() { +Status ProcessKDP::DoResume(RunDirection direction) { Status error; Log *log = GetLog(KDPLog::Process); + + if 
(direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index e5ec5914f9600d..1b71d83f70b087 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume() override; + lldb_private::Status DoResume(lldb::RunDirection direction) override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 703aa082f0476f..76b7095deaa503 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,11 +204,17 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume() { +Status ProcessWindows::DoResume(RunDirection direction) { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; + if (direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index e97cfb790248be..97284b7cd1436e 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index e42526c8fd7266..fc792a4409410b 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,6 +199,20 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } +bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { + if (m_supports_reverse_continue == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_continue == eLazyBoolYes; +} + +bool GDBRemoteCommunicationClient::GetReverseStepSupported() { + if (m_supports_reverse_step == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_step == eLazyBoolYes; +} + bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -295,6 +309,8 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; + m_supports_reverse_continue = 
eLazyBoolCalculate; + m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -348,6 +364,8 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; + m_supports_reverse_continue = eLazyBoolNo; + m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -401,6 +419,10 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; + else if (x == "ReverseContinue+") + m_supports_reverse_continue = eLazyBoolYes; + else if (x == "ReverseStep+") + m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 898d176abc3465..116b47c1edf033 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,6 +331,10 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); + bool GetReverseContinueSupported(); + + bool GetReverseStepSupported(); + LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -561,6 +565,8 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; + LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; + LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 35fa93e53bc66f..4016cde74ebea8 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,6 +716,7 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3e09c316d74f44..3fc03bd05d5df0 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,6 +169,10 @@ class PluginProperties : public Properties { } }; +std::chrono::seconds ResumeTimeout() { + return std::chrono::seconds(5); +} + } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1180,10 +1184,11 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume() { +Status ProcessGDBRemote::DoResume(RunDirection direction) { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", + direction == RunDirection::eRunForward ? 
"" : "reverse"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1197,12 +1202,21 @@ Status ProcessGDBRemote::DoResume() { StreamString continue_packet; bool continue_packet_error = false; - if (m_gdb_comm.HasAnyVContSupport()) { + // Number of threads continuing with "c", i.e. continuing without a signal to deliver. + const size_t num_continue_c_tids = m_continue_c_tids.size(); + // Number of threads continuing with "C", i.e. continuing with a signal to deliver. + const size_t num_continue_C_tids = m_continue_C_tids.size(); + // Number of threads continuing with "s", i.e. single-stepping. + const size_t num_continue_s_tids = m_continue_s_tids.size(); + // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. + const size_t num_continue_S_tids = m_continue_S_tids.size(); + if (direction == RunDirection::eRunForward && + m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (m_continue_c_tids.size() == num_threads || + if (num_continue_c_tids == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1265,14 +1279,11 @@ Status ProcessGDBRemote::DoResume() { } else continue_packet_error = true; - if (continue_packet_error) { + if (direction == RunDirection::eRunForward && continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server. 
We need to try and - // make a simple packet that can do our continue - const size_t num_continue_c_tids = m_continue_c_tids.size(); - const size_t num_continue_C_tids = m_continue_C_tids.size(); - const size_t num_continue_s_tids = m_continue_s_tids.size(); - const size_t num_continue_S_tids = m_continue_S_tids.size(); + // that wasn't supported by the remote GDB server, or it's the reverse + // direction. We need to try and make a simple packet that can do our + // continue. if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1363,9 +1374,41 @@ Status ProcessGDBRemote::DoResume() { } } + if (direction == RunDirection::eRunReverse && continue_packet_error) { + if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); + return Status::FromErrorString("can't deliver signals while running in reverse"); + } + + if (num_continue_s_tids > 0) { + if (num_continue_s_tids > 1) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); + return Status::FromErrorString("can't step multiple threads while reverse-stepping"); + } + + if (!m_gdb_comm.GetReverseStepSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); + return Status::FromErrorString("target does not support reverse-stepping"); + } + + m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); + continue_packet.PutCString("bs"); + } else { + if (!m_gdb_comm.GetReverseContinueSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); + return Status::FromErrorString("target does not support reverse-continue"); + } + + // All threads continue whether requested or not --- + // we can't change how threads ran in the past. 
+ continue_packet.PutCString("bc"); + } + + continue_packet_error = false; + } + if (continue_packet_error) { - error = - Status::FromErrorString("can't make continue packet for this resume"); + return Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1380,7 +1423,7 @@ Status ProcessGDBRemote::DoResume() { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { + if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1863,6 +1906,10 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; + } else if (reason == "replaylog") { + thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( + *thread_sp, description.c_str())); + handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2318,6 +2365,8 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; + } else if (key.compare("replaylog") == 0) { + reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 2492795851388a..fa3e1cec76e2b3 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote : public Process, // 
Process Control Status WillResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index d2111ce877ce55..304c12173dd35d 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,10 +182,15 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume() { +Status ScriptedProcess::DoResume(RunDirection direction) { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - return GetInterface().Resume(); + if (direction == RunDirection::eRunForward) { + return GetInterface().Resume(); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 0335364b4010b2..8ebe4ca5f3d449 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index c009d17d3ba507..fd683728388215 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,7 +446,8 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), m_allocated_memory_cache(*this), 
m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_last_run_direction(eRunForward), + m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -845,6 +846,7 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: + case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1352,7 +1354,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume() { +Status Process::Resume(RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1361,7 +1363,7 @@ Status Process::Resume() { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1369,7 +1371,7 @@ Status Process::Resume() { return error; } -Status Process::ResumeSynchronous(Stream *stream) { +Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1382,7 +1384,7 @@ Status Process::ResumeSynchronous(Stream *stream) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (error.Success()) { StateType state = 
WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3239,7 +3241,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume() { +Status Process::PrivateResume(RunDirection direction) { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3255,6 +3257,15 @@ Status Process::PrivateResume() { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); + if (m_last_run_direction != direction) { + // In the future we might want to support mixed-direction plans, + // e.g. a forward step-over stops at a breakpoint, the user does + // a reverse-step, then disables the breakpoint and continues forward. + // This code will need to be changed to support that. + m_thread_list.DiscardThreadPlans(); + m_last_run_direction = direction; + } + Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3272,7 +3283,7 @@ Status Process::PrivateResume() { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(); + error = DoResume(direction); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3735,7 +3746,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(); + PrivateResume(m_last_run_direction); } } else { return_value = true; @@ -4346,7 +4357,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(); + process_sp->PrivateResume(process_sp->m_last_run_direction); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index bd7032b803df90..08e9a7c099bad2 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,6 +1212,30 @@ class StopInfoProcessorTrace : public StopInfo { } }; +// StopInfoHistoryBoundary + +class StopInfoHistoryBoundary : public StopInfo { +public: + StopInfoHistoryBoundary(Thread &thread, const char *description) + : StopInfo(thread, LLDB_INVALID_UID) { + if (description) + SetDescription(description); + } + + ~StopInfoHistoryBoundary() override = default; + + StopReason GetStopReason() const override { + return eStopReasonHistoryBoundary; + } + + const char *GetDescription() override { + if (m_description.empty()) + return "history boundary"; + else + return m_description.c_str(); + } +}; + // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1439,6 +1463,11 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } +StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, + const char *description) { + return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); +} + StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 902fbb2b519ef7..bbb586f033b746 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,10 +624,12 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - if (reg_ctx_sp) { + ProcessSP process_sp(GetProcess()); + if (reg_ctx_sp && process_sp && + process_sp->GetLastRunDirection() == eRunForward) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); + process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1732,6 +1734,8 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: + return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py new file mode 100644 index 00000000000000..b37578fbd82468 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -0,0 +1,115 @@ +import lldb +import time +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbreverse import ReverseTestBase +from lldbsuite.test import lldbutil + + +class TestReverseContinueBreakpoints(ReverseTestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue(self): + self.reverse_continue_internal(async_mode=False) + + def 
test_reverse_continue_async(self): + self.reverse_continue_internal(async_mode=True) + + def reverse_continue_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue. We'll stop at the point where we started recording. + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + # Continue forward normally until the target exits. + status = process.Continue() + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) + + def test_reverse_continue_breakpoint(self): + self.reverse_continue_breakpoint_internal(async_mode=False) + + def test_reverse_continue_breakpoint_async(self): + self.reverse_continue_breakpoint_internal(async_mode=True) + + def reverse_continue_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue to the function "trigger_breakpoint". 
+ trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) + self.assertEqual(threads_now, initial_threads) + + def test_reverse_continue_skip_breakpoint(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=False) + + def test_reverse_continue_skip_breakpoint_async(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=True) + + def reverse_continue_skip_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue over a breakpoint at "trigger_breakpoint" whose + # condition is false. + # This tests that we continue in the correct direction after hitting + # the breakpoint. + trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + trigger_bkpt.SetCondition("false_condition") + status = process.Continue(lldb.eRunReverse) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.assertSuccess(status) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + def setup_recording(self, async_mode): + """ + Record execution of code between "start_recording" and "stop_recording" breakpoints. + + Returns with the target stopped at "stop_recording", with recording disabled, + ready to reverse-execute. + """ + self.build() + target = self.dbg.CreateTarget("") + process = self.connect(target) + + # Record execution from the start of the function "start_recording" + # to the start of the function "stop_recording". We want to keep the + # interval that we record as small as possible to minimize the run-time + # of our single-stepping recorder. 
+ start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) + initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) + self.assertEqual(len(initial_threads), 1) + target.BreakpointDelete(start_recording_bkpt.GetID()) + self.start_recording() + stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) + lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) + target.BreakpointDelete(stop_recording_bkpt.GetID()) + self.stop_recording() + + self.dbg.SetAsync(async_mode) + self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) + + return target, process, initial_threads + + def expect_async_state_changes(self, async_mode, process, states): + if not async_mode: + return + listener = self.dbg.GetListener() + lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py new file mode 100644 index 00000000000000..d610761b8cb0bc --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -0,0 +1,30 @@ +import lldb +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestReverseContinueNotSupported(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue_not_supported(self): + self.build() + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + main_bkpt = target.BreakpointCreateByName("main", None) + self.assertTrue(main_bkpt, VALID_BREAKPOINT) + + process = target.LaunchSimple(None, None, self.get_process_working_directory()) + self.assertTrue(process, PROCESS_IS_VALID) + + # This will fail gracefully. 
+ status = process.Continue(lldb.eRunReverse) + self.assertFailure(status, "target does not support reverse-continue") + + status = process.Continue() + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c new file mode 100644 index 00000000000000..40e45dc9f5c317 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/main.c @@ -0,0 +1,14 @@ +volatile int false_condition = 0; + +static void start_recording() {} + +static void trigger_breakpoint() {} + +static void stop_recording() {} + +int main() { + start_recording(); + trigger_breakpoint(); + stop_recording(); + return 0; +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 558f889c4b7f23..211fd34957f496 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,6 +1045,9 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; + case lldb::eStopReasonHistoryBoundary: + body.try_emplace("reason", "history boundary"); + break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index b38833c0fdb6b6..1c5e3ac7008727 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,6 +111,7 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: + case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From fae7d6848bbb59fc2bad17adbdb34bd6a11a0651 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 
2024 13:22:56 -0700 Subject: [PATCH 092/177] [lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC) --- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..367fce442bb866 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,8 +407,9 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); + error.FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", + GetPluginName()); return error; } From c686eeb7fcc89673909e7e1f0a0a09a0da269d28 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:07:35 -0700 Subject: [PATCH 093/177] [lldb] skip ReverseContinue tests on Darwin This uses lldb-server in gdbserver mode, which requires a ProcessNative plugin. Darwin does not have a ProcessNative plugin; it uses debugserver instead of lldb-server. Skip these tests. 
--- .../reverse-execution/TestReverseContinueBreakpoints.py | 6 ++++++ .../reverse-execution/TestReverseContinueNotSupported.py | 1 + 2 files changed, 7 insertions(+) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py index b37578fbd82468..8b53d86704f119 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -11,9 +11,11 @@ class TestReverseContinueBreakpoints(ReverseTestBase): NO_DEBUG_INFO_TESTCASE = True + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue(self): self.reverse_continue_internal(async_mode=False) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_async(self): self.reverse_continue_internal(async_mode=True) @@ -37,9 +39,11 @@ def reverse_continue_internal(self, async_mode): self.assertState(process.GetState(), lldb.eStateExited) self.assertEqual(process.GetExitStatus(), 0) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint(self): self.reverse_continue_breakpoint_internal(async_mode=False) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint_async(self): self.reverse_continue_breakpoint_internal(async_mode=True) @@ -54,9 +58,11 @@ def reverse_continue_breakpoint_internal(self, async_mode): threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) self.assertEqual(threads_now, initial_threads) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint(self): self.reverse_continue_skip_breakpoint_internal(async_mode=False) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint_async(self): 
self.reverse_continue_skip_breakpoint_internal(async_mode=True) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py index d610761b8cb0bc..8a20f0ffdcf660 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -8,6 +8,7 @@ class TestReverseContinueNotSupported(TestBase): NO_DEBUG_INFO_TESTCASE = True + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_not_supported(self): self.build() exe = self.getBuildArtifact("a.out") From 1bf271d5a7de58faf525c3b90ef4a4a8ff47e688 Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Thu, 10 Oct 2024 19:10:07 -0400 Subject: [PATCH 094/177] Revert "[ThinLTO] Do not duplicate import a function that is actually defined in the current module" (#111919) Reverts llvm/llvm-project#110064 --- llvm/lib/Linker/IRMover.cpp | 6 +- .../Inputs/ditemplatevalueparameter-remap.ll | 29 ------- .../X86/ditemplatevalueparameter-remap.ll | 87 ------------------- 3 files changed, 1 insertion(+), 121 deletions(-) delete mode 100644 llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll delete mode 100644 llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 5067fbff2e277b..3a6c2678cd157f 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -595,15 +595,11 @@ Value *IRLinker::materialize(Value *V, bool ForIndirectSymbol) { if (!SGV) return nullptr; - // If SGV is from dest, it was already materialized when dest was loaded. - if (SGV->getParent() == &DstM) - return nullptr; - // When linking a global from other modules than source & dest, skip // materializing it because it would be mapped later when its containing // module is linked. 
Linking it now would potentially pull in many types that // may not be mapped properly. - if (SGV->getParent() != SrcM.get()) + if (SGV->getParent() != &DstM && SGV->getParent() != SrcM.get()) return nullptr; Expected NewProto = linkGlobalValueProto(SGV, ForIndirectSymbol); diff --git a/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll deleted file mode 100644 index be93160b943397..00000000000000 --- a/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll +++ /dev/null @@ -1,29 +0,0 @@ -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @_Z8thinlto1v() unnamed_addr { - %3 = alloca i64, align 4 - #dbg_declare(ptr %3, !14, !DIExpression(), !15) - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "B.cpp", directory: ".") -!2 = !{i32 7, !"Dwarf Version", i32 4} -!3 = !{i32 2, !"Debug Info Version", i32 3} -!4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{i32 8, !"PIC Level", i32 2} -!10 = distinct !DISubprogram(name: "thinlto1", linkageName: "_Z8thinlto1v", scope: !11, file: !11, line: 8, type: !12, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) -!11 = !DIFile(filename: "b.cpp", directory: ".") -!12 = !DISubroutineType(types: !13) -!13 = !{null} -!14 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !11, line: 18, type: !16) -!15 = !DILocation(line: 18, column: 19, scope: !10) -!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !17, templateParams: !18, 
identifier: "_ZTS1SIXadL_Z5func1vEEE") -!17 = !{} -!18 = !{!19} -!19 = !DITemplateValueParameter(name: "Func", type: !20, value: ptr undef) -!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) diff --git a/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll deleted file mode 100644 index 0651705ccba8b8..00000000000000 --- a/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll +++ /dev/null @@ -1,87 +0,0 @@ -; https://github.com/llvm/llvm-project/pull/110064 -; This test case checks if thinLTO correctly links metadata values in a specific -; situation. Assume we are linking module B into module A, where an extern -; function used in A is defined in B, but the function body has a -; DITemplateValueParameter referring to another function back in A. The -; compiler must check this other function is actually coming from A, thus -; already materialized and does not require remapping. The IR here is modified -; from the following source code. -; -; // A.h -; template -; struct S { -; void Impl() { -; Func(); -; } -; }; -; -; void func1(); -; -; // A.cpp -; #include "A.h" -; __attribute__((weak)) void func1() {} -; extern void thinlto1(); -; void bar() { -; S s; // Force instantiation of S in this compilation unit. 
-; s.Impl(); -; thinlto1(); -; } -; -; // B.cpp -; #include "A.h" -; void thinlto1() { -; S s; -; } -; -; RUN: opt -module-summary -o %t1.bc %s -; RUN: opt -module-summary -o %t2.bc %S/Inputs/ditemplatevalueparameter-remap.ll -; RUN: ld.lld --plugin-opt=thinlto-index-only -shared %t1.bc %t2.bc -; RUN: clang -O3 -fthinlto-index=%t1.bc.thinlto.bc -x ir %t1.bc -S -emit-llvm -o - | FileCheck %s - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -$_Z5func1v = comdat any - -define linkonce_odr dso_local void @_Z5func1v() unnamed_addr !dbg !10 { - ret void -} - -; Dummy function to use _Z5func1v so that it is not treated as dead symbol. -define void @_Z3bazv() { - tail call void @_Z5func1v() - ret void -} - -declare void @_Z8thinlto1v() unnamed_addr - -; CHECK: void @_Z3barv() -; CHECK-NOT: call void @_Z8thinlto1v() -; CHECK-NEXT: ret void -define void @_Z3barv() unnamed_addr !dbg !14 { - tail call void @_Z8thinlto1v(), !dbg !25 - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "A.cpp", directory: ".") -!2 = !{i32 7, !"Dwarf Version", i32 4} -!3 = !{i32 2, !"Debug Info Version", i32 3} -!4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{i32 8, !"PIC Level", i32 2} -!10 = distinct !DISubprogram(name: "func1", linkageName: "_Z5func1v", scope: !11, file: !11, line: 6, type: !12, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) -!11 = !DIFile(filename: "a.h", directory: ".") -!12 = !DISubroutineType(types: !13) -!13 = !{null} -!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !11, file: !11, line: 15, type: !12, scopeLine: 15, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16) -!16 = !{!17} -!17 = !DILocalVariable(name: "s", scope: !14, file: !11, line: 10, type: !18) -!18 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !19, templateParams: !20, identifier: "_ZTS1SIXadL_Z5func1vEEE") -!19 = !{} -!20 = !{!21} -!21 = !DITemplateValueParameter(name: "Func", type: !22, value: ptr @_Z5func1v) -!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) -!25 = !DILocation(line: 16, column: 5, scope: !14) From 45cc74357130190b9aef9fab77646c17f2cf2a5e Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 16:22:13 -0700 Subject: [PATCH 095/177] [SandboxVec][DAG][NFC] Add comment about duplicate notes in deps() (#111915) --- .../Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index 7d300ea2b60d2d..5fa57efc1462e8 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -122,6 +122,10 @@ class DGNode { iterator preds_end(DependencyGraph &DAG) const { return const_cast(this)->preds_end(DAG); } + /// \Returns a range of DAG predecessors nodes. If this is a MemDGNode then + /// this will also include the memory dependency predecessors. + /// Please note that this can include the same node more than once, if for + /// example it's both a use-def predecessor and a mem dep predecessor. 
iterator_range preds(DependencyGraph &DAG) const { return make_range(preds_begin(DAG), preds_end(DAG)); } From cc20dd285ab72292a1d383d0779aecbe5e1ccf81 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 11 Oct 2024 10:19:49 +1100 Subject: [PATCH 096/177] [ORC][ELF] Remove the ExecutionSession& argument to ELFNixPlatform::Create. We can get a reference to the ExecutionSession from the ObjectLinkingLayer argument, so there's no need to pass it in separately. --- .../llvm/ExecutionEngine/Orc/ELFNixPlatform.h | 13 +++++----- .../ExecutionEngine/Orc/ELFNixPlatform.cpp | 24 ++++++++++--------- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 4 ++-- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 4 ++-- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h index 40b85e32720108..54442c91096b39 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h @@ -106,14 +106,14 @@ class ELFNixPlatform : public Platform { /// RuntimeAliases function, in which case the client is responsible for /// setting up all aliases (including the required ones). static Expected> - Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, std::unique_ptr OrcRuntime, + Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + std::unique_ptr OrcRuntime, std::optional RuntimeAliases = std::nullopt); /// Construct using a path to the ORC runtime. 
static Expected> - Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, const char *OrcRuntimePath, + Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + const char *OrcRuntimePath, std::optional RuntimeAliases = std::nullopt); ExecutionSession &getExecutionSession() const { return ES; } @@ -211,8 +211,7 @@ class ELFNixPlatform : public Platform { static bool supportedTarget(const Triple &TT); - ELFNixPlatform(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + ELFNixPlatform(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, Error &Err); @@ -308,4 +307,4 @@ using SPSELFNixJITDylibDepInfoMap = } // end namespace orc } // end namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H \ No newline at end of file +#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index d92077dbcbd034..610ecbff5c5c4d 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -233,10 +233,13 @@ class DSOHandleMaterializationUnit : public MaterializationUnit { namespace llvm { namespace orc { -Expected> ELFNixPlatform::Create( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, std::unique_ptr OrcRuntime, - std::optional RuntimeAliases) { +Expected> +ELFNixPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, + JITDylib &PlatformJD, + std::unique_ptr OrcRuntime, + std::optional RuntimeAliases) { + + auto &ES = ObjLinkingLayer.getExecutionSession(); // If the target is not supported then bail out immediately. if (!supportedTarget(ES.getTargetTriple())) @@ -271,15 +274,14 @@ Expected> ELFNixPlatform::Create( // Create the instance. 
Error Err = Error::success(); auto P = std::unique_ptr(new ELFNixPlatform( - ES, ObjLinkingLayer, PlatformJD, std::move(OrcRuntime), Err)); + ObjLinkingLayer, PlatformJD, std::move(OrcRuntime), Err)); if (Err) return std::move(Err); return std::move(P); } Expected> -ELFNixPlatform::Create(ExecutionSession &ES, - ObjectLinkingLayer &ObjLinkingLayer, +ELFNixPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, const char *OrcRuntimePath, std::optional RuntimeAliases) { @@ -289,7 +291,7 @@ ELFNixPlatform::Create(ExecutionSession &ES, if (!OrcRuntimeArchiveGenerator) return OrcRuntimeArchiveGenerator.takeError(); - return Create(ES, ObjLinkingLayer, PlatformJD, + return Create(ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator), std::move(RuntimeAliases)); } @@ -392,10 +394,10 @@ bool ELFNixPlatform::supportedTarget(const Triple &TT) { } ELFNixPlatform::ELFNixPlatform( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, Error &Err) - : ES(ES), PlatformJD(PlatformJD), ObjLinkingLayer(ObjLinkingLayer), + : ES(ObjLinkingLayer.getExecutionSession()), PlatformJD(PlatformJD), + ObjLinkingLayer(ObjLinkingLayer), DSOHandleSymbol(ES.intern("__dso_handle")) { ErrorAsOutParameter _(&Err); ObjLinkingLayer.addPlugin(std::make_unique(*this)); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index a13443ce57ea5c..d3dd3b6bedfb65 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -1185,8 +1185,8 @@ Expected ExecutorNativePlatform::operator()(LLJIT &J) { if (!G) return G.takeError(); - if (auto P = ELFNixPlatform::Create(ES, *ObjLinkingLayer, PlatformJD, - std::move(*G))) + if (auto P = + ELFNixPlatform::Create(*ObjLinkingLayer, PlatformJD, std::move(*G))) J.getExecutionSession().setPlatform(std::move(*P)); else return 
P.takeError(); diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index a2c05deefa6bfc..108cadd2e0169c 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1041,8 +1041,8 @@ Session::Session(std::unique_ptr EPC, Error &Err) return; } } else if (TT.isOSBinFormatELF()) { - if (auto P = ELFNixPlatform::Create(ES, ObjLayer, *PlatformJD, - OrcRuntime.c_str())) + if (auto P = + ELFNixPlatform::Create(ObjLayer, *PlatformJD, OrcRuntime.c_str())) ES.setPlatform(std::move(*P)); else { Err = P.takeError(); From 4f320778148ba481881eb53ba065ed2a9d9bbc03 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:22:07 -0700 Subject: [PATCH 097/177] Revert "[lldb] skip ReverseContinue tests on Darwin" This reverts commit c686eeb7fcc89673909e7e1f0a0a09a0da269d28. --- .../reverse-execution/TestReverseContinueBreakpoints.py | 6 ------ .../reverse-execution/TestReverseContinueNotSupported.py | 1 - 2 files changed, 7 deletions(-) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py index 8b53d86704f119..b37578fbd82468 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -11,11 +11,9 @@ class TestReverseContinueBreakpoints(ReverseTestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue(self): self.reverse_continue_internal(async_mode=False) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_async(self): self.reverse_continue_internal(async_mode=True) @@ -39,11 +37,9 @@ def reverse_continue_internal(self, async_mode): self.assertState(process.GetState(), lldb.eStateExited) self.assertEqual(process.GetExitStatus(), 
0) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint(self): self.reverse_continue_breakpoint_internal(async_mode=False) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint_async(self): self.reverse_continue_breakpoint_internal(async_mode=True) @@ -58,11 +54,9 @@ def reverse_continue_breakpoint_internal(self, async_mode): threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) self.assertEqual(threads_now, initial_threads) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint(self): self.reverse_continue_skip_breakpoint_internal(async_mode=False) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint_async(self): self.reverse_continue_skip_breakpoint_internal(async_mode=True) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py index 8a20f0ffdcf660..d610761b8cb0bc 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -8,7 +8,6 @@ class TestReverseContinueNotSupported(TestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_not_supported(self): self.build() exe = self.getBuildArtifact("a.out") From a28e7ce378d717e6aacbdc3089974b93b6b62948 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:22:18 -0700 Subject: [PATCH 098/177] Revert "[lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC)" This reverts commit fae7d6848bbb59fc2bad17adbdb34bd6a11a0651. 
--- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 367fce442bb866..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,9 +407,8 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", - GetPluginName()); + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); return error; } From 3bef742559f1556569423ec63c70b97dff1d426e Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:22:24 -0700 Subject: [PATCH 099/177] Revert "[lldb] Implement basic support for reverse-continue (#99736)" Reverting this again; I added a commit which added @skipIfDarwin markers to the TestReverseContinueBreakpoints.py and TestReverseContinueNotSupported.py API tests, which use lldb-server in gdbserver mode which does not work on Darwin. 
But the aarch64 ubuntu bot reported a failure on TestReverseContinueBreakpoints.py, https://lab.llvm.org/buildbot/#/builders/59/builds/6397 File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py", line 63, in test_reverse_continue_skip_breakpoint self.reverse_continue_skip_breakpoint_internal(async_mode=False) File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py", line 81, in reverse_continue_skip_breakpoint_internal self.expect( File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 2372, in expect self.runCmd( File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 1002, in runCmd self.assertTrue(self.res.Succeeded(), msg + output) AssertionError: False is not true : Process should be stopped due to history boundary Error output: error: Process must be launched. This reverts commit 4f297566b3150097de26c6a23a987d2bd5fc19c5. 
--- lldb/include/lldb/API/SBProcess.h | 1 - lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 - lldb/include/lldb/lldb-enumerations.h | 6 - .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 -------- .../Python/lldbsuite/test/lldbreverse.py | 418 ------------------ .../Python/lldbsuite/test/lldbtest.py | 2 - lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 - .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 - .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 - .../gdb-remote/GDBRemoteCommunicationClient.h | 6 - .../GDBRemoteCommunicationServerLLGS.cpp | 1 - .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +--- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- lldb/source/Target/StopInfo.cpp | 29 -- lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 - .../TestReverseContinueBreakpoints.py | 115 ----- .../TestReverseContinueNotSupported.py | 30 -- .../functionalities/reverse-execution/main.c | 14 - lldb/tools/lldb-dap/JSONUtils.cpp | 3 - lldb/tools/lldb-dap/LLDBUtils.cpp | 1 - 32 files changed, 44 insertions(+), 978 deletions(-) delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile delete mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py delete mode 100644 
lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 8b8ed830b54cc0..1624e02070b1b2 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,7 +159,6 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); - lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index fe7fbc50fd5770..b8c53a474ba6b9 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(lldb::RunDirection direction = lldb::eRunForward); + Status Resume(); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); + Status ResumeSynchronous(Stream *stream); /// Halts a running process. /// @@ -1104,14 +1104,9 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume(lldb::RunDirection direction) { - if (direction == lldb::RunDirection::eRunForward) { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + virtual Status DoResume() { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); } /// Called after resuming a process. 
@@ -2337,8 +2332,6 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; - lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } - DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2858,7 +2851,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); + Status PrivateResume(); // Called internally void CompleteAttach(); @@ -3134,8 +3127,6 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. - // The direction of execution from the last time this process was resumed. - lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index 072f71f6b1122f..fae90364deaf0a 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,12 +142,6 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); - // This creates a StopInfo indicating that execution stopped because - // it was replaying some recorded execution history, and execution reached - // the end of that recorded history. 
- static lldb::StopInfoSP - CreateStopReasonHistoryBoundary(Thread &thread, const char *description); - static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 232d1dfdb5c9d0..938f6e3abe8f2a 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,9 +135,6 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; -/// Execution directions -enum RunDirection { eRunForward, eRunReverse }; - /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -257,9 +254,6 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt - // Indicates that execution stopped because the debugger backend relies - // on recorded data and we reached the end of that data. - eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 732d6171320680..1784487323ad6b 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,9 +510,8 @@ def start(self): self._thread.start() def stop(self): - if self._thread is not None: - self._thread.join() - self._thread = None + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py deleted file mode 100644 index 2a9592bf4545a4..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py +++ /dev/null @@ -1,175 +0,0 @@ -import logging -import os -import os.path -import random - -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -import lldbgdbserverutils -from lldbsuite.support import seven - - -class GDBProxyTestBase(TestBase): - """ - Base class for gdbserver proxy tests. - - This class will setup and start a mock GDB server for the test to use. - It pases through requests to a regular lldb-server/debugserver and - forwards replies back to the LLDB under test. 
- """ - - """The gdbserver that we implement.""" - server = None - """The inner lldb-server/debugserver process that we proxy requests into.""" - monitor_server = None - monitor_sock = None - - server_socket_class = TCPServerSocket - - DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) - - _verbose_log_handler = None - _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") - - def setUpBaseLogging(self): - self.logger = logging.getLogger(__name__) - - if len(self.logger.handlers) > 0: - return # We have set up this handler already - - self.logger.propagate = False - self.logger.setLevel(logging.DEBUG) - - # log all warnings to stderr - handler = logging.StreamHandler() - handler.setLevel(logging.WARNING) - handler.setFormatter(self._log_formatter) - self.logger.addHandler(handler) - - def setUp(self): - TestBase.setUp(self) - - self.setUpBaseLogging() - - if self.isVerboseLoggingRequested(): - # If requested, full logs go to a log file - log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" - self._verbose_log_handler = logging.FileHandler( - log_file_name - ) - self._verbose_log_handler.setFormatter(self._log_formatter) - self._verbose_log_handler.setLevel(logging.DEBUG) - self.logger.addHandler(self._verbose_log_handler) - - lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() - if lldb_server_exe is None: - self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() - self.assertTrue(self.debug_monitor_exe is not None) - self.debug_monitor_extra_args = [] - else: - self.debug_monitor_exe = lldb_server_exe - self.debug_monitor_extra_args = ["gdbserver"] - - self.server = MockGDBServer(self.server_socket_class()) - self.server.responder = self - - def tearDown(self): - # TestBase.tearDown will kill the process, but we need to kill it early - # so its client connection closes and we can stop the server before - # finally calling the base tearDown. 
- if self.process() is not None: - self.process().Kill() - self.server.stop() - - self.logger.removeHandler(self._verbose_log_handler) - self._verbose_log_handler = None - - TestBase.tearDown(self) - - def isVerboseLoggingRequested(self): - # We will report our detailed logs if the user requested that the "gdb-remote" channel is - # logged. - return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) - - def connect(self, target): - """ - Create a process by connecting to the mock GDB server. - """ - self.prep_debug_monitor_and_inferior() - self.server.start() - - listener = self.dbg.GetListener() - error = lldb.SBError() - process = target.ConnectRemote( - listener, self.server.get_connect_url(), "gdb-remote", error - ) - self.assertTrue(error.Success(), error.description) - self.assertTrue(process, PROCESS_IS_VALID) - return process - - def get_next_port(self): - return 12000 + random.randint(0, 3999) - - def prep_debug_monitor_and_inferior(self): - inferior_exe_path = self.getBuildArtifact("a.out") - self.connect_to_debug_monitor([inferior_exe_path]) - self.assertIsNotNone(self.monitor_server) - self.initial_handshake() - - def initial_handshake(self): - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "OK") - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - - def get_debug_monitor_command_line_args(self, connect_address, launch_args): - return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args - - 
def launch_debug_monitor(self, launch_args): - family, type, proto, _, addr = socket.getaddrinfo( - "localhost", 0, proto=socket.IPPROTO_TCP - )[0] - sock = socket.socket(family, type, proto) - sock.settimeout(self.DEFAULT_TIMEOUT) - sock.bind(addr) - sock.listen(1) - addr = sock.getsockname() - connect_address = "[{}]:{}".format(*addr) - - commandline_args = self.get_debug_monitor_command_line_args( - connect_address, launch_args - ) - - # Start the server. - self.logger.info(f"Spawning monitor {commandline_args}") - monitor_process = self.spawnSubprocess( - self.debug_monitor_exe, commandline_args, install_remote=False - ) - self.assertIsNotNone(monitor_process) - - self.monitor_sock = sock.accept()[0] - self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) - return monitor_process - - def connect_to_debug_monitor(self, launch_args): - monitor_process = self.launch_debug_monitor(launch_args) - self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) - - def respond(self, packet): - """Subclasses can override this to change how packets are handled.""" - return self.pass_through(packet) - - def pass_through(self, packet): - self.logger.info(f"Sending packet {packet}") - self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.logger.info(f"Received reply {reply}") - return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py deleted file mode 100644 index 0f02fdffbdeada..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbreverse.py +++ /dev/null @@ -1,418 +0,0 @@ -import os -import os.path -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbgdbproxy import * -import lldbgdbserverutils -import re - - -class ThreadSnapshot: - def __init__(self, thread_id, registers): - self.thread_id = thread_id - 
self.registers = registers - - -class MemoryBlockSnapshot: - def __init__(self, address, data): - self.address = address - self.data = data - - -class StateSnapshot: - def __init__(self, thread_snapshots, memory): - self.thread_snapshots = thread_snapshots - self.memory = memory - self.thread_id = None - - -class RegisterInfo: - def __init__(self, lldb_index, bitsize, little_endian): - self.lldb_index = lldb_index - self.bitsize = bitsize - self.little_endian = little_endian - - -BELOW_STACK_POINTER = 16384 -ABOVE_STACK_POINTER = 4096 - -BLOCK_SIZE = 1024 - -SOFTWARE_BREAKPOINTS = 0 -HARDWARE_BREAKPOINTS = 1 -WRITE_WATCHPOINTS = 2 - - -class ReverseTestBase(GDBProxyTestBase): - """ - Base class for tests that need reverse execution. - - This class uses a gdbserver proxy to add very limited reverse- - execution capability to lldb-server/debugserver for testing - purposes only. - - To use this class, run the inferior forward until some stopping point. - Then call `start_recording()` and execute forward again until reaching - a software breakpoint; this class records the state before each execution executes. - At that point, the server will accept "bc" and "bs" packets to step - backwards through the state. - When executing during recording, we only allow single-step and continue without - delivering a signal, and only software breakpoint stops are allowed. - - We assume that while recording is enabled, the only effects of instructions - are on general-purpose registers (read/written by the 'g' and 'G' packets) - and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER). - """ - - """ - A list of StateSnapshots in time order. - - There is one snapshot per single-stepped instruction, - representing the state before that instruction was - executed. The last snapshot in the list is the - snapshot before the last instruction was executed. 
- This is an undo log; we snapshot a superset of the state that may have - been changed by the instruction's execution. - """ - snapshots = None - recording_enabled = False - - breakpoints = None - - pid = None - - pc_register_info = None - sp_register_info = None - general_purpose_register_info = None - - def __init__(self, *args, **kwargs): - GDBProxyTestBase.__init__(self, *args, **kwargs) - self.breakpoints = [set(), set(), set(), set(), set()] - - def respond(self, packet): - if not packet: - raise ValueError("Invalid empty packet") - if packet == self.server.PACKET_INTERRUPT: - # Don't send a response. We'll just run to completion. - return [] - if self.is_command(packet, "qSupported", ":"): - reply = self.pass_through(packet) - return reply + ";ReverseStep+;ReverseContinue+" - if self.is_command(packet, "vCont", ";"): - if self.recording_enabled: - return self.continue_with_recording(packet) - snapshots = [] - if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S": - raise ValueError("LLDB should not be sending old-style continuation packets") - if packet == "bc": - return self.reverse_continue() - if packet == "bs": - return self.reverse_step() - if packet == 'jThreadsInfo': - # Suppress this because it contains thread stop reasons which we might - # need to modify, and we don't want to have to implement that. - return "" - if packet[0] == "z" or packet[0] == "Z": - reply = self.pass_through(packet) - if reply == "OK": - self.update_breakpoints(packet) - return reply - return GDBProxyTestBase.respond(self, packet) - - def start_recording(self): - self.recording_enabled = True - self.snapshots = [] - - def stop_recording(self): - """ - Don't record when executing foward. - - Reverse execution is still supported until the next forward continue. 
- """ - self.recording_enabled = False - - def is_command(self, packet, cmd, follow_token): - return packet == cmd or packet[0:len(cmd) + 1] == cmd + follow_token - - def update_breakpoints(self, packet): - m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet) - if m is None: - raise ValueError("Invalid breakpoint packet: " + packet) - t = int(m.group(2)) - addr = int(m.group(3), 16) - kind = int(m.group(4), 16) - if m.group(1) == 'Z': - self.breakpoints[t].add((addr, kind)) - else: - self.breakpoints[t].discard((addr, kind)) - - def breakpoint_triggered_at(self, pc): - if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]): - return True - if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]): - return True - return False - - def watchpoint_triggered(self, new_value_block, current_contents): - """Returns the address or None.""" - for watch_addr, kind in breakpoints[WRITE_WATCHPOINTS]: - for offset in range(0, kind): - addr = watch_addr + offset - if (addr >= new_value_block.address and - addr < new_value_block.address + len(new_value_block.data)): - index = addr - new_value_block.address - if new_value_block.data[index*2:(index + 1)*2] != current_contents[index*2:(index + 1)*2]: - return watch_addr - return None - - def continue_with_recording(self, packet): - self.logger.debug("Continue with recording enabled") - - step_packet = "vCont;s" - if packet == "vCont": - requested_step = False - else: - m = re.match("vCont;(c|s)(.*)", packet) - if m is None: - raise ValueError("Unsupported vCont packet: " + packet) - requested_step = m.group(1) == 's' - step_packet += m.group(2) - - while True: - snapshot = self.capture_snapshot() - reply = self.pass_through(step_packet) - (stop_signal, stop_pairs) = self.parse_stop(reply) - if stop_signal != 5: - raise ValueError("Unexpected stop signal: " + reply) - is_swbreak = False - thread_id = None - for key, value in stop_pairs.items(): - if key == "thread": - thread_id = 
self.parse_thread_id(value) - continue - if re.match('[0-9a-f]+', key): - continue - if key == "swbreak" or (key == "reason" and value == "breakpoint"): - is_swbreak = True - continue - if key in ["name", "threads", "thread-pcs", "reason"]: - continue - raise ValueError(f"Unknown stop key '{key}' in {reply}") - if is_swbreak: - self.logger.debug("Recording stopped") - return reply - if thread_id is None: - return ValueError("Expected thread ID: " + reply) - snapshot.thread_id = thread_id - self.snapshots.append(snapshot) - if requested_step: - self.logger.debug("Recording stopped for step") - return reply - - def parse_stop(self, reply): - result = {} - if not reply: - raise ValueError("Invalid empty packet") - if reply[0] == "T" and len(reply) >= 3: - result = {k:v for k, v in self.parse_pairs(reply[3:])} - return (int(reply[1:3], 16), result) - raise "Unsupported stop reply: " + reply - - def parse_pairs(self, text): - for pair in text.split(";"): - if not pair: - continue - m = re.match("([^:]+):(.*)", pair) - if m is None: - raise ValueError("Invalid pair text: " + text) - yield (m.group(1), m.group(2)) - - def capture_snapshot(self): - """Snapshot all threads and their stack memories.""" - self.ensure_register_info() - current_thread = self.get_current_thread() - thread_snapshots = [] - memory = [] - for thread_id in self.get_thread_list(): - registers = {} - for index in sorted(self.general_purpose_register_info.keys()): - reply = self.pass_through(f"p{index:x};thread:{thread_id:x};") - if reply == "" or reply[0] == 'E': - raise ValueError("Can't read register") - registers[index] = reply - thread_snapshot = ThreadSnapshot(thread_id, registers) - thread_sp = self.get_register(self.sp_register_info, thread_snapshot.registers) - memory += self.read_memory(thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER) - thread_snapshots.append(thread_snapshot) - self.set_current_thread(current_thread) - return StateSnapshot(thread_snapshots, memory) - - def 
restore_snapshot(self, snapshot): - """ - Restore the snapshot during reverse execution. - - If this triggers a breakpoint or watchpoint, return the stop reply, - otherwise None. - """ - current_thread = self.get_current_thread() - stop_reasons = [] - for thread_snapshot in snapshot.thread_snapshots: - thread_id = thread_snapshot.thread_id - for lldb_index in sorted(thread_snapshot.registers.keys()): - data = thread_snapshot.registers[lldb_index] - reply = self.pass_through(f"P{lldb_index:x}={data};thread:{thread_id:x};") - if reply != "OK": - raise ValueError("Can't restore thread register") - if thread_id == snapshot.thread_id: - new_pc = self.get_register(self.pc_register_info, thread_snapshot.registers) - if self.breakpoint_triggered_at(new_pc): - stop_reasons.append([("reason", "breakpoint")]) - self.set_current_thread(current_thread) - for block in snapshot.memory: - current_memory = self.pass_through(f"m{block.address:x},{(len(block.data)/2):x}") - if not current_memory or current_memory[0] == 'E': - raise ValueError("Can't read back memory") - reply = self.pass_through(f"M{block.address:x},{len(block.data)/2:x}:" + block.data) - if reply != "OK": - raise ValueError("Can't restore memory") - watch_addr = self.watchpoint_triggered(block, current_memory[1:]) - if watch_addr is not None: - stop_reasons.append([("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]) - if stop_reasons: - pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0]) - return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};" - return None - - def reverse_step(self): - if not self.snapshots: - self.logger.debug("Reverse-step at history boundary") - return self.history_boundary_reply(self.get_current_thread()) - self.logger.debug("Reverse-step started") - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-step stopped") - if stop_reply is None: - return 
self.singlestep_stop_reply(snapshot.thread_id) - return stop_reply - - def reverse_continue(self): - self.logger.debug("Reverse-continue started") - thread_id = None - while self.snapshots: - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - thread_id = snapshot.thread_id - if stop_reply is not None: - self.set_current_thread(thread_id) - self.logger.debug("Reverse-continue stopped") - return stop_reply - if thread_id is None: - thread_id = self.get_current_thread() - else: - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-continue stopped at history boundary") - return self.history_boundary_reply(thread_id) - - def get_current_thread(self): - reply = self.pass_through("qC") - return self.parse_thread_id(reply[2:]) - - def parse_thread_id(self, thread_id): - m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id) - if m is None: - raise ValueError("Invalid thread ID: " + thread_id) - if self.pid is None: - self.pid = int(m.group(2), 16) - return int(m.group(3), 16) - - def history_boundary_reply(self, thread_id): - return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;" - - def singlestep_stop_reply(self, thread_id): - return f"T05thread:{self.pid:x}.{thread_id:x};" - - def set_current_thread(self, thread_id): - """ - Set current thread in inner gdbserver. 
- """ - if thread_id >= 0: - self.pass_through(f"Hg{self.pid:x}.{thread_id:x}") - self.pass_through(f"Hc{self.pid:x}.{thread_id:x}") - else: - self.pass_through(f"Hc-1.-1") - self.pass_through(f"Hg-1.-1") - - def get_register(self, register_info, registers): - if register_info.bitsize % 8 != 0: - raise ValueError("Register size must be a multiple of 8 bits") - if register_info.lldb_index not in registers: - raise ValueError("Register value not captured") - data = registers[register_info.lldb_index] - num_bytes = register_info.bitsize//8 - bytes = [] - for i in range(0, num_bytes): - bytes.append(int(data[i*2:(i + 1)*2], 16)) - if register_info.little_endian: - bytes.reverse() - result = 0 - for byte in bytes: - result = (result << 8) + byte - return result - - def read_memory(self, start_addr, end_addr): - """ - Read a region of memory from the target. - - Some of the addresses may extend into invalid virtual memory; - skip those areas. - Return a list of blocks containing the valid area(s) in the - requested range. 
- """ - regions = [] - start_addr = start_addr & (BLOCK_SIZE - 1) - end_addr = (end_addr + BLOCK_SIZE - 1) & (BLOCK_SIZE - 1) - for addr in range(start_addr, end_addr, BLOCK_SIZE): - reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}") - if reply and reply[0] != 'E': - block = MemoryBlockSnapshot(addr, reply[1:]) - regions.append(block) - return regions - - def ensure_register_info(self): - if self.general_purpose_register_info is not None: - return - reply = self.pass_through("qHostInfo") - little_endian = any(kv == ("endian", "little") for kv in self.parse_pairs(reply)) - self.general_purpose_register_info = {} - lldb_index = 0 - while True: - reply = self.pass_through(f"qRegisterInfo{lldb_index:x}") - if not reply or reply[0] == 'E': - break - info = {k:v for k, v in self.parse_pairs(reply)} - reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian) - if info["set"] == "General Purpose Registers" and not "container-regs" in info: - self.general_purpose_register_info[lldb_index] = reg_info - if "generic" in info: - if info["generic"] == "pc": - self.pc_register_info = reg_info - elif info["generic"] == "sp": - self.sp_register_info = reg_info - lldb_index += 1 - if self.pc_register_info is None or self.sp_register_info is None: - raise ValueError("Can't find generic pc or sp register") - - def get_thread_list(self): - threads = [] - reply = self.pass_through("qfThreadInfo") - while True: - if not reply: - raise ValueError("Missing reply packet") - if reply[0] == 'm': - for id in reply[1:].split(","): - threads.append(self.parse_thread_id(id)) - elif reply[0] == 'l': - return threads - reply = self.pass_through("qsThreadInfo") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 7cc1ac9749ec93..8884ef5933ada8 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -143,8 +143,6 @@ STOPPED_DUE_TO_WATCHPOINT = "Process 
should be stopped due to watchpoint" -STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" - DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 07780f9f9c8393..9773144723c34c 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,10 +564,6 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { - return Continue(RunDirection::eRunForward); -} - -SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -578,9 +574,9 @@ SBError SBProcess::Continue(RunDirection direction) { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(direction); + sb_error.ref() = process_sp->Resume(); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index aca8a039952960..a99456e06d0329 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,7 +172,6 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -234,7 +233,6 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index ea60492ac46a10..8d3a82ef6c990a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,8 +2553,7 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || - reason == eStopReasonHistoryBoundary) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index b0aa664775b463..de047ee214c11e 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,9 +82,6 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; - case eStopReasonHistoryBoundary: - log.Printf("%s: %s history boundary", __FUNCTION__, header); - return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..9b2907c6809965 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,16 +402,9 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume(RunDirection direction) { +Status ProcessKDP::DoResume() { Status error; Log *log = GetLog(KDPLog::Process); - - if 
(direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index 1b71d83f70b087..e5ec5914f9600d 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume(lldb::RunDirection direction) override; + lldb_private::Status DoResume() override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 76b7095deaa503..703aa082f0476f 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,17 +204,11 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume(RunDirection direction) { +Status ProcessWindows::DoResume() { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; - if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index 97284b7cd1436e..e97cfb790248be 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index fc792a4409410b..e42526c8fd7266 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,20 +199,6 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } -bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { - if (m_supports_reverse_continue == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_continue == eLazyBoolYes; -} - -bool GDBRemoteCommunicationClient::GetReverseStepSupported() { - if (m_supports_reverse_step == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_step == eLazyBoolYes; -} - bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -309,8 +295,6 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; - m_supports_reverse_continue = 
eLazyBoolCalculate; - m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -364,8 +348,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; - m_supports_reverse_continue = eLazyBoolNo; - m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -419,10 +401,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; - else if (x == "ReverseContinue+") - m_supports_reverse_continue = eLazyBoolYes; - else if (x == "ReverseStep+") - m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 116b47c1edf033..898d176abc3465 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,10 +331,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); - bool GetReverseContinueSupported(); - - bool GetReverseStepSupported(); - LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -565,8 +561,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; - LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; - LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 4016cde74ebea8..35fa93e53bc66f 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,7 +716,6 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3fc03bd05d5df0..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,10 +169,6 @@ class PluginProperties : public Properties { } }; -std::chrono::seconds ResumeTimeout() { - return std::chrono::seconds(5); -} - } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1184,11 +1180,10 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume(RunDirection direction) { +Status ProcessGDBRemote::DoResume() { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", - direction == RunDirection::eRunForward ? 
"" : "reverse"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1202,21 +1197,12 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { StreamString continue_packet; bool continue_packet_error = false; - // Number of threads continuing with "c", i.e. continuing without a signal to deliver. - const size_t num_continue_c_tids = m_continue_c_tids.size(); - // Number of threads continuing with "C", i.e. continuing with a signal to deliver. - const size_t num_continue_C_tids = m_continue_C_tids.size(); - // Number of threads continuing with "s", i.e. single-stepping. - const size_t num_continue_s_tids = m_continue_s_tids.size(); - // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. - const size_t num_continue_S_tids = m_continue_S_tids.size(); - if (direction == RunDirection::eRunForward && - m_gdb_comm.HasAnyVContSupport()) { + if (m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (num_continue_c_tids == num_threads || + if (m_continue_c_tids.size() == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1279,11 +1265,14 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } else continue_packet_error = true; - if (direction == RunDirection::eRunForward && continue_packet_error) { + if (continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server, or it's the reverse - // direction. We need to try and make a simple packet that can do our - // continue. + // that wasn't supported by the remote GDB server. 
We need to try and + // make a simple packet that can do our continue + const size_t num_continue_c_tids = m_continue_c_tids.size(); + const size_t num_continue_C_tids = m_continue_C_tids.size(); + const size_t num_continue_s_tids = m_continue_s_tids.size(); + const size_t num_continue_S_tids = m_continue_S_tids.size(); if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1374,41 +1363,9 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } } - if (direction == RunDirection::eRunReverse && continue_packet_error) { - if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); - return Status::FromErrorString("can't deliver signals while running in reverse"); - } - - if (num_continue_s_tids > 0) { - if (num_continue_s_tids > 1) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); - return Status::FromErrorString("can't step multiple threads while reverse-stepping"); - } - - if (!m_gdb_comm.GetReverseStepSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); - return Status::FromErrorString("target does not support reverse-stepping"); - } - - m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); - continue_packet.PutCString("bs"); - } else { - if (!m_gdb_comm.GetReverseContinueSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); - return Status::FromErrorString("target does not support reverse-continue"); - } - - // All threads continue whether requested or not --- - // we can't change how threads ran in the past. 
- continue_packet.PutCString("bc"); - } - - continue_packet_error = false; - } - if (continue_packet_error) { - return Status::FromErrorString("can't make continue packet for this resume"); + error = + Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1423,7 +1380,7 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { + if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1906,10 +1863,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; - } else if (reason == "replaylog") { - thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( - *thread_sp, description.c_str())); - handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2365,8 +2318,6 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; - } else if (key.compare("replaylog") == 0) { - reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index fa3e1cec76e2b3..2492795851388a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote 
: public Process, // Process Control Status WillResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index 304c12173dd35d..d2111ce877ce55 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,15 +182,10 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume(RunDirection direction) { +Status ScriptedProcess::DoResume() { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - if (direction == RunDirection::eRunForward) { - return GetInterface().Resume(); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + return GetInterface().Resume(); } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 8ebe4ca5f3d449..0335364b4010b2 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fd683728388215..c009d17d3ba507 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,8 +446,7 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), 
m_allocated_memory_cache(*this), m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_last_run_direction(eRunForward), - m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -846,7 +845,6 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: - case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1354,7 +1352,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume(RunDirection direction) { +Status Process::Resume() { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1363,7 +1361,7 @@ Status Process::Resume(RunDirection direction) { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(direction); + Status error = PrivateResume(); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1371,7 +1369,7 @@ Status Process::Resume(RunDirection direction) { return error; } -Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { +Status Process::ResumeSynchronous(Stream *stream) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1384,7 +1382,7 @@ Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = 
PrivateResume(direction); + Status error = PrivateResume(); if (error.Success()) { StateType state = WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3241,7 +3239,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume(RunDirection direction) { +Status Process::PrivateResume() { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3257,15 +3255,6 @@ Status Process::PrivateResume(RunDirection direction) { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); - if (m_last_run_direction != direction) { - // In the future we might want to support mixed-direction plans, - // e.g. a forward step-over stops at a breakpoint, the user does - // a reverse-step, then disables the breakpoint and continues forward. - // This code will need to be changed to support that. - m_thread_list.DiscardThreadPlans(); - m_last_run_direction = direction; - } - Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3283,7 +3272,7 @@ Status Process::PrivateResume(RunDirection direction) { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(direction); + error = DoResume(); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3746,7 +3735,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(m_last_run_direction); + PrivateResume(); } } else { return_value = true; @@ -4357,7 +4346,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(process_sp->m_last_run_direction); + process_sp->PrivateResume(); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 08e9a7c099bad2..bd7032b803df90 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,30 +1212,6 @@ class StopInfoProcessorTrace : public StopInfo { } }; -// StopInfoHistoryBoundary - -class StopInfoHistoryBoundary : public StopInfo { -public: - StopInfoHistoryBoundary(Thread &thread, const char *description) - : StopInfo(thread, LLDB_INVALID_UID) { - if (description) - SetDescription(description); - } - - ~StopInfoHistoryBoundary() override = default; - - StopReason GetStopReason() const override { - return eStopReasonHistoryBoundary; - } - - const char *GetDescription() override { - if (m_description.empty()) - return "history boundary"; - else - return m_description.c_str(); - } -}; - // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1463,11 +1439,6 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } -StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, - const char *description) { - return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); -} - StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index bbb586f033b746..902fbb2b519ef7 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,12 +624,10 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - ProcessSP process_sp(GetProcess()); - if (reg_ctx_sp && process_sp && - process_sp->GetLastRunDirection() == eRunForward) { + if (reg_ctx_sp) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); + GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1734,8 +1732,6 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: - return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile deleted file mode 100644 index 10495940055b63..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -C_SOURCES := main.c - -include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py deleted file mode 100644 index b37578fbd82468..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ /dev/null @@ -1,115 +0,0 @@ -import lldb -import time -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbreverse import ReverseTestBase -from lldbsuite.test import lldbutil - - -class TestReverseContinueBreakpoints(ReverseTestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue(self): - self.reverse_continue_internal(async_mode=False) - - def 
test_reverse_continue_async(self): - self.reverse_continue_internal(async_mode=True) - - def reverse_continue_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue. We'll stop at the point where we started recording. - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - # Continue forward normally until the target exits. - status = process.Continue() - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) - - def test_reverse_continue_breakpoint(self): - self.reverse_continue_breakpoint_internal(async_mode=False) - - def test_reverse_continue_breakpoint_async(self): - self.reverse_continue_breakpoint_internal(async_mode=True) - - def reverse_continue_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue to the function "trigger_breakpoint". 
- trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) - self.assertEqual(threads_now, initial_threads) - - def test_reverse_continue_skip_breakpoint(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=False) - - def test_reverse_continue_skip_breakpoint_async(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=True) - - def reverse_continue_skip_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue over a breakpoint at "trigger_breakpoint" whose - # condition is false. - # This tests that we continue in the correct direction after hitting - # the breakpoint. - trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - trigger_bkpt.SetCondition("false_condition") - status = process.Continue(lldb.eRunReverse) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.assertSuccess(status) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - def setup_recording(self, async_mode): - """ - Record execution of code between "start_recording" and "stop_recording" breakpoints. - - Returns with the target stopped at "stop_recording", with recording disabled, - ready to reverse-execute. - """ - self.build() - target = self.dbg.CreateTarget("") - process = self.connect(target) - - # Record execution from the start of the function "start_recording" - # to the start of the function "stop_recording". We want to keep the - # interval that we record as small as possible to minimize the run-time - # of our single-stepping recorder. 
- start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) - initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) - self.assertEqual(len(initial_threads), 1) - target.BreakpointDelete(start_recording_bkpt.GetID()) - self.start_recording() - stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) - lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) - target.BreakpointDelete(stop_recording_bkpt.GetID()) - self.stop_recording() - - self.dbg.SetAsync(async_mode) - self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) - - return target, process, initial_threads - - def expect_async_state_changes(self, async_mode, process, states): - if not async_mode: - return - listener = self.dbg.GetListener() - lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py deleted file mode 100644 index d610761b8cb0bc..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ /dev/null @@ -1,30 +0,0 @@ -import lldb -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test import lldbutil - - -class TestReverseContinueNotSupported(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue_not_supported(self): - self.build() - exe = self.getBuildArtifact("a.out") - target = self.dbg.CreateTarget(exe) - self.assertTrue(target, VALID_TARGET) - - main_bkpt = target.BreakpointCreateByName("main", None) - self.assertTrue(main_bkpt, VALID_BREAKPOINT) - - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, PROCESS_IS_VALID) - - # This will fail gracefully. 
- status = process.Continue(lldb.eRunReverse) - self.assertFailure(status, "target does not support reverse-continue") - - status = process.Continue() - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c deleted file mode 100644 index 40e45dc9f5c317..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/main.c +++ /dev/null @@ -1,14 +0,0 @@ -volatile int false_condition = 0; - -static void start_recording() {} - -static void trigger_breakpoint() {} - -static void stop_recording() {} - -int main() { - start_recording(); - trigger_breakpoint(); - stop_recording(); - return 0; -} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 211fd34957f496..558f889c4b7f23 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,9 +1045,6 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; - case lldb::eStopReasonHistoryBoundary: - body.try_emplace("reason", "history boundary"); - break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 1c5e3ac7008727..b38833c0fdb6b6 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,7 +111,6 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: - case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From 2647505027d8c01fc920b04aced8cec742a4b2ed Mon Sep 17 00:00:00 2001 From: Finn Plummer 
<50529406+inbelic@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:34:26 -0700 Subject: [PATCH 100/177] [HLSL] Implement the `degrees` intrinsic (#111209) - add degrees builtin - link degrees api in hlsl_intrinsics.h - add degrees intrinsic to IntrinsicsDirectX.td - add degrees intrinsic to IntrinsicsSPIRV.td - add lowering from clang builtin to dx/spv intrinsics in CGBuiltin.cpp - add semantic checks to SemaHLSL.cpp - add expansion of directx intrinsic to llvm fmul for DirectX in DXILIntrinsicExpansion.cpp - add mapping to spir-v intrinsic in SPIRVInstructionSelector.cpp - add test coverage: - degrees.hlsl -> check hlsl lowering to dx/spv degrees intrinsics - degrees-errors.hlsl/half-float-only-errors -> check semantic warnings - hlsl-intrinsics/degrees.ll -> check lowering of spir-v degrees intrinsic to SPIR-V backend - DirectX/degrees.ll -> check expansion and scalarization of directx degrees intrinsic to fmul Resolves #99104 --- clang/include/clang/Basic/Builtins.td | 6 ++ clang/lib/CodeGen/CGBuiltin.cpp | 10 +++ clang/lib/CodeGen/CGHLSLRuntime.h | 1 + clang/lib/Headers/hlsl/hlsl_intrinsics.h | 30 +++++++++ clang/lib/Sema/SemaHLSL.cpp | 1 + clang/test/CodeGenHLSL/builtins/degrees.hlsl | 64 +++++++++++++++++++ .../SemaHLSL/BuiltIns/degrees-errors.hlsl | 26 ++++++++ .../BuiltIns/half-float-only-errors.hlsl | 1 + llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + .../Target/DirectX/DXILIntrinsicExpansion.cpp | 12 ++++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 2 + llvm/test/CodeGen/DirectX/degrees.ll | 54 ++++++++++++++++ .../CodeGen/SPIRV/hlsl-intrinsics/degrees.ll | 52 +++++++++++++++ llvm/test/CodeGen/SPIRV/opencl/degrees.ll | 50 +++++++++++++++ 15 files changed, 311 insertions(+) create mode 100644 clang/test/CodeGenHLSL/builtins/degrees.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/degrees.ll create mode 100644 
llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll create mode 100644 llvm/test/CodeGen/SPIRV/opencl/degrees.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 9ebee81fcb0d3d..7068473a0e12ac 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4745,6 +4745,12 @@ def HLSLCross: LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLDegrees : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_degrees"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_dot"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 06140d6d4ce27b..ff678ee04f9c2a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18755,6 +18755,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, CGM.getHLSLRuntime().getNormalizeIntrinsic(), ArrayRef{X}, nullptr, "hlsl.normalize"); } + case Builtin::BI__builtin_hlsl_elementwise_degrees: { + Value *X = EmitScalarExpr(E->getArg(0)); + + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + "degree operand must have a float representation"); + + return Builder.CreateIntrinsic( + /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(), + ArrayRef{X}, nullptr, "hlsl.degrees"); + } case Builtin::BI__builtin_hlsl_elementwise_frac: { Value *Op0 = EmitScalarExpr(E->getArg(0)); if (!E->getArg(0)->getType()->hasFloatingRepresentation()) diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 05ff325216f55b..282fa44af212fb 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -75,6 +75,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(All, all) GENERATE_HLSL_INTRINSIC_FUNCTION(Any, any) 
GENERATE_HLSL_INTRINSIC_FUNCTION(Cross, cross) + GENERATE_HLSL_INTRINSIC_FUNCTION(Degrees, degrees) GENERATE_HLSL_INTRINSIC_FUNCTION(Frac, frac) GENERATE_HLSL_INTRINSIC_FUNCTION(Length, length) GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp) diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 813f8a317bf6bf..137467e5a782ce 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -766,6 +766,36 @@ uint64_t3 countbits(uint64_t3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) uint64_t4 countbits(uint64_t4); +//===----------------------------------------------------------------------===// +// degrees builtins +//===----------------------------------------------------------------------===// + +/// \fn T degrees(T x) +/// \brief Converts the specified value from radians to degrees. +/// \param x The specified input value. + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half degrees(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half2 degrees(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half3 degrees(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half4 degrees(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float degrees(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float2 degrees(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float3 degrees(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float4 degrees(float4); + //===----------------------------------------------------------------------===// // dot product builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp 
b/clang/lib/Sema/SemaHLSL.cpp index b0acbbbbb2b1f0..137b15c8fcfe98 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1896,6 +1896,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_elementwise_degrees: case Builtin::BI__builtin_hlsl_elementwise_radians: case Builtin::BI__builtin_hlsl_elementwise_rsqrt: case Builtin::BI__builtin_hlsl_elementwise_frac: { diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl new file mode 100644 index 00000000000000..9e131f4badc19a --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] half @ +// NATIVE_HALF: %hlsl.degrees = call half @llvm.[[TARGET]].degrees.f16( +// NATIVE_HALF: ret half %hlsl.degrees +// NO_HALF: define [[FNATTRS]] float @ +// NO_HALF: %hlsl.degrees = call float 
@llvm.[[TARGET]].degrees.f32( +// NO_HALF: ret float %hlsl.degrees +half test_degrees_half(half p0) { return degrees(p0); } +// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ +// NATIVE_HALF: %hlsl.degrees = call <2 x half> @llvm.[[TARGET]].degrees.v2f16 +// NATIVE_HALF: ret <2 x half> %hlsl.degrees +// NO_HALF: define [[FNATTRS]] <2 x float> @ +// NO_HALF: %hlsl.degrees = call <2 x float> @llvm.[[TARGET]].degrees.v2f32( +// NO_HALF: ret <2 x float> %hlsl.degrees +half2 test_degrees_half2(half2 p0) { return degrees(p0); } +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: %hlsl.degrees = call <3 x half> @llvm.[[TARGET]].degrees.v3f16 +// NATIVE_HALF: ret <3 x half> %hlsl.degrees +// NO_HALF: define [[FNATTRS]] <3 x float> @ +// NO_HALF: %hlsl.degrees = call <3 x float> @llvm.[[TARGET]].degrees.v3f32( +// NO_HALF: ret <3 x float> %hlsl.degrees +half3 test_degrees_half3(half3 p0) { return degrees(p0); } +// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ +// NATIVE_HALF: %hlsl.degrees = call <4 x half> @llvm.[[TARGET]].degrees.v4f16 +// NATIVE_HALF: ret <4 x half> %hlsl.degrees +// NO_HALF: define [[FNATTRS]] <4 x float> @ +// NO_HALF: %hlsl.degrees = call <4 x float> @llvm.[[TARGET]].degrees.v4f32( +// NO_HALF: ret <4 x float> %hlsl.degrees +half4 test_degrees_half4(half4 p0) { return degrees(p0); } + +// CHECK: define [[FNATTRS]] float @ +// CHECK: %hlsl.degrees = call float @llvm.[[TARGET]].degrees.f32( +// CHECK: ret float %hlsl.degrees +float test_degrees_float(float p0) { return degrees(p0); } +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %hlsl.degrees = call <2 x float> @llvm.[[TARGET]].degrees.v2f32 +// CHECK: ret <2 x float> %hlsl.degrees +float2 test_degrees_float2(float2 p0) { return degrees(p0); } +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.degrees = call <3 x float> @llvm.[[TARGET]].degrees.v3f32 +// CHECK: ret <3 x float> %hlsl.degrees +float3 test_degrees_float3(float3 p0) { return degrees(p0); } +// CHECK: define 
[[FNATTRS]] <4 x float> @ +// CHECK: %hlsl.degrees = call <4 x float> @llvm.[[TARGET]].degrees.v4f32 +// CHECK: ret <4 x float> %hlsl.degrees +float4 test_degrees_float4(float4 p0) { return degrees(p0); } diff --git a/clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl new file mode 100644 index 00000000000000..9e981f6973572d --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +float test_too_few_arg() { + return __builtin_hlsl_elementwise_degrees(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_elementwise_degrees(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +float builtin_bool_to_float_type_promotion(bool p1) { + return __builtin_hlsl_elementwise_degrees(p1); + // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} +} + +float builtin_degrees_int_to_float_promotion(int p1) { + return __builtin_hlsl_elementwise_degrees(p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +float2 builtin_degrees_int2_to_float2_promotion(int2 p1) { + return __builtin_hlsl_elementwise_degrees(p1); + // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl index 2cecf7aeb00e46..cdd130052b6a67 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl @@ -17,6 +17,7 @@ // RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians double test_double_builtin(double p0) { diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 1cf6acbf126475..45aea1ccdb6d4c 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -71,6 +71,7 @@ def int_dx_udot : [IntrNoMem, Commutative] >; def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 92d2f67399d263..3d61456589ee0d 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -62,6 +62,7 @@ let TargetPrefix = "spv" in { def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>; def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], 
[IntrNoMem]>; def int_spv_cross : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_spv_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_frac : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem] >; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 1e84a7216013da..fb5383b3514a5a 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -56,6 +56,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_clamp: case Intrinsic::dx_cross: case Intrinsic::dx_uclamp: + case Intrinsic::dx_degrees: case Intrinsic::dx_lerp: case Intrinsic::dx_length: case Intrinsic::dx_normalize: @@ -490,6 +491,14 @@ static Value *expandClampIntrinsic(CallInst *Orig, {MaxCall, Max}, nullptr, "dx.min"); } +static Value *expandDegreesIntrinsic(CallInst *Orig) { + Value *X = Orig->getOperand(0); + Type *Ty = X->getType(); + IRBuilder<> Builder(Orig); + Value *DegreesRatio = ConstantFP::get(Ty, 180.0 * llvm::numbers::inv_pi); + return Builder.CreateFMul(X, DegreesRatio); +} + static Value *expandSignIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); Type *Ty = X->getType(); @@ -549,6 +558,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::dx_clamp: Result = expandClampIntrinsic(Orig, IntrinsicId); break; + case Intrinsic::dx_degrees: + Result = expandDegreesIntrinsic(Orig); + break; case Intrinsic::dx_lerp: Result = expandLerpIntrinsic(Orig); break; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index e8b769b6fd6900..fd92346717c415 100644 --- 
a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -2513,6 +2513,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::mix, GL::FMix); case Intrinsic::spv_length: return selectExtInst(ResVReg, ResType, I, CL::length, GL::Length); + case Intrinsic::spv_degrees: + return selectExtInst(ResVReg, ResType, I, CL::degrees, GL::Degrees); case Intrinsic::spv_frac: return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: diff --git a/llvm/test/CodeGen/DirectX/degrees.ll b/llvm/test/CodeGen/DirectX/degrees.ll new file mode 100644 index 00000000000000..b38ac13d5f24e2 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/degrees.ll @@ -0,0 +1,54 @@ +; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; Make sure dxil op function calls for degrees are expanded and lowered as fmul for float and half. 
+ +define noundef half @degrees_half(half noundef %a) { +; CHECK-LABEL: define noundef half @degrees_half( +; CHECK-SAME: half noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DX_DEGREES1:%.*]] = fmul half [[A]], 0xH5329 +; CHECK-NEXT: ret half [[DX_DEGREES1]] +; +entry: + %dx.degrees = call half @llvm.dx.degrees.f16(half %a) + ret half %dx.degrees +} + +define noundef float @degrees_float(float noundef %a) #0 { +; CHECK-LABEL: define noundef float @degrees_float( +; CHECK-SAME: float noundef [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DEGREES:%.*]] = fmul float [[A]], 0x404CA5DC20000000 +; CHECK-NEXT: ret float [[DEGREES]] +; +entry: + %dx.degrees = call float @llvm.dx.degrees.f32(float %a) + ret float %dx.degrees +} + +define noundef <4 x float> @degrees_float4(<4 x float> noundef %a) #0 { +; CHECK-LABEL: define noundef <4 x float> @degrees_float4( +; CHECK-SAME: <4 x float> noundef [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A]], i64 0 +; CHECK-NEXT: [[DEGREES_A0:%.*]] = fmul float [[A0]], 0x404CA5DC20000000 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i64 1 +; CHECK-NEXT: [[DEGREES_A1:%.*]] = fmul float [[A1]], 0x404CA5DC20000000 +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i64 2 +; CHECK-NEXT: [[DEGREES_A2:%.*]] = fmul float [[A2]], 0x404CA5DC20000000 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 +; CHECK-NEXT: [[DEGREES_A3:%.*]] = fmul float [[A3]], 0x404CA5DC20000000 +; CHECK-NEXT: [[INSERT_0:%.*]] = insertelement <4 x float> poison, float [[DEGREES_A0]], i64 0 +; CHECK-NEXT: [[INSERT_1:%.*]] = insertelement <4 x float> [[INSERT_0]], float [[DEGREES_A1]], i64 1 +; CHECK-NEXT: [[INSERT_2:%.*]] = insertelement <4 x float> [[INSERT_1]], float [[DEGREES_A2]], i64 2 +; CHECK-NEXT: [[RES:%.*]] = insertelement <4 x float> [[INSERT_2]], float [[DEGREES_A3]], i64 3 +; CHECK-NEXT: ret <4 x float> [[RES]] +; +entry: + %2 = call <4 x float> 
@llvm.dx.degrees.v4f32(<4 x float> %a) + ret <4 x float> %2 +} + +declare half @llvm.dx.degrees.f16(half) +declare float @llvm.dx.degrees.f32(float) +declare <4 x float> @llvm.dx.degrees.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll new file mode 100644 index 00000000000000..533bcca6f62169 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll @@ -0,0 +1,52 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" + +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 + +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +; CHECK-LABEL: Begin function degrees_float +define noundef float @degrees_float(float noundef %a) { +entry: +; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]] +; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Degrees %[[#float_32_arg]] + %elt.degrees = call float @llvm.spv.degrees.f32(float %a) + ret float %elt.degrees +} + +; CHECK-LABEL: Begin function degrees_half +define noundef half @degrees_half(half noundef %a) { +entry: +; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Degrees %[[#float_16_arg]] + %elt.degrees = call half @llvm.spv.degrees.f16(half %a) + ret half %elt.degrees +} + +; CHECK-LABEL: Begin function degrees_float_vector +define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) { +entry: +; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Degrees %[[#vec4_float_32_arg]] + %elt.degrees = 
call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a) + ret <4 x float> %elt.degrees +} + +; CHECK-LABEL: Begin function degrees_half_vector +define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) { +entry: +; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Degrees %[[#vec4_float_16_arg]] + %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a) + ret <4 x half> %elt.degrees +} + +declare half @llvm.spv.degrees.f16(half) +declare float @llvm.spv.degrees.f32(float) + +declare <4 x float> @llvm.spv.degrees.v4f32(<4 x float>) +declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>) diff --git a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll new file mode 100644 index 00000000000000..88f97835fe7194 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll @@ -0,0 +1,50 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std" + +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 + +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +declare half @llvm.spv.degrees.f16(half) +declare float @llvm.spv.degrees.f32(float) + +declare <4 x float> @llvm.spv.degrees.v4f32(<4 x float>) +declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>) + +define noundef float @degrees_float(float noundef %a) { +entry: +; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]] +; CHECK: %[[#]] = 
OpExtInst %[[#float_32]] %[[#op_ext_glsl]] degrees %[[#float_32_arg]] + %elt.degrees = call float @llvm.spv.degrees.f32(float %a) + ret float %elt.degrees +} + +define noundef half @degrees_half(half noundef %a) { +entry: +; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] degrees %[[#float_16_arg]] + %elt.degrees = call half @llvm.spv.degrees.f16(half %a) + ret half %elt.degrees +} + +define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) { +entry: +; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] degrees %[[#vec4_float_32_arg]] + %elt.degrees = call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a) + ret <4 x float> %elt.degrees +} + +define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) { +entry: +; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] degrees %[[#vec4_float_16_arg]] + %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a) + ret <4 x half> %elt.degrees +} From 6640dac22b567e5f6c328ca56cf9bf43d45509e6 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 10 Oct 2024 16:43:29 -0700 Subject: [PATCH 101/177] [bazel] Add include-cleaner tests (#111924) --- .../include-cleaner/BUILD.bazel | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel index 28f90efb3ba7f3..5b210ad80c8740 100644 --- a/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel @@ -2,7 +2,9 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +load("@bazel_skylib//rules:expand_template.bzl", "expand_template") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") +load("//llvm:lit_test.bzl", "lit_test", "package_path") package( default_visibility = ["//visibility:public"], @@ -61,3 +63,71 @@ cc_binary( "//llvm:Support", ], ) + +cc_test( + name = "unittests", + srcs = glob(["unittests/*.cpp"]), + deps = [ + ":include_cleaner", + ":include_cleaner_internal", + "//clang:ast", + "//clang:basic", + "//clang:format", + "//clang:frontend", + "//clang:lex", + "//clang:serialization", + "//clang:testing", + "//clang:tooling", + "//clang:tooling_inclusions", + "//llvm:Support", + "//llvm:TestingAnnotations", + "//third-party/unittest:gmock", + "//third-party/unittest:gtest", + ], +) + +LLVM_LIT_PATH_FUNCTION = " " + \ + "# Allow generated file to be relocatable.\n" + \ + "from pathlib import Path\n" + \ + "def path(p):\n" + \ + " p = Path(p)\n" + \ + " if p.exists: return str(p.resolve())\n" + \ + " if not p: return ''\n" + \ + " return str((Path(__file__).parent / p).resolve())\n" + +LIT_SITE_CFG_IN_HEADER = "# Autogenerated, do not edit." 
+ LLVM_LIT_PATH_FUNCTION + +expand_template( + name = "lit_site_cfg_py", + testonly = True, + out = "test/lit.site.cfg.py", + substitutions = { + "@CMAKE_CURRENT_BINARY_DIR@": package_path("//clang-tools-extra/include-cleaner:BUILD") + "/test", + "@CMAKE_CURRENT_SOURCE_DIR@": package_path("//clang-tools-extra/include-cleaner:BUILD") + "/test", + "@CURRENT_TOOLS_DIR@": package_path("//clang-tools-extra/include-cleaner:BUILD"), + "@LIT_SITE_CFG_IN_HEADER@": LIT_SITE_CFG_IN_HEADER, + "@LLVM_LIBS_DIR@": package_path("//llvm:BUILD"), + "@LLVM_LIT_TOOLS_DIR@": package_path("//llvm:BUILD"), + "@LLVM_TOOLS_DIR@": package_path("//llvm:BUILD"), + "@TARGET_TRIPLE@": "", + '"@Python3_EXECUTABLE@"': "sys.executable", + }, + template = "test/lit.site.cfg.py.in", +) + +[ + lit_test( + name = "%s.test" % src, + srcs = [src], + data = glob(["test/Inputs/**/*"]) + [ + "test/lit.cfg.py", + "test/lit.site.cfg.py", + ":clang-include-cleaner", + "//llvm:FileCheck", + "//llvm:count", + "//llvm:not", + ], + args = ["-svv"], + ) + for src in glob(["test/*.cpp"]) +] From 1037f577bd66ab03bc494120f024f2a52008e285 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Fri, 11 Oct 2024 06:51:27 +0700 Subject: [PATCH 102/177] [lld][elf] Warn if '*' pattern is used multiple times in version scripts (#102669) If this pattern is used more than once in version script(s), only one will have an effect, so it's probably a user error and can be diagnosed. --- lld/ELF/SymbolTable.cpp | 34 ++++++++++++++++++-- lld/test/ELF/version-script-reassign-glob.s | 4 ++- lld/test/ELF/version-script-warn.s | 35 +++++++++++++++++++++ 3 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 lld/test/ELF/version-script-warn.s diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index db8ee8f4d7b3bb..b9ef28f0436f88 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -309,13 +309,43 @@ void SymbolTable::scanVersionScript() { // Then, assign versions to "*". 
In GNU linkers they have lower priority than // other wildcards. + bool globalAsteriskFound = false; + bool localAsteriskFound = false; + bool asteriskReported = false; + auto assignAsterisk = [&](SymbolVersion &pat, VersionDefinition *ver, + bool isLocal) { + // Avoid issuing a warning if both '--retain-symbols-file' and a version + // script with `global: *` are used. + // + // '--retain-symbols-file' adds a "*" pattern to + // 'config->versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns', see + // 'readConfigs()' in 'Driver.cpp'. Note that it is not '.localPatterns', + // and may seem counterintuitive, but still works as expected. Here we can + // exploit that and skip analyzing the pattern added for this option. + if (!asteriskReported && (isLocal || ver->id > VER_NDX_LOCAL)) { + if ((isLocal && globalAsteriskFound) || + (!isLocal && localAsteriskFound)) { + warn("wildcard pattern '*' is used for both 'local' and 'global' " + "scopes in version script"); + asteriskReported = true; + } else if (!isLocal && globalAsteriskFound) { + warn("wildcard pattern '*' is used for multiple version definitions in " + "version script"); + asteriskReported = true; + } else { + localAsteriskFound = isLocal; + globalAsteriskFound = !isLocal; + } + } + assignWildcard(pat, isLocal ?
VER_NDX_LOCAL : ver->id, ver->name); + }; for (VersionDefinition &v : llvm::reverse(ctx.arg.versionDefinitions)) { for (SymbolVersion &pat : v.nonLocalPatterns) if (pat.hasWildcard && pat.name == "*") - assignWildcard(pat, v.id, v.name); + assignAsterisk(pat, &v, false); for (SymbolVersion &pat : v.localPatterns) if (pat.hasWildcard && pat.name == "*") - assignWildcard(pat, VER_NDX_LOCAL, v.name); + assignAsterisk(pat, &v, true); } // Symbol themselves might know their versions because symbols diff --git a/lld/test/ELF/version-script-reassign-glob.s b/lld/test/ELF/version-script-reassign-glob.s index 39d19a26fc4498..8de36467bd8ee6 100644 --- a/lld/test/ELF/version-script-reassign-glob.s +++ b/lld/test/ELF/version-script-reassign-glob.s @@ -10,7 +10,8 @@ # RUN: llvm-readelf --dyn-syms %t.so | FileCheck --check-prefix=BAR %s # RUN: echo 'bar1 { *; }; bar2 { *; };' > %t2.ver -# RUN: ld.lld --version-script %t2.ver %t.o -shared -o %t2.so --fatal-warnings +# RUN: ld.lld --version-script %t2.ver %t.o -shared -o %t2.so 2>&1 | \ +# RUN: FileCheck --check-prefix=DUPWARN %s # RUN: llvm-readelf --dyn-syms %t2.so | FileCheck --check-prefix=BAR2 %s ## If both a non-* glob and a * match, non-* wins. @@ -21,6 +22,7 @@ ## When there are multiple * patterns, the last wins. 
# BAR2: GLOBAL DEFAULT 7 foo@@bar2 +# DUPWARN: warning: wildcard pattern '*' is used for multiple version definitions in version script .globl foo foo: diff --git a/lld/test/ELF/version-script-warn.s b/lld/test/ELF/version-script-warn.s new file mode 100644 index 00000000000000..9aba596165796b --- /dev/null +++ b/lld/test/ELF/version-script-warn.s @@ -0,0 +1,35 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o + +# RUN: echo 'foo { *; }; bar { *; };' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=MULTVER %s + +# RUN: echo '{ global: *; local: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=LOCGLOB %s + +# RUN: echo 'V1 { global: *; }; V2 { local: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=LOCGLOB %s + +# RUN: echo 'V1 { local: *; }; V2 { global: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=LOCGLOB %s + +# RUN: echo 'V1 { local: *; }; V2 { local: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so --fatal-warnings + +## --retain-symbols-file uses the same internal infrastructure as the support +## for version scripts. Do not show the warnings if they both are used.
+# RUN: echo 'foo' > %t_retain.txt +# RUN: echo '{ local: *; };' > %t_local.ver +# RUN: echo '{ global: *; };' > %t_global.ver +# RUN: ld.lld --retain-symbols-file=%t_retain.txt --version-script %t_local.ver %t.o -shared -o %t.so --fatal-warnings +# RUN: ld.lld --retain-symbols-file=%t_retain.txt --version-script %t_global.ver %t.o -shared -o %t.so --fatal-warnings + +# MULTVER: warning: wildcard pattern '*' is used for multiple version definitions in version script +# LOCGLOB: warning: wildcard pattern '*' is used for both 'local' and 'global' scopes in version script + +.globl foo +foo: From 0add1741d58e4b8d6cbc5f50e1fac86296680e5b Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 10 Oct 2024 16:55:41 -0700 Subject: [PATCH 103/177] [bazel] Port e9c8f75d45ababe7f805078bbf7bda2e7425f1b7 (#111928) --- .../bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 96202bf47b8486..38493411addebf 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -2248,6 +2248,8 @@ cc_library( hdrs = glob(["Process/minidump/*.h"]), include_prefix = "Plugins", deps = [ + ":PluginDynamicLoaderPosixDYLD", + ":PluginDynamicLoaderPosixDYLDHeaders", ":PluginObjectFilePlaceholder", ":PluginProcessElfCore", ":PluginProcessUtility", From 774c953cf8f8ff2fe45b07f388a687748b775878 Mon Sep 17 00:00:00 2001 From: yronglin Date: Fri, 11 Oct 2024 08:15:27 +0800 Subject: [PATCH 104/177] [NFC][clang] Fix typo in ReleaseNotes (#111930) Fix a typo in ReleaseNotes that introduced by https://github.com/llvm/llvm-project/pull/86960. 
Signed-off-by: yronglin --- clang/docs/ReleaseNotes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e48835d4738007..df165b91252505 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -172,7 +172,7 @@ C++23 Feature Support - Removed the restriction to literal types in constexpr functions in C++23 mode. - Extend lifetime of temporaries in mem-default-init for P2718R0. Clang now fully - supported `P2718R0 Lifetime extension in range-based for loops `_. + supports `P2718R0 Lifetime extension in range-based for loops `_. C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ From 9c81a2476566b068ef54fd51ab2540933542b2a6 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 17:44:57 -0700 Subject: [PATCH 105/177] [asan] Prevent printing invalid parent thread (#111916) By default reuse can happend only after `UINT32_MAX` threads, so it's almost NFC. --- compiler-rt/lib/asan/asan_descriptions.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 1c2f20a76343bb..674fe9c1e90be0 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -48,9 +48,20 @@ void DescribeThread(AsanThreadContext *context) { return; } context->announced = true; + + AsanThreadContext *parent_context = + context->parent_tid == kInvalidTid + ? nullptr + : GetThreadContextByTidLocked(context->parent_tid); + + // `context->parent_tid` may point to reused slot. Check `unique_id` which + // is always smaller for the parent, always greater for a new user. 
+ if (context->unique_id <= parent_context->unique_id) + parent_context = nullptr; + InternalScopedString str; str.AppendF("Thread %s", AsanThreadIdAndName(context).c_str()); - if (context->parent_tid == kInvalidTid) { + if (!parent_context) { str.Append(" created by unknown thread\n"); Printf("%s", str.data()); return; @@ -60,11 +71,8 @@ void DescribeThread(AsanThreadContext *context) { Printf("%s", str.data()); StackDepotGet(context->stack_id).Print(); // Recursively described parent thread if needed. - if (flags()->print_full_thread_history) { - AsanThreadContext *parent_context = - GetThreadContextByTidLocked(context->parent_tid); + if (flags()->print_full_thread_history) DescribeThread(parent_context); - } } // Shadow descriptions From 72fb37922577997f3666203dbdb2601f0fc97748 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Fri, 11 Oct 2024 08:45:14 +0800 Subject: [PATCH 106/177] AArch64: Select FCANONICALIZE (#104429) FMINNM/FMAXNM instructions of AArch64 follow IEEE754-2008. We can use them to canonicalize a floating point number. And FMINNUM_IEEE/FMAXNUM_IEEE is used by something like expanding FMINIMUMNUM/FMAXIMUMNUM, so let's define them. 
--------- Co-authored-by: Your Name --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 53 +- llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll | 587 ++++++++++++++++++ .../AArch64/fp-maximumnum-minimumnum.ll | 560 +++++++++++++++++ 4 files changed, 1188 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c1aefee3793c96..8a217cd1ec5cf9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -775,6 +775,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, + ISD::FCANONICALIZE, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, @@ -818,6 +819,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32); setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32); setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32); + setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32); setOperationAction(ISD::FABS, V4Narrow, Legal); setOperationAction(ISD::FNEG, V4Narrow, Legal); @@ -851,6 +853,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, V8Narrow, Expand); setOperationAction(ISD::SELECT_CC, V8Narrow, Expand); setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand); + setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32); }; if (!Subtarget->hasFullFP16()) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 33d05d6039b096..325508b62a9f14 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5052,17 +5052,25 @@ def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), 
(v1f64 FPR64:$Rm))), def : Pat<(fminnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)), (FMINNMDrr FPR64:$a, FPR64:$b)>; -def : Pat<(fminnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), - (FMINNMSrr FPR32:$a, FPR32:$b)>; -def : Pat<(fminnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), - (FMINNMHrr FPR16:$a, FPR16:$b)>; def : Pat<(fmaxnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)), (FMAXNMDrr FPR64:$a, FPR64:$b)>; +def : Pat<(f64 (fcanonicalize f64:$a)), + (FMINNMDrr f64:$a, f64:$a)>; +def : Pat<(fminnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), + (FMINNMSrr FPR32:$a, FPR32:$b)>; def : Pat<(fmaxnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), (FMAXNMSrr FPR32:$a, FPR32:$b)>; +def : Pat<(f32 (fcanonicalize f32:$a)), + (FMINNMSrr f32:$a, f32:$a)>; + +let Predicates = [HasFullFP16] in { +def : Pat<(fminnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), + (FMINNMHrr FPR16:$a, FPR16:$b)>; def : Pat<(fmaxnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), (FMAXNMHrr FPR16:$a, FPR16:$b)>; - +def : Pat<(f16 (fcanonicalize f16:$a)), + (FMINNMHrr f16:$a, f16:$a)>; +} //===----------------------------------------------------------------------===// // Floating point three operand instructions. 
//===----------------------------------------------------------------------===// @@ -5567,26 +5575,41 @@ defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>; +let Predicates = [HasNEON] in { def : Pat<(v2f64 (fminnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))), (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>; -def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), - (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; -def : Pat<(v8f16 (fminnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), - (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; -def : Pat<(v2f32 (fminnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), - (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; -def : Pat<(v4f16 (fminnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), - (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; def : Pat<(v2f64 (fmaxnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))), (v2f64 (FMAXNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>; +def : Pat<(v2f64 (fcanonicalize (v2f64 V128:$Rn))), + (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rn)))>; +def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), + (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; def : Pat<(v4f32 (fmaxnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), (v4f32 (FMAXNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; -def : Pat<(v8f16 (fmaxnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), - (v8f16 (FMAXNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v4f32 (fcanonicalize (v4f32 V128:$Rn))), + (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rn)))>; +def : Pat<(v2f32 (fminnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; def : Pat<(v2f32 (fmaxnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), (v2f32 (FMAXNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; +def 
: Pat<(v2f32 (fcanonicalize (v2f32 V64:$Rn))), + (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rn)))>; +} + +let Predicates = [HasNEON, HasFullFP16] in { +def : Pat<(v8f16 (fminnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v8f16 (fmaxnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (v8f16 (FMAXNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v8f16 (fcanonicalize (v8f16 V128:$Rn))), + (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rn)))>; +def : Pat<(v4f16 (fminnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; def : Pat<(v4f16 (fmaxnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), (v4f16 (FMAXNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; +def : Pat<(v4f16 (fcanonicalize (v4f16 V64:$Rn))), + (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rn)))>; +} // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. 
diff --git a/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll new file mode 100644 index 00000000000000..753e2b73433994 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=aarch64 --mattr=-fullfp16,-neon < %s | FileCheck %s --check-prefix=CHECK-NOFP16-NONEON +; RUN: llc --mtriple=aarch64 --mattr=+fullfp16,-neon < %s | FileCheck %s --check-prefix=CHECK-FP16-NONEON +; RUN: llc --mtriple=aarch64 --mattr=-fullfp16,+neon < %s | FileCheck %s --check-prefix=CHECK-NOFP16-NEON +; RUN: llc --mtriple=aarch64 --mattr=+fullfp16,+neon < %s | FileCheck %s --check-prefixes=CHECK-FP16-NEON + +declare half @llvm.fcanonicalize.f16(half) +declare float @llvm.fcanonicalize.f32(float) +declare double @llvm.fcanonicalize.f64(double) + +define half @fcanonicalize_f16(half %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NEON-NEXT: ret + %z = call half @llvm.canonicalize.f16(half %x) + ret half %z +} + +define half @fcanonicalize_f16_nnan(half %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; 
CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan half @llvm.canonicalize.f16(half %x) + ret half %z +} + +define <2 x half> @fcanonicalize_v2f16(<2 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; CHECK-FP16-NEON-NEXT: ret + %z = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %x) + ret <2 x half> %z +} + +define <2 x half> @fcanonicalize_v2f16_nnan(<2 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: 
fcanonicalize_v2f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <2 x half> @llvm.canonicalize.v2f16(<2 x half> %x) + ret <2 x half> %z +} + +define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; 
CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; CHECK-FP16-NEON-NEXT: ret + %z = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %x) + ret <4 x half> %z +} + +define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; 
CHECK-FP16-NEON-NEXT: ret + %z = call nnan <4 x half> @llvm.canonicalize.v4f16(<4 x half> %x) + ret <4 x half> %z +} + +define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v8f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fcvt s4, h4 +; CHECK-NOFP16-NONEON-NEXT: fcvt s5, h5 +; CHECK-NOFP16-NONEON-NEXT: fcvt s6, h6 +; CHECK-NOFP16-NONEON-NEXT: fcvt s7, h7 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s4, s4, s4 +; CHECK-NOFP16-NONEON-NEXT: fminnm s5, s5, s5 +; CHECK-NOFP16-NONEON-NEXT: fminnm s6, s6, s6 +; CHECK-NOFP16-NONEON-NEXT: fminnm s7, s7, s7 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h4, s4 +; CHECK-NOFP16-NONEON-NEXT: fcvt h5, s5 +; CHECK-NOFP16-NONEON-NEXT: fcvt h6, s6 +; CHECK-NOFP16-NONEON-NEXT: fcvt h7, s7 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v8f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: fminnm h4, h4, h4 +; CHECK-FP16-NONEON-NEXT: fminnm h5, h5, h5 +; CHECK-FP16-NONEON-NEXT: fminnm h6, h6, h6 +; CHECK-FP16-NONEON-NEXT: fminnm h7, h7, h7 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v8f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v1.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fcvtl2 v2.4s, v0.8h +; 
CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v1.4s, v1.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v2.4s, v2.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v8f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEON-NEXT: ret + %z = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %x) + ret <8 x half> %z +} + +define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fcvt s4, h4 +; CHECK-NOFP16-NONEON-NEXT: fcvt s5, h5 +; CHECK-NOFP16-NONEON-NEXT: fcvt s6, h6 +; CHECK-NOFP16-NONEON-NEXT: fcvt s7, h7 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s4, s4, s4 +; CHECK-NOFP16-NONEON-NEXT: fminnm s5, s5, s5 +; CHECK-NOFP16-NONEON-NEXT: fminnm s6, s6, s6 +; CHECK-NOFP16-NONEON-NEXT: fminnm s7, s7, s7 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h4, s4 +; CHECK-NOFP16-NONEON-NEXT: fcvt h5, s5 +; CHECK-NOFP16-NONEON-NEXT: fcvt h6, s6 +; CHECK-NOFP16-NONEON-NEXT: fcvt h7, s7 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: fminnm 
h4, h4, h4 +; CHECK-FP16-NONEON-NEXT: fminnm h5, h5, h5 +; CHECK-FP16-NONEON-NEXT: fminnm h6, h6, h6 +; CHECK-FP16-NONEON-NEXT: fminnm h7, h7, h7 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v1.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v1.4s, v1.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v2.4s, v2.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <8 x half> @llvm.canonicalize.v8f16(<8 x half> %x) + ret <8 x half> %z +} + +define float @fcanonicalize_f32(float %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f32: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f32: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f32: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f32: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NEON-NEXT: ret + %z = call float @llvm.canonicalize.f32(float %x) + ret float %z +} + +define float @fcanonicalize_f32_nnan(float %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: ret +; +; 
CHECK-NOFP16-NEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan float @llvm.canonicalize.f32(float %x) + ret float %z +} + +define <2 x float> @fcanonicalize_v2f32(<2 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f32: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f32: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f32: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f32: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-FP16-NEON-NEXT: ret + %z = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %x) + ret <2 x float> %z +} + +define <2 x float> @fcanonicalize_v2f32_nnan(<2 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f32_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f32_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f32_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: 
fcanonicalize_v2f32_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <2 x float> @llvm.canonicalize.v2f32(<2 x float> %x) + ret <2 x float> %z +} + +define <4 x float> @fcanonicalize_v4f32(<4 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f32: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f32: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-FP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f32: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f32: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-FP16-NEON-NEXT: ret + %z = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %x) + ret <4 x float> %z +} + +define <4 x float> @fcanonicalize_v4f32_nnan(<4 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-FP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-FP16-NONEON-NEXT: ret +; +; 
CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <4 x float> @llvm.canonicalize.v4f32(<4 x float> %x) + ret <4 x float> %z +} + +define double @fcanonicalize_f64(double %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f64: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f64: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f64: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f64: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NEON-NEXT: ret + %z = call double @llvm.canonicalize.f64(double %x) + ret double %z +} + +define double @fcanonicalize_f64_nnan(double %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan double @llvm.canonicalize.f64(double %x) + ret double %z +} + +define <2 x double> 
@fcanonicalize_v2f64(<2 x double> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f64: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f64: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f64: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f64: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-FP16-NEON-NEXT: ret + %z = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %x) + ret <2 x double> %z +} + +define <2 x double> @fcanonicalize_v2f64_nnan(<2 x double> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <2 x double> @llvm.canonicalize.v2f64(<2 x double> %x) + ret <2 x double> %z +} diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index b8406179f3cb32..bb3f9a3e52a16b 100644 --- 
a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -472,3 +472,563 @@ entry: %c = call nnan <16 x half> @llvm.minimumnum.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c } + +;;;;;;;;;;;;;;;; max_f64 +define double @max_f64(double %a, double %b) { +; AARCH64-LABEL: max_f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm d1, d1, d1 +; AARCH64-NEXT: fminnm d0, d0, d0 +; AARCH64-NEXT: fmaxnm d0, d0, d1 +; AARCH64-NEXT: ret +entry: + %c = call double @llvm.maximumnum.f64(double %a, double %b) + ret double %c +} + +define <2 x double> @max_v2f64(<2 x double> %a, <2 x double> %b) { +; AARCH64-LABEL: max_v2f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: ret +entry: + %c = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %c +} + +define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { +; AARCH64-LABEL: max_v3f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $d3 killed $d3 def $q3 +; AARCH64-NEXT: // kill: def $d0 killed $d0 def $q0 +; AARCH64-NEXT: // kill: def $d4 killed $d4 def $q4 +; AARCH64-NEXT: // kill: def $d1 killed $d1 def $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 def $q2 +; AARCH64-NEXT: // kill: def $d5 killed $d5 def $q5 +; AARCH64-NEXT: mov v0.d[1], v1.d[0] +; AARCH64-NEXT: mov v3.d[1], v4.d[0] +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v1.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: fminnm v1.2d, v5.2d, v5.2d +; AARCH64-NEXT: fmaxnm v2.2d, v2.2d, v1.2d +; AARCH64-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $q0 +; AARCH64-NEXT: // kill: def $d1 killed $d1 killed $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 killed 
$q2 +; AARCH64-NEXT: ret +entry: + %c = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> %a, <3 x double> %b) + ret <3 x double> %c +} + +define <4 x double> @max_v4f64(<4 x double> %a, <4 x double> %b) { +; AARCH64-LABEL: max_v4f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v3.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v2.2d +; AARCH64-NEXT: fmaxnm v1.2d, v1.2d, v3.2d +; AARCH64-NEXT: ret +entry: + %c = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %a, <4 x double> %b) + ret <4 x double> %c +} + +;;;;;;;;;;;;;;;;;; max_f32 +define float @max_f32(float %a, float %b) { +; AARCH64-LABEL: max_f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm s1, s1, s1 +; AARCH64-NEXT: fminnm s0, s0, s0 +; AARCH64-NEXT: fmaxnm s0, s0, s1 +; AARCH64-NEXT: ret +entry: + %c = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %c +} + +define <2 x float> @max_v2f32(<2 x float> %a, <2 x float> %b) { +; AARCH64-LABEL: max_v2f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2s, v1.2s, v1.2s +; AARCH64-NEXT: fminnm v0.2s, v0.2s, v0.2s +; AARCH64-NEXT: fmaxnm v0.2s, v0.2s, v1.2s +; AARCH64-NEXT: ret +entry: + %c = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %a, <2 x float> %b) + ret <2 x float> %c +} + +define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) { +; AARCH64-LABEL: max_v3f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %c +} + +define <4 x float> @max_v4f32(<4 x float> %a, <4 x float> %b) { +; AARCH64-LABEL: max_v4f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm 
v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %c +} + +define <5 x float> @max_v5f32(<5 x float> %a, <5 x float> %b) { +; AARCH64-LABEL: max_v5f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $s0 killed $s0 def $q0 +; AARCH64-NEXT: // kill: def $s5 killed $s5 def $q5 +; AARCH64-NEXT: // kill: def $s1 killed $s1 def $q1 +; AARCH64-NEXT: // kill: def $s6 killed $s6 def $q6 +; AARCH64-NEXT: // kill: def $s2 killed $s2 def $q2 +; AARCH64-NEXT: // kill: def $s7 killed $s7 def $q7 +; AARCH64-NEXT: // kill: def $s3 killed $s3 def $q3 +; AARCH64-NEXT: mov x8, sp +; AARCH64-NEXT: // kill: def $s4 killed $s4 def $q4 +; AARCH64-NEXT: mov v0.s[1], v1.s[0] +; AARCH64-NEXT: mov v5.s[1], v6.s[0] +; AARCH64-NEXT: mov v0.s[2], v2.s[0] +; AARCH64-NEXT: mov v5.s[2], v7.s[0] +; AARCH64-NEXT: ldr s2, [sp, #8] +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: mov v0.s[3], v3.s[0] +; AARCH64-NEXT: ld1 { v5.s }[3], [x8] +; AARCH64-NEXT: fminnm v3.4s, v4.4s, v4.4s +; AARCH64-NEXT: fminnm v1.4s, v5.4s, v5.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fmaxnm v4.4s, v3.4s, v2.4s +; AARCH64-NEXT: // kill: def $s4 killed $s4 killed $q4 +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: mov s1, v0.s[1] +; AARCH64-NEXT: mov s2, v0.s[2] +; AARCH64-NEXT: mov s3, v0.s[3] +; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $q0 +; AARCH64-NEXT: ret +entry: + %c = call <5 x float> @llvm.maximumnum.v5f32(<5 x float> %a, <5 x float> %b) + ret <5 x float> %c +} + +define <8 x float> @max_v8f32(<8 x float> %a, <8 x float> %b) { +; AARCH64-LABEL: max_v8f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v3.4s, v3.4s, v3.4s +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v2.4s +; 
AARCH64-NEXT: fmaxnm v1.4s, v1.4s, v3.4s +; AARCH64-NEXT: ret +entry: + %c = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> %a, <8 x float> %b) + ret <8 x float> %c +} + +;;;;;;;;;;;;;;;;;; max_f16 +define half @max_f16(half %a, half %b) { +; AARCH64-LABEL: max_f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm h1, h1, h1 +; AARCH64-NEXT: fminnm h0, h0, h0 +; AARCH64-NEXT: fmaxnm h0, h0, h1 +; AARCH64-NEXT: ret +entry: + %c = call half @llvm.maximumnum.f16(half %a, half %b) + ret half %c +} + +define <2 x half> @max_v2f16(<2 x half> %a, <2 x half> %b) { +; AARCH64-LABEL: max_v2f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %c +} + +define <4 x half> @max_v4f16(<4 x half> %a, <4 x half> %b) { +; AARCH64-LABEL: max_v4f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %c +} + +define <8 x half> @max_v8f16(<8 x half> %a, <8 x half> %b) { +; AARCH64-LABEL: max_v8f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: ret +entry: + %c = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %c +} + +define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { +; AARCH64-LABEL: max_v9f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 +; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 +; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 +; AARCH64-NEXT: add x9, sp, #16 +; AARCH64-NEXT: 
// kill: def $h3 killed $h3 def $q3 +; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 +; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 +; AARCH64-NEXT: // kill: def $h6 killed $h6 def $q6 +; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 +; AARCH64-NEXT: mov v0.h[1], v1.h[0] +; AARCH64-NEXT: ldr h1, [sp, #8] +; AARCH64-NEXT: ld1 { v1.h }[1], [x9] +; AARCH64-NEXT: add x9, sp, #24 +; AARCH64-NEXT: mov v0.h[2], v2.h[0] +; AARCH64-NEXT: ldr h2, [sp] +; AARCH64-NEXT: ld1 { v1.h }[2], [x9] +; AARCH64-NEXT: add x9, sp, #32 +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: mov v0.h[3], v3.h[0] +; AARCH64-NEXT: ld1 { v1.h }[3], [x9] +; AARCH64-NEXT: add x9, sp, #40 +; AARCH64-NEXT: ldr h3, [sp, #72] +; AARCH64-NEXT: ld1 { v1.h }[4], [x9] +; AARCH64-NEXT: add x9, sp, #48 +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: mov v0.h[4], v4.h[0] +; AARCH64-NEXT: ld1 { v1.h }[5], [x9] +; AARCH64-NEXT: add x9, sp, #56 +; AARCH64-NEXT: fmaxnm v2.8h, v2.8h, v3.8h +; AARCH64-NEXT: mov v0.h[5], v5.h[0] +; AARCH64-NEXT: ld1 { v1.h }[6], [x9] +; AARCH64-NEXT: add x9, sp, #64 +; AARCH64-NEXT: str h2, [x8, #16] +; AARCH64-NEXT: mov v0.h[6], v6.h[0] +; AARCH64-NEXT: ld1 { v1.h }[7], [x9] +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: mov v0.h[7], v7.h[0] +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: str q0, [x8] +; AARCH64-NEXT: ret +entry: + %c = call <9 x half> @llvm.maximumnum.v9f16(<9 x half> %a, <9 x half> %b) + ret <9 x half> %c +} + +define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) { +; AARCH64-LABEL: max_v16f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v2.8h +; AARCH64-NEXT: fmaxnm v1.8h, v1.8h, v3.8h +; AARCH64-NEXT: ret +entry: + %c = call <16 x half> 
@llvm.maximumnum.v16f16(<16 x half> %a, <16 x half> %b) + ret <16 x half> %c +} + +;;;;;;;;;;;;;;;; min_f64 +define double @min_f64(double %a, double %b) { +; AARCH64-LABEL: min_f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm d1, d1, d1 +; AARCH64-NEXT: fminnm d0, d0, d0 +; AARCH64-NEXT: fminnm d0, d0, d1 +; AARCH64-NEXT: ret +entry: + %c = call double @llvm.minimumnum.f64(double %a, double %b) + ret double %c +} + +define <2 x double> @min_v2f64(<2 x double> %a, <2 x double> %b) { +; AARCH64-LABEL: min_v2f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: ret +entry: + %c = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %c +} + +define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) { +; AARCH64-LABEL: min_v3f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $d3 killed $d3 def $q3 +; AARCH64-NEXT: // kill: def $d0 killed $d0 def $q0 +; AARCH64-NEXT: // kill: def $d4 killed $d4 def $q4 +; AARCH64-NEXT: // kill: def $d1 killed $d1 def $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 def $q2 +; AARCH64-NEXT: // kill: def $d5 killed $d5 def $q5 +; AARCH64-NEXT: mov v0.d[1], v1.d[0] +; AARCH64-NEXT: mov v3.d[1], v4.d[0] +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v1.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: fminnm v1.2d, v5.2d, v5.2d +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v1.2d +; AARCH64-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $q0 +; AARCH64-NEXT: // kill: def $d1 killed $d1 killed $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 killed $q2 +; AARCH64-NEXT: ret +entry: + %c = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> %a, <3 x double> %b) + ret <3 x double> %c +} + +define <4 x double> 
@min_v4f64(<4 x double> %a, <4 x double> %b) { +; AARCH64-LABEL: min_v4f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v3.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v2.2d +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v3.2d +; AARCH64-NEXT: ret +entry: + %c = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %a, <4 x double> %b) + ret <4 x double> %c +} + +;;;;;;;;;;;;;;;;;; min_f32 +define float @min_f32(float %a, float %b) { +; AARCH64-LABEL: min_f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm s1, s1, s1 +; AARCH64-NEXT: fminnm s0, s0, s0 +; AARCH64-NEXT: fminnm s0, s0, s1 +; AARCH64-NEXT: ret +entry: + %c = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %c +} + +define <2 x float> @min_v2f32(<2 x float> %a, <2 x float> %b) { +; AARCH64-LABEL: min_v2f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2s, v1.2s, v1.2s +; AARCH64-NEXT: fminnm v0.2s, v0.2s, v0.2s +; AARCH64-NEXT: fminnm v0.2s, v0.2s, v1.2s +; AARCH64-NEXT: ret +entry: + %c = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %a, <2 x float> %b) + ret <2 x float> %c +} + +define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) { +; AARCH64-LABEL: min_v3f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %c +} + +define <4 x float> @min_v4f32(<4 x float> %a, <4 x float> %b) { +; AARCH64-LABEL: min_v4f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x 
float> %b) + ret <4 x float> %c +} + +define <5 x float> @min_v5f32(<5 x float> %a, <5 x float> %b) { +; AARCH64-LABEL: min_v5f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $s0 killed $s0 def $q0 +; AARCH64-NEXT: // kill: def $s5 killed $s5 def $q5 +; AARCH64-NEXT: // kill: def $s1 killed $s1 def $q1 +; AARCH64-NEXT: // kill: def $s6 killed $s6 def $q6 +; AARCH64-NEXT: // kill: def $s2 killed $s2 def $q2 +; AARCH64-NEXT: // kill: def $s7 killed $s7 def $q7 +; AARCH64-NEXT: // kill: def $s3 killed $s3 def $q3 +; AARCH64-NEXT: mov x8, sp +; AARCH64-NEXT: // kill: def $s4 killed $s4 def $q4 +; AARCH64-NEXT: mov v0.s[1], v1.s[0] +; AARCH64-NEXT: mov v5.s[1], v6.s[0] +; AARCH64-NEXT: mov v0.s[2], v2.s[0] +; AARCH64-NEXT: mov v5.s[2], v7.s[0] +; AARCH64-NEXT: ldr s2, [sp, #8] +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: mov v0.s[3], v3.s[0] +; AARCH64-NEXT: ld1 { v5.s }[3], [x8] +; AARCH64-NEXT: fminnm v3.4s, v4.4s, v4.4s +; AARCH64-NEXT: fminnm v1.4s, v5.4s, v5.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v4.4s, v3.4s, v2.4s +; AARCH64-NEXT: // kill: def $s4 killed $s4 killed $q4 +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: mov s1, v0.s[1] +; AARCH64-NEXT: mov s2, v0.s[2] +; AARCH64-NEXT: mov s3, v0.s[3] +; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $q0 +; AARCH64-NEXT: ret +entry: + %c = call <5 x float> @llvm.minimumnum.v5f32(<5 x float> %a, <5 x float> %b) + ret <5 x float> %c +} + +define <8 x float> @min_v8f32(<8 x float> %a, <8 x float> %b) { +; AARCH64-LABEL: min_v8f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v3.4s, v3.4s, v3.4s +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v2.4s +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v3.4s +; AARCH64-NEXT: ret +entry: + %c = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> %a, <8 x float> %b) + ret <8 x 
float> %c +} + +;;;;;;;;;;;;;;;;;; min_f16 +define half @min_f16(half %a, half %b) { +; AARCH64-LABEL: min_f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm h1, h1, h1 +; AARCH64-NEXT: fminnm h0, h0, h0 +; AARCH64-NEXT: fminnm h0, h0, h1 +; AARCH64-NEXT: ret +entry: + %c = call half @llvm.minimumnum.f16(half %a, half %b) + ret half %c +} + +define <2 x half> @min_v2f16(<2 x half> %a, <2 x half> %b) { +; AARCH64-LABEL: min_v2f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %c +} + +define <4 x half> @min_v4f16(<4 x half> %a, <4 x half> %b) { +; AARCH64-LABEL: min_v4f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %c +} + +define <8 x half> @min_v8f16(<8 x half> %a, <8 x half> %b) { +; AARCH64-LABEL: min_v8f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: ret +entry: + %c = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %c +} + +define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { +; AARCH64-LABEL: min_v9f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 +; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 +; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 +; AARCH64-NEXT: add x9, sp, #16 +; AARCH64-NEXT: // kill: def $h3 killed $h3 def $q3 +; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 +; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 +; AARCH64-NEXT: 
// kill: def $h6 killed $h6 def $q6 +; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 +; AARCH64-NEXT: mov v0.h[1], v1.h[0] +; AARCH64-NEXT: ldr h1, [sp, #8] +; AARCH64-NEXT: ld1 { v1.h }[1], [x9] +; AARCH64-NEXT: add x9, sp, #24 +; AARCH64-NEXT: mov v0.h[2], v2.h[0] +; AARCH64-NEXT: ldr h2, [sp] +; AARCH64-NEXT: ld1 { v1.h }[2], [x9] +; AARCH64-NEXT: add x9, sp, #32 +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: mov v0.h[3], v3.h[0] +; AARCH64-NEXT: ld1 { v1.h }[3], [x9] +; AARCH64-NEXT: add x9, sp, #40 +; AARCH64-NEXT: ldr h3, [sp, #72] +; AARCH64-NEXT: ld1 { v1.h }[4], [x9] +; AARCH64-NEXT: add x9, sp, #48 +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: mov v0.h[4], v4.h[0] +; AARCH64-NEXT: ld1 { v1.h }[5], [x9] +; AARCH64-NEXT: add x9, sp, #56 +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v3.8h +; AARCH64-NEXT: mov v0.h[5], v5.h[0] +; AARCH64-NEXT: ld1 { v1.h }[6], [x9] +; AARCH64-NEXT: add x9, sp, #64 +; AARCH64-NEXT: str h2, [x8, #16] +; AARCH64-NEXT: mov v0.h[6], v6.h[0] +; AARCH64-NEXT: ld1 { v1.h }[7], [x9] +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: mov v0.h[7], v7.h[0] +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: str q0, [x8] +; AARCH64-NEXT: ret +entry: + %c = call <9 x half> @llvm.minimumnum.v9f16(<9 x half> %a, <9 x half> %b) + ret <9 x half> %c +} + +define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) { +; AARCH64-LABEL: min_v16f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v2.8h +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v3.8h +; AARCH64-NEXT: ret +entry: + %c = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %a, <16 x half> %b) + ret <16 x half> %c +} From 6c398abb75da5413152f97a780ddb3b3b2b6a0b7 Mon Sep 17 00:00:00 2001 From: Caio Oliveira 
Date: Thu, 10 Oct 2024 18:08:18 -0700 Subject: [PATCH 107/177] [NFC][mlir][spirv] Fix syntax warnings in gen_spirv_dialect.py (#111775) In the context of regular expressions, Python (used to) gracefully ignore the escape behavior of `\` in some contexts, e.g. for representing the regular expression `\w+`. However in newer versions of Python this now gives a warning in the form ``` SyntaxWarning: invalid escape sequence '\w' ``` Fix by explicitly using raw strings instead. --- mlir/utils/spirv/gen_spirv_dialect.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/utils/spirv/gen_spirv_dialect.py b/mlir/utils/spirv/gen_spirv_dialect.py index 78c1022428d8a1..6d82c012158196 100755 --- a/mlir/utils/spirv/gen_spirv_dialect.py +++ b/mlir/utils/spirv/gen_spirv_dialect.py @@ -538,7 +538,7 @@ def gen_instr_coverage_report(path, instructions): prefix = "def SPIRV_OC_" existing_opcodes = [ - k[len(prefix) :] for k in re.findall(prefix + "\w+", content[1]) + k[len(prefix) :] for k in re.findall(prefix + r"\w+", content[1]) ] existing_instructions = list( filter(lambda inst: (inst["opname"] in existing_opcodes), instructions) @@ -597,7 +597,7 @@ def update_td_opcodes(path, instructions, filter_list): # Extend opcode list with existing list prefix = "def SPIRV_OC_" existing_opcodes = [ - k[len(prefix) :] for k in re.findall(prefix + "\w+", content[1]) + k[len(prefix) :] for k in re.findall(prefix + r"\w+", content[1]) ] filter_list.extend(existing_opcodes) filter_list = list(set(filter_list)) @@ -644,7 +644,7 @@ def update_td_enum_attrs(path, operand_kinds, filter_list): suffix = "Attr" existing_kinds = [ k[len(prefix) : -len(suffix)] - for k in re.findall(prefix + "\w+" + suffix, content[1]) + for k in re.findall(prefix + r"\w+" + suffix, content[1]) ] filter_list.extend(existing_kinds) @@ -971,7 +971,7 @@ def extract_td_op_info(op_def): suffix = "Op" opname = [ o[len(prefix) : -len(suffix)] - for o in re.findall(prefix + "\w+" + suffix, op_def) + 
for o in re.findall(prefix + r"\w+" + suffix, op_def) ] assert len(opname) == 1, "more than one ops in the same section!" opname = opname[0] @@ -979,7 +979,7 @@ def extract_td_op_info(op_def): # Get instruction category prefix = "SPIRV_" inst_category = [ - o[len(prefix) :] for o in re.findall(prefix + "\w+Op", op_def.split(":", 1)[1]) + o[len(prefix) :] for o in re.findall(prefix + r"\w+Op", op_def.split(":", 1)[1]) ] assert len(inst_category) <= 1, "more than one ops in the same section!" inst_category = inst_category[0] if len(inst_category) == 1 else "Op" From e3894f58e1a534c57f53b3beb21d6b2f0d3382b2 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 09:08:38 +0800 Subject: [PATCH 108/177] [CodeGenPrepare] Convert `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` (#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After https://github.com/llvm/llvm-project/pull/100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix https://github.com/llvm/llvm-project/issues/95255. --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 28 ++++++++ llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 68 +++++++++++++++++-- llvm/test/CodeGen/RISCV/rv32zbb.ll | 39 +++++++++++ llvm/test/CodeGen/RISCV/rv64zbb.ll | 81 +++++++++++++++++++++++ llvm/test/CodeGen/X86/ispow2.ll | 45 ++++++++++++- llvm/test/CodeGen/X86/known-never-zero.ll | 12 ++-- 6 files changed, 258 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 3e09fbad6ab198..86f28293ba9ff8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2111,6 +2111,31 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { return false; } +/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. 
+/// This function converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` if the +/// result cannot be zero. +static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI, + const TargetTransformInfo &TTI, + const DataLayout &DL) { + ICmpInst::Predicate Pred; + if (!match(Cmp, m_ICmp(Pred, m_Intrinsic(), m_One()))) + return false; + if (!ICmpInst::isEquality(Pred)) + return false; + auto *II = cast(Cmp->getOperand(0)); + + if (isKnownNonZero(II, DL)) { + if (Pred == ICmpInst::ICMP_EQ) { + Cmp->setOperand(1, ConstantInt::get(II->getType(), 2)); + Cmp->setPredicate(ICmpInst::ICMP_ULT); + } else { + Cmp->setPredicate(ICmpInst::ICMP_UGT); + } + return true; + } + return false; +} + bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -2130,6 +2155,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (foldFCmpToFPClassTest(Cmp, *TLI, *DL)) return true; + if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL)) + return true; + return false; } diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index f5ce73a366125b..0030e9ce80abb4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -15,7 +15,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt32_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -50,7 +50,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) { ; CHECK-NONEON-LABEL: cnt32_advsimd_2: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 
; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -86,7 +86,7 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt64_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -114,7 +114,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt32: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr w9, w0, #1 -; CHECK-NEXT: mov w8, #16843009 +; CHECK-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NEXT: and w9, w9, #0x55555555 ; CHECK-NEXT: sub w9, w0, w9 ; CHECK-NEXT: lsr w10, w9, #2 @@ -130,7 +130,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt32: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -155,7 +155,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt64: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x9, x0, #1 -; CHECK-NEXT: mov x8, #72340172838076673 +; CHECK-NEXT: mov x8, #72340172838076673 // =0x101010101010101 ; CHECK-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NEXT: sub x9, x0, x9 ; CHECK-NEXT: lsr x10, x9, #2 @@ -171,7 +171,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt64: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -278,5 +278,59 @@ define i1 
@ctpop32_ne_one(i32 %x) nounwind readnone { ret i1 %cmp } +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; CHECK-LABEL: ctpop32_eq_one_nonzero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w8, w0, #1 +; CHECK-NEXT: tst w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +; +; CHECK-NONEON-LABEL: ctpop32_eq_one_nonzero: +; CHECK-NONEON: // %bb.0: // %entry +; CHECK-NONEON-NEXT: sub w8, w0, #1 +; CHECK-NONEON-NEXT: tst w0, w8 +; CHECK-NONEON-NEXT: cset w0, eq +; CHECK-NONEON-NEXT: ret +; +; CHECK-CSSC-LABEL: ctpop32_eq_one_nonzero: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: sub w8, w0, #1 +; CHECK-CSSC-NEXT: tst w0, w8 +; CHECK-CSSC-NEXT: cset w0, eq +; CHECK-CSSC-NEXT: ret +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; CHECK-LABEL: ctpop32_ne_one_nonzero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w8, w0, #1 +; CHECK-NEXT: tst w0, w8 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +; +; CHECK-NONEON-LABEL: ctpop32_ne_one_nonzero: +; CHECK-NONEON: // %bb.0: // %entry +; CHECK-NONEON-NEXT: sub w8, w0, #1 +; CHECK-NONEON-NEXT: tst w0, w8 +; CHECK-NONEON-NEXT: cset w0, ne +; CHECK-NONEON-NEXT: ret +; +; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: sub w8, w0, #1 +; CHECK-CSSC-NEXT: tst w0, w8 +; CHECK-CSSC-NEXT: cset w0, ne +; CHECK-CSSC-NEXT: ret +entry: + %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} + declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index e24b1b41645cdf..4c52047b928f4d 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -1441,3 +1441,42 @@ define i32 @srai_slli2(i16 signext %0) { %3 = sext i16 %sext to i32 ret i32 %3 } 
+ +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; RV32I-LABEL: ctpop32_eq_one_nonzero: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop32_eq_one_nonzero: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: ret +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; RV32I-LABEL: ctpop32_ne_one_nonzero: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop32_ne_one_nonzero: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 +; RV32ZBB-NEXT: ret +entry: + %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 43a499806ab5ae..1e7814d588e4c0 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -1618,3 +1618,84 @@ entry: %5 = add nsw i32 %4, %0 ret i32 %5 } + +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; RV64I-LABEL: ctpop32_eq_one_nonzero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop32_eq_one_nonzero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: ret +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; RV64I-LABEL: ctpop32_ne_one_nonzero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a1, a0, -1 +; 
RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop32_ne_one_nonzero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 +; RV64ZBB-NEXT: ret +entry: + %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop64_eq_one_nonzero(i64 %x) { +; RV64I-LABEL: ctpop64_eq_one_nonzero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop64_eq_one_nonzero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: ret +entry: + %popcnt = call range(i64 1, 65) i64 @llvm.ctpop.i64(i64 %x) + %cmp = icmp eq i64 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_eq_one_maybezero(i32 %x) { +; RV64I-LABEL: ctpop32_eq_one_maybezero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop32_eq_one_maybezero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: ret +entry: + %popcnt = call range(i32 0, 16) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll index 8723432de8b6b0..649d257b28d762 100644 --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -102,7 +102,7 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; 
CHECK-AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -220,3 +220,44 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) { %r = icmp ne <4 x i64> %cnt, ret <4 x i1> %r } + + +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; CHECK-NOBMI-LABEL: ctpop32_eq_one_nonzero: +; CHECK-NOBMI: # %bb.0: # %entry +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax +; CHECK-NOBMI-NEXT: testl %eax, %edi +; CHECK-NOBMI-NEXT: sete %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: ctpop32_eq_one_nonzero: +; CHECK-BMI2: # %bb.0: # %entry +; CHECK-BMI2-NEXT: blsrl %edi, %eax +; CHECK-BMI2-NEXT: sete %al +; CHECK-BMI2-NEXT: retq +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; CHECK-NOBMI-LABEL: ctpop32_ne_one_nonzero: +; CHECK-NOBMI: # %bb.0: # %entry +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax +; CHECK-NOBMI-NEXT: testl %eax, %edi +; CHECK-NOBMI-NEXT: setne %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: ctpop32_ne_one_nonzero: +; CHECK-BMI2: # %bb.0: # %entry +; CHECK-BMI2-NEXT: blsrl %edi, %eax +; CHECK-BMI2-NEXT: setne %al +; CHECK-BMI2-NEXT: retq +entry: + 
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index ac41a3fe6bb7e4..6c0aaeb451e14a 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -555,9 +555,9 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X86-NEXT: por %xmm2, %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pxor %xmm1, %xmm0 -; X86-NEXT: pcmpgtd %xmm1, %xmm0 +; X86-NEXT: pand %xmm1, %xmm0 +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-NEXT: psrld $31, %xmm0 ; X86-NEXT: retl ; @@ -566,10 +566,10 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpsrld $31, %xmm0, %xmm0 ; X64-NEXT: retq %z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> ) %r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %z) From cbfcea1fc2154c92880278878610e16faba979be Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 20:18:54 -0500 Subject: [PATCH 109/177] [libc] Temporarily disable strerror test on NVPTX Summary: This is failing on the NVPTX buildbot, https://lab.llvm.org/buildbot/#/builders/69/builds/6997/. I cannot reproduce it locally so I'm disabling it temporarily so the bot is green. 
--- libc/test/src/string/CMakeLists.txt | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index c1caec5fd912c8..44535957e740be 100644 --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -215,16 +215,18 @@ add_libc_test( libc.src.errno.errno ) -add_libc_test( - strerror_test - SUITE - libc-string-tests - SRCS - strerror_test.cpp - DEPENDS - libc.src.string.strerror -) - +# FIXME: This is failing on the bot for some reason, disable for now. +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + add_libc_test( + strerror_test + SUITE + libc-string-tests + SRCS + strerror_test.cpp + DEPENDS + libc.src.string.strerror + ) +endif() add_libc_test( strerror_r_test From 9f8ae7844dee7bb5527a59249e74885fb3bfb4a9 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Fri, 11 Oct 2024 03:31:26 +0200 Subject: [PATCH 110/177] [lldb-dap] Implement value locations for function pointers (#104589) This commit adds `valueLocationReference` to function pointers and function references. Thereby, users can navigate directly to the pointed-to function from within the "variables" pane. In general, it would be useful to also a add similar location references also to member function pointers, `std::source_location`, `std::function`, and many more. Doing so would require extending the formatters to provide such a source code location. There were two RFCs about this a while ago: https://discourse.llvm.org/t/rfc-extending-formatters-with-a-source-code-reference/68375 https://discourse.llvm.org/t/rfc-sbvalue-metadata-provider/68377/26 However, both RFCs ended without a conclusion. As such, this commit now implements the lowest-hanging fruit, i.e. function pointers. If people find it useful, I will revive the RFC afterwards. 
--- .../API/tools/lldb-dap/locations/Makefile | 2 +- .../lldb-dap/locations/TestDAP_locations.py | 49 +++++++- lldb/test/API/tools/lldb-dap/locations/main.c | 5 - .../API/tools/lldb-dap/locations/main.cpp | 10 ++ lldb/tools/lldb-dap/JSONUtils.cpp | 41 ++++++- lldb/tools/lldb-dap/JSONUtils.h | 10 ++ lldb/tools/lldb-dap/lldb-dap.cpp | 115 ++++++++++++++---- 7 files changed, 192 insertions(+), 40 deletions(-) delete mode 100644 lldb/test/API/tools/lldb-dap/locations/main.c create mode 100644 lldb/test/API/tools/lldb-dap/locations/main.cpp diff --git a/lldb/test/API/tools/lldb-dap/locations/Makefile b/lldb/test/API/tools/lldb-dap/locations/Makefile index 10495940055b63..99998b20bcb050 100644 --- a/lldb/test/API/tools/lldb-dap/locations/Makefile +++ b/lldb/test/API/tools/lldb-dap/locations/Makefile @@ -1,3 +1,3 @@ -C_SOURCES := main.c +CXX_SOURCES := main.cpp include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py b/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py index 76d938d3908492..45f836a2fa3c39 100644 --- a/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py +++ b/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py @@ -19,11 +19,11 @@ def test_locations(self): """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) - source = "main.c" + source = "main.cpp" self.source_path = os.path.join(os.getcwd(), source) self.set_source_breakpoints( source, - [line_number(source, "// BREAK HERE")], + [line_number(source, "break here")], ) self.continue_to_next_stop() @@ -36,5 +36,46 @@ def test_locations(self): locals["var1"]["declarationLocationReference"] ) self.assertTrue(loc_var1["success"]) - self.assertTrue(loc_var1["body"]["source"]["path"].endswith("main.c")) - self.assertEqual(loc_var1["body"]["line"], 2) + self.assertTrue(loc_var1["body"]["source"]["path"].endswith("main.cpp")) + self.assertEqual(loc_var1["body"]["line"], 6) + + # func_ptr has both a declaration and a valueLocation 
+ self.assertIn("declarationLocationReference", locals["func_ptr"].keys()) + self.assertIn("valueLocationReference", locals["func_ptr"].keys()) + decl_loc_func_ptr = self.dap_server.request_locations( + locals["func_ptr"]["declarationLocationReference"] + ) + self.assertTrue(decl_loc_func_ptr["success"]) + self.assertTrue( + decl_loc_func_ptr["body"]["source"]["path"].endswith("main.cpp") + ) + self.assertEqual(decl_loc_func_ptr["body"]["line"], 7) + val_loc_func_ptr = self.dap_server.request_locations( + locals["func_ptr"]["valueLocationReference"] + ) + self.assertTrue(val_loc_func_ptr["success"]) + self.assertTrue(val_loc_func_ptr["body"]["source"]["path"].endswith("main.cpp")) + self.assertEqual(val_loc_func_ptr["body"]["line"], 3) + + # func_ref has both a declaration and a valueLocation + self.assertIn("declarationLocationReference", locals["func_ref"].keys()) + self.assertIn("valueLocationReference", locals["func_ref"].keys()) + decl_loc_func_ref = self.dap_server.request_locations( + locals["func_ref"]["declarationLocationReference"] + ) + self.assertTrue(decl_loc_func_ref["success"]) + self.assertTrue( + decl_loc_func_ref["body"]["source"]["path"].endswith("main.cpp") + ) + self.assertEqual(decl_loc_func_ref["body"]["line"], 8) + val_loc_func_ref = self.dap_server.request_locations( + locals["func_ref"]["valueLocationReference"] + ) + self.assertTrue(val_loc_func_ref["success"]) + self.assertTrue(val_loc_func_ref["body"]["source"]["path"].endswith("main.cpp")) + self.assertEqual(val_loc_func_ref["body"]["line"], 3) + + # `evaluate` responses for function pointers also have locations associated + eval_res = self.dap_server.request_evaluate("greet") + self.assertTrue(eval_res["success"]) + self.assertIn("valueLocationReference", eval_res["body"].keys()) diff --git a/lldb/test/API/tools/lldb-dap/locations/main.c b/lldb/test/API/tools/lldb-dap/locations/main.c deleted file mode 100644 index 6a8c86d00cb562..00000000000000 --- 
a/lldb/test/API/tools/lldb-dap/locations/main.c +++ /dev/null @@ -1,5 +0,0 @@ -int main(void) { - int var1 = 1; - // BREAK HERE - return 0; -} diff --git a/lldb/test/API/tools/lldb-dap/locations/main.cpp b/lldb/test/API/tools/lldb-dap/locations/main.cpp new file mode 100644 index 00000000000000..fb7789ffd86fdf --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/locations/main.cpp @@ -0,0 +1,10 @@ +#include + +void greet() { printf("Hello"); } + +int main(void) { + int var1 = 1; + void (*func_ptr)() = &greet; + void (&func_ref)() = greet; + return 0; // break here +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 558f889c4b7f23..e42a6d9d699804 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1223,6 +1223,25 @@ std::string VariableDescription::GetResult(llvm::StringRef context) { return description.trim().str(); } +bool ValuePointsToCode(lldb::SBValue v) { + if (!v.GetType().GetPointeeType().IsFunctionType()) + return false; + + lldb::addr_t addr = v.GetValueAsAddress(); + lldb::SBLineEntry line_entry = + g_dap.target.ResolveLoadAddress(addr).GetLineEntry(); + + return line_entry.IsValid(); +} + +int64_t PackLocation(int64_t var_ref, bool is_value_location) { + return var_ref << 1 | is_value_location; +} + +std::pair UnpackLocation(int64_t location_id) { + return std::pair{location_id >> 1, location_id & 1}; +} + // "Variable": { // "type": "object", // "description": "A Variable is a name/value pair. Optionally a variable @@ -1302,6 +1321,18 @@ std::string VariableDescription::GetResult(llvm::StringRef context) { // Object References' in the Overview section for // details." // }, +// "valueLocationReference": { +// "type": "integer", +// "description": "A reference that allows the client to request the +// location where the variable's value is declared. For +// example, if the variable contains a function pointer, +// the adapter may be able to look up the function's +// location. 
This should be present only if the adapter +// is likely to be able to resolve the location.\n\nThis +// reference shares the same lifetime as the +// `variablesReference`. See 'Lifetime of Object +// References' in the Overview section for details." +// }, // // "$__lldb_extensions": { // "description": "Unofficial extensions to the protocol", @@ -1415,7 +1446,11 @@ llvm::json::Value CreateVariable(lldb::SBValue v, int64_t var_ref, object.try_emplace("variablesReference", 0); if (v.GetDeclaration().IsValid()) - object.try_emplace("declarationLocationReference", var_ref); + object.try_emplace("declarationLocationReference", + PackLocation(var_ref, false)); + + if (ValuePointsToCode(v)) + object.try_emplace("valueLocationReference", PackLocation(var_ref, true)); if (lldb::addr_t addr = v.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) object.try_emplace("memoryReference", EncodeMemoryReference(addr)); @@ -1441,8 +1476,8 @@ CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request, llvm::StringRef comm_file, lldb::pid_t debugger_pid) { llvm::json::Object run_in_terminal_args; - // This indicates the IDE to open an embedded terminal, instead of opening the - // terminal in a new window. + // This indicates the IDE to open an embedded terminal, instead of opening + // the terminal in a new window. run_in_terminal_args.try_emplace("kind", "integrated"); auto launch_request_arguments = launch_request.getObject("arguments"); diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 18cfb4081fece1..54fc4323475723 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -480,6 +480,16 @@ struct VariableDescription { std::string GetResult(llvm::StringRef context); }; +/// Does the given variable have an associated value location? +bool ValuePointsToCode(lldb::SBValue v); + +/// Pack a location into a single integer which we can send via +/// the debug adapter protocol. 
+int64_t PackLocation(int64_t var_ref, bool is_value_location); + +/// Reverse of `PackLocation` +std::pair UnpackLocation(int64_t location_id); + /// Create a "Variable" object for a LLDB thread object. /// /// This function will fill in the following keys in the returned diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index ac18e8f24a4e39..a167088c8901ca 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1561,6 +1561,19 @@ void request_completions(const llvm::json::Object &request) { // client can use this optional information to // present the variables in a paged UI and fetch // them in chunks." +// }, +// "valueLocationReference": { +// "type": "integer", +// "description": "A reference that allows the client to request +// the location where the returned value is +// declared. For example, if a function pointer is +// returned, the adapter may be able to look up the +// function's location. This should be present only +// if the adapter is likely to be able to resolve +// the location.\n\nThis reference shares the same +// lifetime as the `variablesReference`. See +// 'Lifetime of Object References' in the +// Overview section for details." 
// } // "memoryReference": { // "type": "string", @@ -1647,16 +1660,19 @@ void request_evaluate(const llvm::json::Object &request) { VariableDescription desc(value); EmplaceSafeString(body, "result", desc.GetResult(context)); EmplaceSafeString(body, "type", desc.display_type_name); - if (value.MightHaveChildren()) { - auto variableReference = g_dap.variables.InsertVariable( + int64_t var_ref = 0; + if (value.MightHaveChildren() || ValuePointsToCode(value)) + var_ref = g_dap.variables.InsertVariable( value, /*is_permanent=*/context == "repl"); - body.try_emplace("variablesReference", variableReference); - } else { + if (value.MightHaveChildren()) + body.try_emplace("variablesReference", var_ref); + else body.try_emplace("variablesReference", (int64_t)0); - } if (lldb::addr_t addr = value.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) body.try_emplace("memoryReference", EncodeMemoryReference(addr)); + if (ValuePointsToCode(value)) + body.try_emplace("valueLocationReference", var_ref); } } response.try_emplace("body", std::move(body)); @@ -3770,6 +3786,17 @@ void request_threads(const llvm::json::Object &request) { // "description": "The number of indexed child variables. The client // can use this optional information to present the variables in a // paged UI and fetch them in chunks." +// }, +// "valueLocationReference": { +// "type": "integer", +// "description": "A reference that allows the client to request the +// location where the new value is declared. For example, if the new +// value is function pointer, the adapter may be able to look up the +// function's location. This should be present only if the adapter +// is likely to be able to resolve the location.\n\nThis reference +// shares the same lifetime as the `variablesReference`. See +// 'Lifetime of Object References' in the Overview section for +// details." 
// } // }, // "required": [ "value" ] @@ -3794,7 +3821,6 @@ void request_setVariable(const llvm::json::Object &request) { response.try_emplace("success", false); lldb::SBValue variable; - int64_t newVariablesReference = 0; // The "id" is the unique integer ID that is unique within the enclosing // variablesReference. It is optionally added to any "interface Variable" @@ -3824,14 +3850,17 @@ void request_setVariable(const llvm::json::Object &request) { // so always insert a new one to get its variablesReference. // is_permanent is false because debug console does not support // setVariable request. + int64_t new_var_ref = + g_dap.variables.InsertVariable(variable, /*is_permanent=*/false); if (variable.MightHaveChildren()) - newVariablesReference = - g_dap.variables.InsertVariable(variable, /*is_permanent=*/false); - body.try_emplace("variablesReference", newVariablesReference); - + body.try_emplace("variablesReference", new_var_ref); + else + body.try_emplace("variablesReference", 0); if (lldb::addr_t addr = variable.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) body.try_emplace("memoryReference", EncodeMemoryReference(addr)); + if (ValuePointsToCode(variable)) + body.try_emplace("valueLocationReference", new_var_ref); } else { EmplaceSafeString(body, "message", std::string(error.GetCString())); } @@ -4122,10 +4151,13 @@ void request_variables(const llvm::json::Object &request) { void request_locations(const llvm::json::Object &request) { llvm::json::Object response; FillResponse(request, response); - auto arguments = request.getObject("arguments"); + auto *arguments = request.getObject("arguments"); - uint64_t reference_id = GetUnsigned(arguments, "locationReference", 0); - lldb::SBValue variable = g_dap.variables.GetVariable(reference_id); + uint64_t location_id = GetUnsigned(arguments, "locationReference", 0); + // We use the lowest bit to distinguish between value location and declaration + // location + auto [var_ref, is_value_location] = 
UnpackLocation(location_id); + lldb::SBValue variable = g_dap.variables.GetVariable(var_ref); if (!variable.IsValid()) { response["success"] = false; response["message"] = "Invalid variable reference"; @@ -4133,21 +4165,50 @@ void request_locations(const llvm::json::Object &request) { return; } - // Get the declaration location - lldb::SBDeclaration decl = variable.GetDeclaration(); - if (!decl.IsValid()) { - response["success"] = false; - response["message"] = "No declaration location available"; - g_dap.SendJSON(llvm::json::Value(std::move(response))); - return; - } - llvm::json::Object body; - body.try_emplace("source", CreateSource(decl.GetFileSpec())); - if (int line = decl.GetLine()) - body.try_emplace("line", line); - if (int column = decl.GetColumn()) - body.try_emplace("column", column); + if (is_value_location) { + // Get the value location + if (!variable.GetType().IsPointerType() && + !variable.GetType().IsReferenceType()) { + response["success"] = false; + response["message"] = + "Value locations are only available for pointers and references"; + g_dap.SendJSON(llvm::json::Value(std::move(response))); + return; + } + + lldb::addr_t addr = variable.GetValueAsAddress(); + lldb::SBLineEntry line_entry = + g_dap.target.ResolveLoadAddress(addr).GetLineEntry(); + + if (!line_entry.IsValid()) { + response["success"] = false; + response["message"] = "Failed to resolve line entry for location"; + g_dap.SendJSON(llvm::json::Value(std::move(response))); + return; + } + + body.try_emplace("source", CreateSource(line_entry.GetFileSpec())); + if (int line = line_entry.GetLine()) + body.try_emplace("line", line); + if (int column = line_entry.GetColumn()) + body.try_emplace("column", column); + } else { + // Get the declaration location + lldb::SBDeclaration decl = variable.GetDeclaration(); + if (!decl.IsValid()) { + response["success"] = false; + response["message"] = "No declaration location available"; + g_dap.SendJSON(llvm::json::Value(std::move(response))); + 
return; + } + + body.try_emplace("source", CreateSource(decl.GetFileSpec())); + if (int line = decl.GetLine()) + body.try_emplace("line", line); + if (int column = decl.GetColumn()) + body.try_emplace("column", column); + } response.try_emplace("body", std::move(body)); g_dap.SendJSON(llvm::json::Value(std::move(response))); From 9882b35a3a3e46d749b801bd0b98c3d90af6006c Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 11 Oct 2024 10:18:40 +0800 Subject: [PATCH 111/177] [X86][StrictFP] Combine fcmp + select to fmin/fmax for some predicates (#109512) X86 maxss/minss etc. instructions won't turn SNaN to QNaN, so we can combine fcmp + select to them for some predicates. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 32 ++-- llvm/lib/Target/X86/X86ISelLowering.h | 4 + llvm/lib/Target/X86/X86InstrAVX512.td | 8 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 12 ++ llvm/lib/Target/X86/X86InstrSSE.td | 8 +- llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll | 149 +++++++++++++++++- 6 files changed, 195 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 77c10baa31bd21..7a6d20c6a121b6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34219,10 +34219,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMAXS) NODE_NAME_CASE(FMAX_SAE) NODE_NAME_CASE(FMAXS_SAE) + NODE_NAME_CASE(STRICT_FMAX) NODE_NAME_CASE(FMIN) NODE_NAME_CASE(FMINS) NODE_NAME_CASE(FMIN_SAE) NODE_NAME_CASE(FMINS_SAE) + NODE_NAME_CASE(STRICT_FMIN) NODE_NAME_CASE(FMAXC) NODE_NAME_CASE(FMINC) NODE_NAME_CASE(FRSQRT) @@ -46461,17 +46463,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 
- if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) && - (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + if ((Cond.getOpcode() == ISD::SETCC || + Cond.getOpcode() == ISD::STRICT_FSETCCS) && + VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 && + !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { - ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + bool IsStrict = Cond->isStrictFPOpcode(); + ISD::CondCode CC = + cast(Cond.getOperand(IsStrict ? 3 : 2))->get(); + SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0); + SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1); unsigned Opcode = 0; // Check for x CC y ? x : y. - if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && - DAG.isEqualTo(RHS, Cond.getOperand(1))) { + if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) { switch (CC) { default: break; case ISD::SETULT: @@ -46539,8 +46545,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, break; } // Check for x CC y ? y : x -- a min/max with reversed arms. - } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && - DAG.isEqualTo(RHS, Cond.getOperand(0))) { + } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) { switch (CC) { default: break; case ISD::SETOGE: @@ -46605,8 +46610,17 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } } - if (Opcode) + if (Opcode) { + if (IsStrict) { + SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? 
X86ISD::STRICT_FMIN + : X86ISD::STRICT_FMAX, + DL, {N->getValueType(0), MVT::Other}, + {Cond.getOperand(0), LHS, RHS}); + DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1)); + return Ret; + } return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); + } } // Some mask scalar intrinsics rely on checking if only one bit is set diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index a2515ff35e6925..3b1bd0ad9a267e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -850,6 +850,10 @@ namespace llvm { // Perform an FP80 add after changing precision control in FPCW. STRICT_FP80_ADD, + /// Floating point max and min. + STRICT_FMAX, + STRICT_FMIN, + // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE. diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index b9ff4a5280ec3e..98c31867e6b22b 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5395,7 +5395,7 @@ multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, - SDNode OpNode, SDNode VecNode, SDNode SaeNode, + SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode, X86FoldableSchedWrite sched, bit IsCommutable> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar opc, string OpcodeStr, SDPatternOperator T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; } -multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_binop_s_sae opc, string OpcodeStr, SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae; defm VDIV : avx512_binop_s_round<0x5E, "vdiv", any_fdiv, X86fdivs, X86fdivRnds, 
SchedWriteFDivSizes, 0>; -defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs, +defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs, SchedWriteFCmpSizes, 0>; -defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, +defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs, SchedWriteFCmpSizes, 0>; // MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index ed1bff05b7316c..c09522709d2f0d 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -46,6 +46,18 @@ def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def X86strict_fmin : SDNode<"X86ISD::STRICT_FMIN", SDTFPBinOp, + [SDNPHasChain]>; +def X86strict_fmax : SDNode<"X86ISD::STRICT_FMAX", SDTFPBinOp, + [SDNPHasChain]>; + +def X86any_fmin : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_fmin node:$src1, node:$src2), + (X86fmin node:$src1, node:$src2)]>; +def X86any_fmax : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_fmax node:$src1, node:$src2), + (X86fmax node:$src1, node:$src2)]>; + def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index d51125a209db9d..e77e56aa96c670 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2730,11 +2730,11 @@ let isCommutable = 0 in { defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; - defm MAX : basic_sse12_fp_binop_p<0x5F, "max", 
X86fmax, SchedWriteFCmpSizes>, - basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, + defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>, basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; - defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, - basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>, basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; } diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll index cb1876fee05aea..e3e2b6225a7ba0 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll @@ -4202,7 +4202,154 @@ define void @foo(float %0, float %1) #0 { } declare dso_local void @bar() -attributes #0 = { strictfp } +define float @fcmp_select_ogt(float %f1, float %f2) #0 { +; SSE-32-LABEL: fcmp_select_ogt: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %eax +; SSE-32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-32-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: movss %xmm0, (%esp) +; SSE-32-NEXT: flds (%esp) +; SSE-32-NEXT: wait +; SSE-32-NEXT: popl %eax +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: fcmp_select_ogt: +; SSE-64: # %bb.0: +; SSE-64-NEXT: maxss %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: fcmp_select_ogt: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %eax +; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: popl %eax +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: fcmp_select_ogt: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmaxss 
%xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; +; X87-LABEL: fcmp_select_ogt: +; X87: # %bb.0: +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: fcom %st(1) +; X87-NEXT: wait +; X87-NEXT: fnstsw %ax +; X87-NEXT: # kill: def $ah killed $ah killed $ax +; X87-NEXT: sahf +; X87-NEXT: ja .LBB57_2 +; X87-NEXT: # %bb.1: +; X87-NEXT: fstp %st(0) +; X87-NEXT: fldz +; X87-NEXT: fxch %st(1) +; X87-NEXT: .LBB57_2: +; X87-NEXT: fstp %st(1) +; X87-NEXT: wait +; X87-NEXT: retl +; +; X87-CMOV-LABEL: fcmp_select_ogt: +; X87-CMOV: # %bb.0: +; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: fcomi %st(1), %st +; X87-CMOV-NEXT: fxch %st(1) +; X87-CMOV-NEXT: fcmovnbe %st(1), %st +; X87-CMOV-NEXT: fstp %st(1) +; X87-CMOV-NEXT: wait +; X87-CMOV-NEXT: retl + %cond = call i1 @llvm.experimental.constrained.fcmps.f32( + float %f1, float %f2, metadata !"ogt", + metadata !"fpexcept.strict") + %res = select i1 %cond, float %f1, float %f2 + ret float %res +} + +define double @fcmp_select_ule(double %f1, double %f2) #0 { +; SSE-32-LABEL: fcmp_select_ule: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $8, %esp +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: minsd 8(%ebp), %xmm0 +; SSE-32-NEXT: movsd %xmm0, (%esp) +; SSE-32-NEXT: fldl (%esp) +; SSE-32-NEXT: wait +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: fcmp_select_ule: +; SSE-64: # %bb.0: +; SSE-64-NEXT: minsd %xmm0, %xmm1 +; SSE-64-NEXT: movapd %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: fcmp_select_ule: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $8, %esp +; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-32-NEXT: vminsd 8(%ebp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovsd %xmm0, (%esp) +; AVX-32-NEXT: fldl (%esp) +; 
AVX-32-NEXT: wait +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: fcmp_select_ule: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX-64-NEXT: retq +; +; X87-LABEL: fcmp_select_ule: +; X87: # %bb.0: +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fcom %st(1) +; X87-NEXT: wait +; X87-NEXT: fnstsw %ax +; X87-NEXT: # kill: def $ah killed $ah killed $ax +; X87-NEXT: sahf +; X87-NEXT: jbe .LBB58_2 +; X87-NEXT: # %bb.1: +; X87-NEXT: fstp %st(0) +; X87-NEXT: fldz +; X87-NEXT: fxch %st(1) +; X87-NEXT: .LBB58_2: +; X87-NEXT: fstp %st(1) +; X87-NEXT: wait +; X87-NEXT: retl +; +; X87-CMOV-LABEL: fcmp_select_ule: +; X87-CMOV: # %bb.0: +; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: fcomi %st(1), %st +; X87-CMOV-NEXT: fxch %st(1) +; X87-CMOV-NEXT: fcmovbe %st(1), %st +; X87-CMOV-NEXT: fstp %st(1) +; X87-CMOV-NEXT: wait +; X87-CMOV-NEXT: retl + %cond = call i1 @llvm.experimental.constrained.fcmps.f64( + double %f1, double %f2, metadata !"ule", + metadata !"fpexcept.strict") + %res = select i1 %cond, double %f1, double %f2 + ret double %res +} + +attributes #0 = { nounwind strictfp } declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) From 0bc02b999a9686ba240b7a68d3f1cbbf037d2170 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 11 Oct 2024 10:31:27 +0800 Subject: [PATCH 112/177] [Clang] Instantiate Typedefs referenced by type alias deduction guides (#111804) TypedefNameDecl referenced by a synthesized CTAD guide for type aliases was not transformed previously, resulting in a substitution failure in BuildDeductionGuideForTypeAlias() when substituting into the right-hand-side deduction guide. This patch fixes it in the way we have been doing since https://reviews.llvm.org/D80743. 
We transform all the function parameters, parenting referenced TypedefNameDecls with the CXXDeductionGuideDecl. Then we instantiate these declarations in FindInstantiatedDecl() as we build up the eventual deduction guide, using the mechanism introduced in D80743 Fixes #111508 --- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 21 ++++++++++++++++--- clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 13 ++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 545da21183c3c4..2d3e58548fb7ac 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -70,8 +70,8 @@ class ExtractTypeForDeductionGuide ExtractTypeForDeductionGuide( Sema &SemaRef, llvm::SmallVectorImpl &MaterializedTypedefs, - ClassTemplateDecl *NestedPattern, - const MultiLevelTemplateArgumentList *OuterInstantiationArgs) + ClassTemplateDecl *NestedPattern = nullptr, + const MultiLevelTemplateArgumentList *OuterInstantiationArgs = nullptr) : Base(SemaRef), MaterializedTypedefs(MaterializedTypedefs), NestedPattern(NestedPattern), OuterInstantiationArgs(OuterInstantiationArgs) { @@ -1228,10 +1228,25 @@ FunctionTemplateDecl *DeclareAggregateDeductionGuideForTypeAlias( getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate).first; if (!RHSTemplate) return nullptr; + + llvm::SmallVector TypedefDecls; + llvm::SmallVector NewParamTypes; + ExtractTypeForDeductionGuide TypeAliasTransformer(SemaRef, TypedefDecls); + for (QualType P : ParamTypes) { + QualType Type = TypeAliasTransformer.TransformType(P); + if (Type.isNull()) + return nullptr; + NewParamTypes.push_back(Type); + } + auto *RHSDeductionGuide = SemaRef.DeclareAggregateDeductionGuideFromInitList( - RHSTemplate, ParamTypes, Loc); + RHSTemplate, NewParamTypes, Loc); if (!RHSDeductionGuide) return nullptr; + + for (TypedefNameDecl *TD : TypedefDecls) + 
TD->setDeclContext(RHSDeductionGuide->getTemplatedDecl()); + return BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, RHSDeductionGuide, Loc); } diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 5392573fcdb9d5..675c32a81f1ae8 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -481,3 +481,16 @@ struct Out { Out::B out(100); // deduced to Out::A; static_assert(__is_same(decltype(out), Out::A)); } + +namespace GH111508 { + +template struct S { + using T = V; + T Data; +}; + +template using Alias = S; + +Alias A(42); + +} // namespace GH111508 From ec3e0a5900894c82e1763aa8597f47111edf6246 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 11:08:07 +0800 Subject: [PATCH 113/177] Revert "[CodeGenPrepare] Convert `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1`" (#111932) Reverts llvm/llvm-project#111284 to fix clang stage2 builds. Investigating... Failed buildbots: https://lab.llvm.org/buildbot/#/builders/76/builds/3576 https://lab.llvm.org/buildbot/#/builders/168/builds/4308 https://lab.llvm.org/buildbot/#/builders/127/builds/1087 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 28 -------- llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 68 ++----------------- llvm/test/CodeGen/RISCV/rv32zbb.ll | 39 ----------- llvm/test/CodeGen/RISCV/rv64zbb.ll | 81 ----------------------- llvm/test/CodeGen/X86/ispow2.ll | 45 +------------ llvm/test/CodeGen/X86/known-never-zero.ll | 12 ++-- 6 files changed, 15 insertions(+), 258 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 86f28293ba9ff8..3e09fbad6ab198 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2111,31 +2111,6 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { return false; } -/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. 
-/// This function converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` if the -/// result cannot be zero. -static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { - ICmpInst::Predicate Pred; - if (!match(Cmp, m_ICmp(Pred, m_Intrinsic(), m_One()))) - return false; - if (!ICmpInst::isEquality(Pred)) - return false; - auto *II = cast(Cmp->getOperand(0)); - - if (isKnownNonZero(II, DL)) { - if (Pred == ICmpInst::ICMP_EQ) { - Cmp->setOperand(1, ConstantInt::get(II->getType(), 2)); - Cmp->setPredicate(ICmpInst::ICMP_ULT); - } else { - Cmp->setPredicate(ICmpInst::ICMP_UGT); - } - return true; - } - return false; -} - bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -2155,9 +2130,6 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (foldFCmpToFPClassTest(Cmp, *TLI, *DL)) return true; - if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL)) - return true; - return false; } diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index 0030e9ce80abb4..f5ce73a366125b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -15,7 +15,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt32_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NONEON-NEXT: mov w8, #16843009 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -50,7 +50,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) { ; CHECK-NONEON-LABEL: cnt32_advsimd_2: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NONEON-NEXT: mov w8, #16843009 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 
; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -86,7 +86,7 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt64_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -114,7 +114,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt32: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr w9, w0, #1 -; CHECK-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NEXT: mov w8, #16843009 ; CHECK-NEXT: and w9, w9, #0x55555555 ; CHECK-NEXT: sub w9, w0, w9 ; CHECK-NEXT: lsr w10, w9, #2 @@ -130,7 +130,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt32: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NONEON-NEXT: mov w8, #16843009 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -155,7 +155,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt64: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x9, x0, #1 -; CHECK-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NEXT: mov x8, #72340172838076673 ; CHECK-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NEXT: sub x9, x0, x9 ; CHECK-NEXT: lsr x10, x9, #2 @@ -171,7 +171,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt64: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -278,59 +278,5 @@ define i1 
@ctpop32_ne_one(i32 %x) nounwind readnone { ret i1 %cmp } -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; CHECK-LABEL: ctpop32_eq_one_nonzero: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub w8, w0, #1 -; CHECK-NEXT: tst w0, w8 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret -; -; CHECK-NONEON-LABEL: ctpop32_eq_one_nonzero: -; CHECK-NONEON: // %bb.0: // %entry -; CHECK-NONEON-NEXT: sub w8, w0, #1 -; CHECK-NONEON-NEXT: tst w0, w8 -; CHECK-NONEON-NEXT: cset w0, eq -; CHECK-NONEON-NEXT: ret -; -; CHECK-CSSC-LABEL: ctpop32_eq_one_nonzero: -; CHECK-CSSC: // %bb.0: // %entry -; CHECK-CSSC-NEXT: sub w8, w0, #1 -; CHECK-CSSC-NEXT: tst w0, w8 -; CHECK-CSSC-NEXT: cset w0, eq -; CHECK-CSSC-NEXT: ret -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; CHECK-LABEL: ctpop32_ne_one_nonzero: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub w8, w0, #1 -; CHECK-NEXT: tst w0, w8 -; CHECK-NEXT: cset w0, ne -; CHECK-NEXT: ret -; -; CHECK-NONEON-LABEL: ctpop32_ne_one_nonzero: -; CHECK-NONEON: // %bb.0: // %entry -; CHECK-NONEON-NEXT: sub w8, w0, #1 -; CHECK-NONEON-NEXT: tst w0, w8 -; CHECK-NONEON-NEXT: cset w0, ne -; CHECK-NONEON-NEXT: ret -; -; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero: -; CHECK-CSSC: // %bb.0: // %entry -; CHECK-CSSC-NEXT: sub w8, w0, #1 -; CHECK-CSSC-NEXT: tst w0, w8 -; CHECK-CSSC-NEXT: cset w0, ne -; CHECK-CSSC-NEXT: ret -entry: - %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} - declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 4c52047b928f4d..e24b1b41645cdf 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -1441,42 +1441,3 @@ define i32 @srai_slli2(i16 signext %0) { %3 = sext i16 %sext to i32 ret i32 %3 } 
- -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; RV32I-LABEL: ctpop32_eq_one_nonzero: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi a1, a0, -1 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop32_eq_one_nonzero: -; RV32ZBB: # %bb.0: # %entry -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: ret -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; RV32I-LABEL: ctpop32_ne_one_nonzero: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi a1, a0, -1 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: snez a0, a0 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop32_ne_one_nonzero: -; RV32ZBB: # %bb.0: # %entry -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 -; RV32ZBB-NEXT: ret -entry: - %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 1e7814d588e4c0..43a499806ab5ae 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -1618,84 +1618,3 @@ entry: %5 = add nsw i32 %4, %0 ret i32 %5 } - -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; RV64I-LABEL: ctpop32_eq_one_nonzero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a1, a0, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: seqz a0, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop32_eq_one_nonzero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: ret -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; RV64I-LABEL: ctpop32_ne_one_nonzero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a1, a0, -1 -; 
RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: snez a0, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop32_ne_one_nonzero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: xori a0, a0, 1 -; RV64ZBB-NEXT: ret -entry: - %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop64_eq_one_nonzero(i64 %x) { -; RV64I-LABEL: ctpop64_eq_one_nonzero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a1, a0, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: seqz a0, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop64_eq_one_nonzero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpop a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: ret -entry: - %popcnt = call range(i64 1, 65) i64 @llvm.ctpop.i64(i64 %x) - %cmp = icmp eq i64 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_eq_one_maybezero(i32 %x) { -; RV64I-LABEL: ctpop32_eq_one_maybezero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addiw a1, a0, -1 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: sltu a0, a1, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop32_eq_one_maybezero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: seqz a0, a0 -; RV64ZBB-NEXT: ret -entry: - %popcnt = call range(i32 0, 16) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll index 649d257b28d762..8723432de8b6b0 100644 --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -102,7 +102,7 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; 
CHECK-AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -220,44 +220,3 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) { %r = icmp ne <4 x i64> %cnt, ret <4 x i1> %r } - - -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; CHECK-NOBMI-LABEL: ctpop32_eq_one_nonzero: -; CHECK-NOBMI: # %bb.0: # %entry -; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax -; CHECK-NOBMI-NEXT: testl %eax, %edi -; CHECK-NOBMI-NEXT: sete %al -; CHECK-NOBMI-NEXT: retq -; -; CHECK-BMI2-LABEL: ctpop32_eq_one_nonzero: -; CHECK-BMI2: # %bb.0: # %entry -; CHECK-BMI2-NEXT: blsrl %edi, %eax -; CHECK-BMI2-NEXT: sete %al -; CHECK-BMI2-NEXT: retq -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; CHECK-NOBMI-LABEL: ctpop32_ne_one_nonzero: -; CHECK-NOBMI: # %bb.0: # %entry -; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax -; CHECK-NOBMI-NEXT: testl %eax, %edi -; CHECK-NOBMI-NEXT: setne %al -; CHECK-NOBMI-NEXT: retq -; -; CHECK-BMI2-LABEL: ctpop32_ne_one_nonzero: -; CHECK-BMI2: # %bb.0: # %entry -; CHECK-BMI2-NEXT: blsrl %edi, %eax -; CHECK-BMI2-NEXT: setne %al -; CHECK-BMI2-NEXT: retq -entry: - 
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index 6c0aaeb451e14a..ac41a3fe6bb7e4 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -555,9 +555,9 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X86-NEXT: por %xmm2, %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pxor %xmm1, %xmm0 +; X86-NEXT: pcmpgtd %xmm1, %xmm0 ; X86-NEXT: psrld $31, %xmm0 ; X86-NEXT: retl ; @@ -566,10 +566,10 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpsrld $31, %xmm0, %xmm0 +; X64-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> ) %r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %z) From 126ed16525c92af1025a86b582c087d213b47145 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 20:30:04 -0700 Subject: [PATCH 114/177] [ARM] Fix formatting (NFC) I'm about to post a PR in this area. 
--- llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index b042ee69edd26c..b151a0116a9c41 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2531,7 +2531,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { bool RetVal = false; - DenseMap MI2LocMap; + DenseMap MI2LocMap; using MapIt = DenseMap>::iterator; using Base2InstMap = DenseMap>; using BaseVec = SmallVector; @@ -2570,7 +2570,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { Register Base = MI.getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); bool StopHere = false; - auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) { + auto FindBases = [&](Base2InstMap &Base2Ops, BaseVec &Bases) { MapIt BI = Base2Ops.find(Base); if (BI == Base2Ops.end()) { Base2Ops[Base].push_back(&MI); From 51e9430a0c767243411d4b81c284700f89719277 Mon Sep 17 00:00:00 2001 From: lntue Date: Thu, 10 Oct 2024 23:33:02 -0400 Subject: [PATCH 115/177] [libc][math] Improve performance of double precision trig functions. (#111793) - Improve the accuracy of fast pass' range reduction. - Provide tighter error estimations. - Reduce the table size when `LIBC_MATH_SMALL_TABLES` flag is set. 
--- libc/src/__support/FPUtil/double_double.h | 43 ++- libc/src/__support/macros/optimization.h | 10 + libc/src/math/generic/cos.cpp | 117 +++--- libc/src/math/generic/pow.cpp | 2 +- .../generic/range_reduction_double_common.h | 348 ++++++++++++------ .../math/generic/range_reduction_double_fma.h | 254 +++---------- .../generic/range_reduction_double_nofma.h | 253 +++---------- libc/src/math/generic/sin.cpp | 129 +++---- libc/src/math/generic/sincos.cpp | 155 ++++---- libc/src/math/generic/sincos_eval.h | 27 +- libc/src/math/generic/tan.cpp | 147 ++++---- libc/test/src/math/cos_test.cpp | 3 +- libc/test/src/math/sin_test.cpp | 12 +- libc/test/src/math/tan_test.cpp | 21 +- 14 files changed, 666 insertions(+), 855 deletions(-) diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index 25a4ee03387c67..db3c2c8a3d7a6e 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -18,6 +18,8 @@ namespace LIBC_NAMESPACE_DECL { namespace fputil { +#define DEFAULT_DOUBLE_SPLIT 27 + using DoubleDouble = LIBC_NAMESPACE::NumberPair; // The output of Dekker's FastTwoSum algorithm is correct, i.e.: @@ -61,7 +63,8 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) { // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed // Roundings," https://inria.hal.science/hal-04480440. // Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. -template LIBC_INLINE constexpr DoubleDouble split(double a) { +template +LIBC_INLINE constexpr DoubleDouble split(double a) { DoubleDouble r{0.0, 0.0}; // CN = 2^N. constexpr double CN = static_cast(1 << N); @@ -73,6 +76,22 @@ template LIBC_INLINE constexpr DoubleDouble split(double a) { return r; } +// Helper for non-fma exact mult where the first number is already split. 
+template +LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a, + double b) { + DoubleDouble bs = split(b); + DoubleDouble r{0.0, 0.0}; + + r.hi = a * b; + double t1 = as.hi * bs.hi - r.hi; + double t2 = as.hi * bs.lo + t1; + double t3 = as.lo * bs.hi + t2; + r.lo = as.lo * bs.lo + t3; + + return r; +} + // Note: When FMA instruction is not available, the `exact_mult` function is // only correct for round-to-nearest mode. See: // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed @@ -80,7 +99,7 @@ template LIBC_INLINE constexpr DoubleDouble split(double a) { // Using Theorem 1 in the paper above, without FMA instruction, if we restrict // the generated constants to precision <= 51, and splitting it by 2^28 + 1, // then a * b = r.hi + r.lo is exact for all rounding modes. -template +template LIBC_INLINE DoubleDouble exact_mult(double a, double b) { DoubleDouble r{0.0, 0.0}; @@ -90,18 +109,8 @@ LIBC_INLINE DoubleDouble exact_mult(double a, double b) { #else // Dekker's Product. 
DoubleDouble as = split(a); - DoubleDouble bs; - if constexpr (NO_FMA_ALL_ROUNDINGS) - bs = split<28>(b); - else - bs = split(b); - - r.hi = a * b; - double t1 = as.hi * bs.hi - r.hi; - double t2 = as.hi * bs.lo + t1; - double t3 = as.lo * bs.hi + t2; - r.lo = as.lo * bs.lo + t3; + r = exact_mult(as, a, b); #endif // LIBC_TARGET_CPU_HAS_FMA return r; @@ -113,10 +122,10 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { return r; } -template +template LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, const DoubleDouble &b) { - DoubleDouble r = exact_mult(a.hi, b.hi); + DoubleDouble r = exact_mult(a.hi, b.hi); double t1 = multiply_add(a.hi, b.lo, r.lo); double t2 = multiply_add(a.lo, b.hi, t1); r.lo = t2; @@ -157,8 +166,8 @@ LIBC_INLINE DoubleDouble div(const DoubleDouble &a, const DoubleDouble &b) { double e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi); double e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo); #else - DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); - DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); + DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); + DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); double e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo; double e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo; #endif // LIBC_TARGET_CPU_HAS_FMA diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h index 5ffd474d35c54d..41ecd2bd6d7191 100644 --- a/libc/src/__support/macros/optimization.h +++ b/libc/src/__support/macros/optimization.h @@ -48,6 +48,16 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) { #ifndef LIBC_MATH #define LIBC_MATH 0 +#else + +#if (LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) +#define LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#endif + +#if (LIBC_MATH & LIBC_MATH_SMALL_TABLES) +#define LIBC_MATH_HAS_SMALL_TABLES +#endif + #endif // LIBC_MATH #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H diff --git 
a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index e61d800ce2dada..923ea96852d889 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -17,17 +17,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. -#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_COS_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -42,22 +39,29 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16. if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-27 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) - return 1.0; - - // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2. - return fputil::round_result_slightly_down(1.0); + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-27 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) + return 1.0; + + // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2. 
+ return fputil::round_result_slightly_down(1.0); + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -70,70 +74,51 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { } // Large range reduction. - k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } DoubleDouble sin_y, cos_y; - generic::sincos_eval(y, sin_y, cos_y); + [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast((k + 128) & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble sin_k = get_idx_dd(k + 128); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table. + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 
64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble sin_k = get_idx_dd(k + 128); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // -sin(k * pi/128) = sin((k + 128) * pi/128) // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128) & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // LIBC_MATH_HAS_SMALL_TABLES // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. // Then cos(x) = cos((k * pi/128 + y) // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) - DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); - DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); + DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); + DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); DoubleDouble rr = fputil::exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi); rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; -#ifdef LIBC_MATH_COS_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS return rr.hi + rr.lo; #else - // Accurate test and pass for correctly rounded implementation. -#ifdef LIBC_TARGET_CPU_HAS_FMA - constexpr double ERR = 0x1.0p-70; -#else - // TODO: Improve non-FMA fast pass accuracy. 
- constexpr double ERR = 0x1.0p-66; -#endif // LIBC_TARGET_CPU_HAS_FMA - - double rlp = rr.lo + ERR; - double rlm = rr.lo - ERR; + double rlp = rr.lo + err; + double rlm = rr.lo - err; double r_upper = rr.hi + rlp; // (rr.lo + ERR); double r_lower = rr.hi + rlm; // (rr.lo - ERR); @@ -144,7 +129,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { Float128 u_f128, sin_u, cos_u; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -152,7 +137,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -172,7 +157,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { // https://github.com/llvm/llvm-project/issues/96452. 
return static_cast(r); -#endif // !LIBC_MATH_COS_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index 3a50e220154e51..181d3d40b3c9ad 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -398,7 +398,7 @@ LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { #else double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val(); dx = fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact - dx_c0 = fputil::exact_mult(COEFFS[0], dx); + dx_c0 = fputil::exact_mult<28>(dx, COEFFS[0]); // Exact #endif // LIBC_TARGET_CPU_HAS_FMA double dx2 = dx * dx; diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index 290b642be4c69f..e23bbff144bee8 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -17,150 +17,272 @@ #include "src/__support/common.h" #include "src/__support/integer_literals.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" -#ifdef LIBC_TARGET_CPU_HAS_FMA -#include "range_reduction_double_fma.h" - -// With FMA, we limit the maxmimum exponent to be 2^16, so that the error bound -// from the fma::range_reduction_small is bounded by 2^-88 instead of 2^-72. 
-#define FAST_PASS_EXPONENT 16 -using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI; -using LIBC_NAMESPACE::fma::range_reduction_small; -using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128; +namespace LIBC_NAMESPACE_DECL { -LIBC_INLINE constexpr bool NO_FMA = false; +#ifdef LIBC_TARGET_CPU_HAS_FMA +static constexpr unsigned SPLIT = DEFAULT_DOUBLE_SPLIT; #else -#include "range_reduction_double_nofma.h" +// When there is no-FMA instructions, in order to have exact product of 2 double +// precision with directional roundings, we need to lower the precision of the +// constants by at least 1 bit, and use a different splitting constant. +static constexpr unsigned SPLIT = 28; +#endif // LIBC_TARGET_CPU_HAS_FMA -using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT; -using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI; -using LIBC_NAMESPACE::nofma::range_reduction_small; -using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128; +using LIBC_NAMESPACE::fputil::DoubleDouble; +using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>; -LIBC_INLINE constexpr bool NO_FMA = true; -#endif // LIBC_TARGET_CPU_HAS_FMA +#define FAST_PASS_EXPONENT 16 -namespace LIBC_NAMESPACE_DECL { +// For 2^-7 < |x| < 2^16, return k and u such that: +// k = round(x * 128/pi) +// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo +// Error bound: +// |(x - k * pi/128) - (u_hi + u_lo)| <= max(ulp(ulp(u_hi)), 2^-119) +// <= 2^-111. +LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { + // Values of -pi/128 used for inputs with absolute value <= 2^16. + // The first 3 parts are generated with (53 - 21 = 32)-bit precision, so that + // the product k * MPI_OVER_128[i] is exact. 
+ // Generated by Sollya with: + // > display = hexadecimal!; + // > a = round(pi/128, 32, RN); + // > b = round(pi/128 - a, 32, RN); + // > c = round(pi/128 - a - b, D, RN); + // > print(-a, ",", -b, ",", -c); + constexpr double MPI_OVER_128[3] = {-0x1.921fb544p-6, -0x1.0b4611a6p-40, + -0x1.3198a2e037073p-75}; + constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5; + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D; + double kd = fputil::nearest_integer(prod_hi); -namespace generic { + // Let y = x - k * (pi/128) + // Then |y| < pi / 256 + // With extra rounding errors, we can bound |y| < 1.6 * 2^-7. + double y_hi = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact + // |u.hi| < 1.6*2^-7 + u.hi = fputil::multiply_add(kd, MPI_OVER_128[1], y_hi); + double u0 = y_hi - u.hi; // Exact + // |u.lo| <= max(ulp(u.hi), |kd * MPI_OVER_128[2]|) + double u1 = fputil::multiply_add(kd, MPI_OVER_128[1], u0); // Exact + u.lo = fputil::multiply_add(kd, MPI_OVER_128[2], u1); + // Error bound: + // |x - k * pi/128| - (u.hi + u.lo) <= ulp(u.lo) + // <= ulp(max(ulp(u.hi), kd*MPI_OVER_128[2])) + // <= 2^(-7 - 104) = 2^-111. 
-using LIBC_NAMESPACE::fputil::DoubleDouble; -using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>; + return static_cast(static_cast(kd)); +} -LIBC_INLINE constexpr Float128 PI_OVER_128_F128 = { - Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; +// Digits of 2^(16*i) / pi, generated by Sollya with: +// > procedure ulp(x, n) { return 2^(floor(log2(abs(x))) - n); }; +// > for i from 0 to 63 do { +// if i < 3 then { pi_inv = 0.25 + 2^(16*(i - 3)) / pi; } +// else { pi_inv = 2^(16*(i-3)) / pi; }; +// pn = nearestint(pi_inv); +// pi_frac = pi_inv - pn; +// a = round(pi_frac, 51, RN); +// b = round(pi_frac - a, 51, RN); +// c = round(pi_frac - a - b, 51, RN); +// d = round(pi_frac - a - b - c, D, RN); +// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); +// }; +// +// Notice that for [0..2] the leading bit of 2^(16*(i - 3)) / pi is very small, +// so we add 0.25 so that the conditions for the algorithms are still satisfied, +// and one of those conditions guarantees that ulp(0.25 * x_reduced) >= 2, and +// will safely be discarded. -// Note: The look-up tables ONE_TWENTY_EIGHT_OVER_PI is selected to be either -// from fma:: or nofma:: namespace. 
+static constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { + {0x1.0000000000014p5, 0x1.7cc1b727220a8p-49, 0x1.4fe13abe8fa9cp-101, + -0x1.911f924eb5336p-153}, + {0x1.0000000145f3p5, 0x1.b727220a94fep-49, 0x1.3abe8fa9a6eep-101, + 0x1.b6c52b3278872p-155}, + {0x1.000145f306dc8p5, 0x1.c882a53f84ebp-47, -0x1.70565911f925p-101, + 0x1.4acc9e21c821p-153}, + {0x1.45f306dc9c884p5, -0x1.5ac07b1505c14p-47, -0x1.96447e493ad4cp-99, + -0x1.b0ef1bef806bap-152}, + {-0x1.f246c6efab58p4, -0x1.ec5417056591p-49, -0x1.f924eb53361ep-101, + 0x1.c820ff28b1d5fp-153}, + {0x1.391054a7f09d4p4, 0x1.f47d4d377036cp-48, 0x1.8a5664f10e41p-100, + 0x1.fe5163abdebbcp-154}, + {0x1.529fc2757d1f4p2, 0x1.34ddc0db62958p-50, 0x1.93c439041fe5p-102, + 0x1.63abdebbc561bp-154}, + {-0x1.ec5417056591p-1, -0x1.f924eb53361ep-53, 0x1.c820ff28b1d6p-105, + -0x1.0a21d4f246dc9p-157}, + {-0x1.505c1596447e4p5, -0x1.275a99b0ef1cp-48, 0x1.07f9458eaf7bp-100, + -0x1.0ea79236e4717p-152}, + {-0x1.596447e493ad4p1, -0x1.9b0ef1bef806cp-52, 0x1.63abdebbc561cp-106, + -0x1.1b7238b7b645ap-159}, + {0x1.bb81b6c52b328p5, -0x1.de37df00d74e4p-49, 0x1.5ef5de2b0db94p-101, + -0x1.c8e2ded9169p-153}, + {0x1.b6c52b3278874p5, -0x1.f7c035d38a844p-47, 0x1.778ac36e48dc8p-99, + -0x1.6f6c8b47fe6dbp-152}, + {0x1.2b3278872084p5, -0x1.ae9c5421443a8p-50, -0x1.e48db91c5bdb4p-102, + 0x1.d2e006492eea1p-154}, + {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db8p-49, 0x1.2371d2126e97p-101, + 0x1.924bba8274648p-160}, + {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dcp-104, + 0x1.77504e8c90e7fp-157}, + {-0x1.ae9c5421443a8p-2, -0x1.e48db91c5bdb4p-54, 0x1.d2e006492eeap-106, + 0x1.3a32439fc3bd6p-159}, + {-0x1.38a84288753c8p5, -0x1.1b7238b7b645cp-47, 0x1.c00c925dd413cp-99, + -0x1.cdbc603c429c7p-151}, + {-0x1.0a21d4f246dc8p3, -0x1.c5bdb22d1ff9cp-50, 0x1.25dd413a32438p-103, + 0x1.fc3bd63962535p-155}, + {-0x1.d4f246dc8e2ep3, 0x1.26e9700324978p-49, -0x1.5f62e6de301e4p-102, + 0x1.eb1cb129a73efp-154}, + {-0x1.236e4716f6c8cp4, 0x1.700324977505p-49, 
-0x1.736f180f10a7p-101, + -0x1.a76b2c608bbeep-153}, + {0x1.b8e909374b8p4, 0x1.924bba8274648p-48, 0x1.cfe1deb1cb128p-102, + 0x1.a73ee88235f53p-154}, + {0x1.09374b801924cp4, -0x1.15f62e6de302p-50, 0x1.deb1cb129a74p-102, + -0x1.177dca0ad144cp-154}, + {-0x1.68ffcdb688afcp3, 0x1.d1921cfe1debp-50, 0x1.cb129a73ee884p-102, + -0x1.ca0ad144bb7b1p-154}, + {0x1.924bba8274648p0, 0x1.cfe1deb1cb128p-54, 0x1.a73ee88235f54p-106, + -0x1.144bb7b16639p-158}, + {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, + 0x1.213a671c09ad1p-160}, + {0x1.3a32439fc3bd8p1, -0x1.c69dacb1822fp-51, 0x1.1afa975da2428p-105, + -0x1.6638fd94ba082p-158}, + {-0x1.b78c0788538d4p4, 0x1.29a73ee88236p-50, -0x1.5a28976f62cc8p-103, + 0x1.c09ad17df904ep-156}, + {0x1.fc3bd63962534p5, 0x1.cfba208d7d4bcp-48, -0x1.12edec598e3f8p-100, + 0x1.ad17df904e647p-152}, + {-0x1.4e34ed658c118p2, 0x1.046bea5d7689p-51, 0x1.3a671c09ad17cp-104, + 0x1.f904e64758e61p-156}, + {0x1.62534e7dd1048p5, -0x1.415a28976f62cp-47, -0x1.8e3f652e8207p-100, + 0x1.3991d63983534p-154}, + {-0x1.63045df7282b4p4, -0x1.44bb7b16638fcp-50, -0x1.94ba081bec67p-102, + 0x1.d639835339f4ap-154}, + {0x1.d1046bea5d768p5, 0x1.213a671c09adp-48, 0x1.7df904e64759p-100, + -0x1.9f2b3182d8defp-152}, + {0x1.afa975da24274p3, 0x1.9c7026b45f7e4p-50, 0x1.3991d63983534p-106, + -0x1.82d8dee81d108p-160}, + {-0x1.a28976f62cc7p5, -0x1.fb29741037d8cp-47, -0x1.b8a719f2b3184p-100, + 0x1.272117e2ef7e5p-152}, + {-0x1.76f62cc71fb28p5, -0x1.741037d8cdc54p-47, 0x1.cc1a99cfa4e44p-101, + -0x1.d03a21036be27p-153}, + {0x1.d338e04d68bfp5, -0x1.bec66e29c67ccp-50, 0x1.339f49c845f8cp-102, + -0x1.081b5f13801dap-156}, + {0x1.c09ad17df905p4, -0x1.9b8a719f2b318p-48, -0x1.6c6f740e8840cp-103, + -0x1.af89c00ed0004p-155}, + {0x1.68befc827323cp5, -0x1.38cf9598c16c8p-47, 0x1.08bf177bf2508p-99, + -0x1.3801da00087eap-152}, + {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ffp-103, + 0x1.2fffbc0b301fep-155}, + {-0x1.8cdc538cf9598p5, -0x1.82d8dee81d108p-48, -0x1.b5f13801dap-104, 
+ -0x1.0fd33f8086877p-157}, + {-0x1.4e33e566305bp3, -0x1.bdd03a21036cp-49, 0x1.d8ffc4bffef04p-101, + -0x1.33f80868773a5p-153}, + {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f138p-52, -0x1.da00087e99fcp-104, + -0x1.0d0ee74a5f593p-158}, + {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086878p-109, + 0x1.8b5a0a6d1f6d3p-162}, + {0x1.3908bf177bf24p5, 0x1.0763ff12fffbcp-47, 0x1.6603fbcbc462cp-104, + 0x1.6829b47db4dap-156}, + {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f594p-110, + 0x1.1f6d367ecf27dp-162}, + {-0x1.081b5f13801dcp4, 0x1.fff7816603fbcp-48, 0x1.788c5ad05369p-101, + -0x1.25930261b069fp-155}, + {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee8p-50, 0x1.6b414da3eda6cp-103, + 0x1.fb3c9f2c26dd4p-156}, + {-0x1.c00ed00043f4cp5, -0x1.fc04343b9d298p-48, 0x1.4da3eda6cfdap-103, + -0x1.b069ec9161738p-155}, + {0x1.2fffbc0b301fcp5, 0x1.e5e2316b414dcp-47, -0x1.c125930261b08p-99, + 0x1.6136e9e8c7ecdp-151}, + {-0x1.0fd33f8086878p3, 0x1.8b5a0a6d1f6d4p-50, -0x1.30261b069ec9p-103, + -0x1.61738132c3403p-155}, + {-0x1.9fc04343b9d28p4, -0x1.7d64b824b2604p-48, -0x1.86c1a7b24585cp-101, + -0x1.c09961a015d29p-154}, + {-0x1.0d0ee74a5f594p2, 0x1.1f6d367ecf27cp-50, 0x1.6136e9e8c7eccp-103, + 0x1.3cbfd45aea4f7p-155}, + {-0x1.dce94beb25c14p5, 0x1.a6cfd9e4f9614p-47, -0x1.22c2e70265868p-100, + -0x1.5d28ad8453814p-158}, + {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b4p-104, + 0x1.d49eeb1faf97cp-156}, + {0x1.b47db4d9fb3c8p4, 0x1.f2c26dd3d18fcp-48, 0x1.9a797fa8b5d48p-100, + 0x1.eeb1faf97c5edp-152}, + {-0x1.25930261b06ap5, 0x1.36e9e8c7ecd3cp-47, 0x1.7fa8b5d49eebp-100, + 0x1.faf97c5ecf41dp-152}, + {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bcp-51, 0x1.aea4f758fd7ccp-103, + -0x1.d0985f18c10ebp-159}, + {-0x1.b069ec9161738p5, -0x1.32c3402ba515cp-51, 0x1.eeb1faf97c5ecp-104, + 0x1.e839cfbc52949p-157}, + {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41cp-104, + 0x1.cfbc529497536p-157}, + {-0x1.61738132c3404p5, 0x1.45aea4f758fd8p-47, -0x1.a0e84c2f8c608p-102, + 
-0x1.d6b5b45650128p-156}, + {0x1.fb34f2ff516bcp3, -0x1.6c229c0a0d074p-49, -0x1.30be31821d6b4p-104, + -0x1.b4565012813b8p-156}, + {0x1.3cbfd45aea4f8p5, -0x1.4e050683a130cp-48, 0x1.ce7de294a4ba8p-104, + 0x1.afed7ec47e357p-156}, + {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, + -0x1.3b81ca8bdea7fp-164}, + {-0x1.15b08a702834p5, -0x1.d0985f18c10ecp-47, 0x1.4a4ba9afed7ecp-100, + 0x1.1f8d5d0856033p-154}, +}; -// For large range |x| >= 2^32, we use the exponent of x to find 3 double-chunks -// of 128/pi c_hi, c_mid, c_lo such that: -// 1) ulp(round(x * c_hi, D, RN)) >= 256, +// For large range |x| >= 2^16, we perform the range reduction computations as: +// u = x - k * pi/128 = (pi/128) * (x * (128/pi) - k). +// We use the exponent of x to find 4 double-chunks of 128/pi: +// c_hi, c_mid, c_lo, c_lo_2 such that: +// 1) ulp(round(x * c_hi, D, RN)) >= 2^8 = 256, // 2) If x * c_hi = ph_hi + ph_lo and x * c_mid = pm_hi + pm_lo, then // min(ulp(ph_lo), ulp(pm_hi)) >= 2^-53. -// 3) ulp(round(x * c_lo, D, RN)) <= 2^-7x. -// This will allow us to do quick computations as: -// (x * 256/pi) ~ x * (c_hi + c_mid + c_lo) (mod 256) -// ~ ph_lo + pm_hi + pm_lo + (x * c_lo) +// This will allow us to drop the high part ph_hi and the addition: +// (ph_lo + pm_hi) mod 1 +// can be exactly representable in a double precision. +// This will allow us to do split the computations as: +// (x * 256/pi) ~ x * (c_hi + c_mid + c_lo + c_lo_2) (mod 256) +// ~ (ph_lo + pm_hi) + (pm_lo + x * c_lo) + x * c_lo_2. // Then, // round(x * 128/pi) = round(ph_lo + pm_hi) (mod 256) // And the high part of fractional part of (x * 128/pi) can simply be: // {x * 128/pi}_hi = {ph_lo + pm_hi}. // To prevent overflow when x is very large, we simply scale up -// (c_hi, c_mid, c_lo) by a fixed power of 2 (based on the index) and scale down -// x by the same amount. - -template struct LargeRangeReduction { - // Calculate the high part of the range reduction exactly. 
- LIBC_INLINE unsigned compute_high_part(double x) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - // TODO: The extra exponent gap of 62 below can be reduced a bit for non-FMA - // with a more careful analysis, which in turn will reduce the error bound - // for non-FMA - int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62); - idx = static_cast((x_e_m62 >> 4) + 3); - // Scale x down by 2^(-(16 * (idx - 3)) - xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62); - // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 - x_reduced = xbits.get_val(); - // x * c_hi = ph.hi + ph.lo exactly. - DoubleDouble ph = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); - // x * c_mid = pm.hi + pm.lo exactly. - DoubleDouble pm = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); - // Extract integral parts and fractional parts of (ph.lo + pm.hi). - double kh = fputil::nearest_integer(ph.lo); - double ph_lo_frac = ph.lo - kh; // Exact - double km = fputil::nearest_integer(pm.hi + ph_lo_frac); - double pm_hi_frac = pm.hi - km; // Exact - // x * 128/pi mod 1 ~ y_hi + y_lo - y_hi = ph_lo_frac + pm_hi_frac; // Exact - pm_lo = pm.lo; - return static_cast(static_cast(kh) + - static_cast(km)); - } +// (c_hi, c_mid, c_lo, c_lo_2) by a fixed power of 2 (based on the index) and +// scale down x by the same amount. 
- LIBC_INLINE DoubleDouble fast() const { - // y_lo = x * c_lo + pm.lo - double y_lo = fputil::multiply_add(x_reduced, - ONE_TWENTY_EIGHT_OVER_PI[idx][2], pm_lo); - DoubleDouble y = fputil::exact_add(y_hi, y_lo); - - // Digits of pi/128, generated by Sollya with: - // > a = round(pi/128, D, RN); - // > b = round(pi/128 - a, D, RN); - constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, - 0x1.921fb54442d18p-6}; - - // Error bound: with {a} denote the fractional part of a, i.e.: - // {a} = a - round(a) - // Then, - // | {x * 128/pi} - (y_hi + y_lo) | < 2 * ulp(x_reduced * - // * ONE_TWENTY_EIGHT_OVER_PI[idx][2]) - // For FMA: - // | {x * 128/pi} - (y_hi + y_lo) | <= 2 * 2^77 * 2^-103 * 2^-52 - // = 2^-77. - // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-77. - // = 2^-82. - // For non-FMA: - // | {x * 128/pi} - (y_hi + y_lo) | <= 2 * 2^77 * 2^-99 * 2^-52 - // = 2^-73. - // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-73. - // = 2^-78. - return fputil::quick_mult(y, PI_OVER_128_DD); - } +struct LargeRangeReduction { + + // To be implemented in range_reduction_double_fma.h and + // range_reduction_double_nofma.h. 
+ unsigned fast(double x, DoubleDouble &u); +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS LIBC_INLINE Float128 accurate() const { + constexpr Float128 PI_OVER_128_F128 = { + Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; + // y_lo = x * c_lo + pm.lo Float128 y_lo_0(x_reduced * ONE_TWENTY_EIGHT_OVER_PI[idx][3]); - Float128 y_lo_1 = fputil::quick_mul( - Float128(x_reduced), Float128(ONE_TWENTY_EIGHT_OVER_PI[idx][2])); - Float128 y_lo_2(pm_lo); - Float128 y_hi_f128(y_hi); - - Float128 y = fputil::quick_add( - y_hi_f128, - fputil::quick_add(y_lo_2, fputil::quick_add(y_lo_1, y_lo_0))); + Float128 y_lo_1 = fputil::quick_add(Float128(y_lo), y_lo_0); + Float128 y_mid_f128 = fputil::quick_add(Float128(y_mid.lo), y_lo_1); + Float128 y_hi_f128 = fputil::quick_add(Float128(y_hi), Float128(y_mid.hi)); + Float128 y = fputil::quick_add(y_hi_f128, y_mid_f128); return fputil::quick_mul(y, PI_OVER_128_F128); } +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS private: // Index of x in the look-up table ONE_TWENTY_EIGHT_OVER_PI. unsigned idx; // x scaled down by 2^(-16 *(idx - 3))). double x_reduced; - // High part of (x * 128/pi) mod 1. - double y_hi; - // Low part of x * ONE_TWENTY_EIGHT_OVER_PI[idx][1]. - double pm_lo; + // Parts of (x * 128/pi) mod 1. 
+ double y_hi, y_lo; + DoubleDouble y_mid; }; -LIBC_INLINE Float128 range_reduction_small_f128(double x) { - double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI[3][0]; +static Float128 range_reduction_small_f128(double x) { + constexpr Float128 PI_OVER_128_F128 = { + Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; + constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5; + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D; double kd = fputil::nearest_integer(prod_hi); Float128 mk_f128(-kd); @@ -178,7 +300,8 @@ LIBC_INLINE Float128 range_reduction_small_f128(double x) { return fputil::quick_mul(y, PI_OVER_128_F128); } -LIBC_INLINE constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +static constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, 0, 0}, {Sign::POS, -133, 0xc90a'afbd'1b33'efc9'c539'edcb'fda0'cf2c_u128}, {Sign::POS, -132, 0xc8fb'2f88'6ec0'9f37'6a17'954b'2b7c'5171_u128}, @@ -245,8 +368,7 @@ LIBC_INLINE constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, -128, 0xffec'4304'2668'65d9'5657'5523'6696'1732_u128}, {Sign::POS, 0, 1}, }; - -} // namespace generic +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/range_reduction_double_fma.h b/libc/src/math/generic/range_reduction_double_fma.h index 7448b5f63dfde2..cab031c28baa17 100644 --- a/libc/src/math/generic/range_reduction_double_fma.h +++ b/libc/src/math/generic/range_reduction_double_fma.h @@ -15,174 +15,62 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/math/generic/range_reduction_double_common.h" namespace LIBC_NAMESPACE_DECL { -namespace fma { - using LIBC_NAMESPACE::fputil::DoubleDouble; -LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 32; +LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) { + 
using FPBits = typename fputil::FPBits; + FPBits xbits(x); -// Digits of 2^(16*i) / pi, generated by Sollya with: -// For [2..62]: -// > for i from 3 to 63 do { -// pi_inv = 2^(16*(i - 3)) / pi; -// pn = nearestint(pi_inv); -// pi_frac = pi_inv - pn; -// a = round(pi_frac, D, RN); -// b = round(pi_frac - a, D, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); -// }; -// For [0..1]: -// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that -// the conditions for the algorithms are still satisfied, and one of those -// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be -// discarded. -// for i from 0 to 2 do { -// pi_frac = 0.25 + 2^(16*(i - 3)) / pi; -// a = round(pi_frac, D, RN); -// b = round(pi_frac - a, D, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); -// }; -// For The fast pass using double-double, we only need 3 parts (a, b, c), but -// for the accurate pass using Float128, instead of using another table of -// Float128s, we simply add the fourth path (a, b, c, d), which simplify the -// implementation a bit and saving some memory. 
-LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { - {0x1.0000000000014p5, 0x1.7cc1b727220a9p-49, 0x1.3f84eafa3ea6ap-103, - -0x1.11f924eb53362p-157}, - {0x1.0000000145f3p5, 0x1.b727220a94fe1p-49, 0x1.d5f47d4d37703p-104, - 0x1.b6295993c439p-158}, - {0x1.000145f306dcap5, -0x1.bbead603d8a83p-50, 0x1.f534ddc0db629p-106, - 0x1.664f10e4107f9p-160}, - {0x1.45f306dc9c883p5, -0x1.6b01ec5417056p-49, -0x1.6447e493ad4cep-103, - 0x1.e21c820ff28b2p-157}, - {-0x1.f246c6efab581p4, 0x1.3abe8fa9a6eep-53, 0x1.b6c52b3278872p-107, - 0x1.07f9458eaf7afp-164}, - {0x1.391054a7f09d6p4, -0x1.70565911f924fp-53, 0x1.2b3278872084p-107, - -0x1.ae9c5421443aap-162}, - {0x1.529fc2757d1f5p2, 0x1.a6ee06db14acdp-53, -0x1.8778df7c035d4p-107, - 0x1.d5ef5de2b0db9p-161}, - {-0x1.ec54170565912p-1, 0x1.b6c52b3278872p-59, 0x1.07f9458eaf7afp-116, - -0x1.d4f246dc8e2dfp-173}, - {-0x1.505c1596447e5p5, 0x1.b14acc9e21c82p-49, 0x1.fe5163abdebbcp-106, - 0x1.586dc91b8e909p-160}, - {-0x1.596447e493ad5p1, 0x1.93c439041fe51p-54, 0x1.8eaf7aef1586ep-108, - -0x1.b7238b7b645a4p-163}, - {0x1.bb81b6c52b328p5, -0x1.de37df00d74e3p-49, 0x1.7bd778ac36e49p-103, - -0x1.1c5bdb22d1ffap-158}, - {0x1.b6c52b3278872p5, 0x1.07f9458eaf7afp-52, -0x1.d4f246dc8e2dfp-109, - 0x1.374b801924bbbp-164}, - {0x1.2b3278872084p5, -0x1.ae9c5421443aap-50, 0x1.b7246e3a424ddp-106, - 0x1.700324977504fp-161}, - {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db9p-49, 0x1.1b8e909374b8p-104, - 0x1.924bba8274648p-160}, - {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dbp-104, - -0x1.115f62e6de302p-158}, - {-0x1.ae9c5421443aap-2, 0x1.b7246e3a424ddp-58, 0x1.700324977504fp-113, - -0x1.cdbc603c429c7p-167}, - {-0x1.38a84288753c9p5, -0x1.b7238b7b645a4p-51, 0x1.924bba8274648p-112, - 0x1.cfe1deb1cb12ap-166}, - {-0x1.0a21d4f246dc9p3, 0x1.d2126e9700325p-53, -0x1.a22bec5cdbc6p-107, - -0x1.e214e34ed658cp-162}, - {-0x1.d4f246dc8e2dfp3, 0x1.374b801924bbbp-52, -0x1.f62e6de301e21p-106, - -0x1.38d3b5963045ep-160}, - {-0x1.236e4716f6c8bp4, 
-0x1.1ff9b6d115f63p-50, 0x1.921cfe1deb1cbp-106, - 0x1.29a73ee88235fp-162}, - {0x1.b8e909374b802p4, -0x1.b6d115f62e6dep-50, -0x1.80f10a71a76b3p-105, - 0x1.cfba208d7d4bbp-160}, - {0x1.09374b801924cp4, -0x1.15f62e6de301ep-50, -0x1.0a71a76b2c609p-105, - 0x1.1046bea5d7689p-159}, - {-0x1.68ffcdb688afbp3, -0x1.736f180f10a72p-53, 0x1.62534e7dd1047p-107, - -0x1.0568a25dbd8b3p-161}, - {0x1.924bba8274648p0, 0x1.cfe1deb1cb12ap-54, -0x1.63045df7282b4p-108, - -0x1.44bb7b16638fep-162}, - {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, - 0x1.213a671c09ad1p-160}, - {0x1.3a32439fc3bd6p1, 0x1.cb129a73ee882p-54, 0x1.afa975da24275p-109, - -0x1.8e3f652e8207p-164}, - {-0x1.b78c0788538d4p4, 0x1.29a73ee88235fp-50, 0x1.4baed1213a672p-104, - -0x1.fb29741037d8dp-159}, - {0x1.fc3bd63962535p5, -0x1.822efb9415a29p-51, 0x1.a24274ce38136p-105, - -0x1.741037d8cdc54p-159}, - {-0x1.4e34ed658c117p2, -0x1.f7282b4512edfp-52, 0x1.d338e04d68bfp-107, - -0x1.bec66e29c67cbp-162}, - {0x1.62534e7dd1047p5, -0x1.0568a25dbd8b3p-49, -0x1.c7eca5d040df6p-105, - -0x1.9b8a719f2b318p-160}, - {-0x1.63045df7282b4p4, -0x1.44bb7b16638fep-50, 0x1.ad17df904e647p-104, - 0x1.639835339f49dp-158}, - {0x1.d1046bea5d769p5, -0x1.bd8b31c7eca5dp-49, -0x1.037d8cdc538dp-107, - 0x1.a99cfa4e422fcp-161}, - {0x1.afa975da24275p3, -0x1.8e3f652e8207p-52, 0x1.3991d63983534p-106, - -0x1.82d8dee81d108p-160}, - {-0x1.a28976f62cc72p5, 0x1.35a2fbf209cc9p-53, -0x1.4e33e566305b2p-109, - 0x1.08bf177bf2507p-163}, - {-0x1.76f62cc71fb29p5, -0x1.d040df633714ep-49, -0x1.9f2b3182d8defp-104, - 0x1.f8bbdf9283b2p-158}, - {0x1.d338e04d68bfp5, -0x1.bec66e29c67cbp-50, 0x1.9cfa4e422fc5ep-105, - -0x1.036be27003b4p-161}, - {0x1.c09ad17df904ep4, 0x1.91d639835339fp-50, 0x1.272117e2ef7e5p-104, - -0x1.7c4e007680022p-158}, - {0x1.68befc827323bp5, -0x1.c67cacc60b638p-50, 0x1.17e2ef7e4a0ecp-104, - 0x1.ff897ffde0598p-158}, - {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ff1p-103, - 0x1.7ffde05980fefp-158}, - {-0x1.8cdc538cf9599p5, 
0x1.f49c845f8bbep-50, -0x1.b5f13801da001p-104, - 0x1.e05980fef2f12p-158}, - {-0x1.4e33e566305b2p3, 0x1.08bf177bf2507p-51, 0x1.8ffc4bffef02dp-105, - -0x1.fc04343b9d298p-160}, - {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f13802p-52, 0x1.2fffbc0b301fep-107, - -0x1.a1dce94beb25cp-163}, - {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086877p-109, - -0x1.d297d64b824b2p-164}, - {0x1.3908bf177bf25p5, 0x1.d8ffc4bffef03p-53, -0x1.9fc04343b9d29p-108, - -0x1.f592e092c9813p-162}, - {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f593p-110, - 0x1.f6d367ecf27cbp-166}, - {-0x1.081b5f13801dap4, -0x1.0fd33f8086877p-61, -0x1.d297d64b824b2p-116, - -0x1.8130d834f648bp-170}, - {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee7p-50, -0x1.297d64b824b26p-104, - -0x1.30d834f648b0cp-162}, - {-0x1.c00ed00043f4dp5, 0x1.fde5e2316b415p-55, -0x1.2e092c98130d8p-110, - -0x1.a7b24585ce04dp-165}, - {0x1.2fffbc0b301fep5, -0x1.a1dce94beb25cp-51, -0x1.25930261b069fp-107, - 0x1.b74f463f669e6p-162}, - {-0x1.0fd33f8086877p3, -0x1.d297d64b824b2p-52, -0x1.8130d834f648bp-106, - -0x1.738132c3402bap-163}, - {-0x1.9fc04343b9d29p4, -0x1.f592e092c9813p-50, -0x1.b069ec9161738p-107, - -0x1.32c3402ba515bp-163}, - {-0x1.0d0ee74a5f593p2, 0x1.f6d367ecf27cbp-54, 0x1.36e9e8c7ecd3dp-111, - -0x1.00ae9456c229cp-165}, - {-0x1.dce94beb25c12p5, -0x1.64c0986c1a7b2p-49, -0x1.161738132c34p-103, - -0x1.5d28ad8453814p-158}, - {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b6p-104, - -0x1.5b08a7028341dp-159}, - {0x1.b47db4d9fb3cap4, -0x1.a7b24585ce04dp-53, 0x1.3cbfd45aea4f7p-107, - 0x1.63f5f2f8bd9e8p-161}, - {-0x1.25930261b069fp5, 0x1.b74f463f669e6p-50, -0x1.5d28ad8453814p-110, - -0x1.a0e84c2f8c608p-166}, - {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bap-51, -0x1.456c229c0a0dp-105, - -0x1.d0985f18c10ebp-159}, - {-0x1.b069ec9161738p5, -0x1.32c3402ba515bp-51, -0x1.14e050683a131p-108, - 0x1.0739f78a5292fp-162}, - {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41dp-104, - -0x1.821d6b5b4565p-160}, - 
{-0x1.61738132c3403p5, 0x1.16ba93dd63f5fp-49, 0x1.7c5ecf41ce7dep-104, - 0x1.4a525d4d7f6bfp-159}, - {0x1.fb34f2ff516bbp3, -0x1.b08a7028341d1p-51, 0x1.9e839cfbc5295p-105, - -0x1.a2b2809409dc1p-159}, - {0x1.3cbfd45aea4f7p5, 0x1.63f5f2f8bd9e8p-49, 0x1.ce7de294a4baap-104, - -0x1.404a04ee072a3p-158}, - {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, - -0x1.3b81ca8bdea7fp-164}, - {-0x1.15b08a7028342p5, 0x1.7b3d0739f78a5p-50, 0x1.497535fdafd89p-105, - -0x1.ca8bdea7f33eep-164}, -}; + int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62); + idx = static_cast((x_e_m62 >> 4) + 3); + // Scale x down by 2^(-(16 * (idx - 3)) + xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62); + // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 + x_reduced = xbits.get_val(); + // x * c_hi = ph.hi + ph.lo exactly. + DoubleDouble ph = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + // x * c_mid = pm.hi + pm.lo exactly. + DoubleDouble pm = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + // x * c_lo = pl.hi + pl.lo exactly. + DoubleDouble pl = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]); + // Extract integral parts and fractional parts of (ph.lo + pm.hi). 
+ double sum_hi = ph.lo + pm.hi; + double kd = fputil::nearest_integer(sum_hi); + + // x * 128/pi mod 1 ~ y_hi + y_mid + y_lo + y_hi = (ph.lo - kd) + pm.hi; // Exact + y_mid = fputil::exact_add(pm.lo, pl.hi); + y_lo = pl.lo; + + // y_l = x * c_lo_2 + pl.lo + double y_l = + fputil::multiply_add(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][3], y_lo); + DoubleDouble y = fputil::exact_add(y_hi, y_mid.hi); + y.lo += (y_mid.lo + y_l); + + // Digits of pi/128, generated by Sollya with: + // > a = round(pi/128, D, RN); + // > b = round(pi/128 - a, D, RN); + constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, + 0x1.921fb54442d18p-6}; + + // Error bound: with {a} denote the fractional part of a, i.e.: + // {a} = a - round(a) + // Then, + // | {x * 128/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-105 + // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-105 = 2^-110 + u = fputil::quick_mult(y, PI_OVER_128_DD); + + return static_cast(static_cast(kd)); +} // Lookup table for sin(k * pi / 128) with k = 0, ..., 255. 
// Table is generated with Sollya as follow: @@ -258,6 +146,7 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {-0x1.c57bc2e24aa15p-57, 0x1.ff621e3796d7ep-1}, {-0x1.1354d4556e4cbp-55, 0x1.ffd886084cd0dp-1}, {0, 1}, +#ifndef LIBC_MATH_HAS_SMALL_TABLES {-0x1.1354d4556e4cbp-55, 0x1.ffd886084cd0dp-1}, {-0x1.c57bc2e24aa15p-57, 0x1.ff621e3796d7ep-1}, {0x1.521ecd0c67e35p-57, 0x1.fe9cdad01883ap-1}, @@ -449,48 +338,9 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {0x1.9a088a8bf6b2cp-59, -0x1.2d52092ce19f6p-4}, {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, {0x1.b1d63091a013p-64, -0x1.92155f7a3667ep-6}, +#endif // !LIBC_MATH_HAS_SMALL_TABLES }; -// For |x| < 2^-32, return k and u such that: -// k = round(x * 128/pi) -// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo -LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { - // Digits of pi/128, generated by Sollya with: - // > a = round(pi/128, D, RN); - // > b = round(pi/128 - a, D, RN); - constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, - 0x1.921fb54442d18p-6}; - - double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI[3][0]; - double kd = fputil::nearest_integer(prod_hi); - - // Let y = x - k * (pi/128) - // Then |y| < pi / 256 - // With extra rounding errors, we can bound |y| < 2^-6. - double y_hi = fputil::multiply_add(kd, -PI_OVER_128_DD.hi, x); // Exact - // u_hi + u_lo ~ (y_hi + kd*(-PI_OVER_128_DD[1])) - // and |u_lo| < 2* ulp(u_hi) - // The upper bound 2^-6 is over-estimated, we should still have: - // |u_hi + u_lo| < 2^-6. 
- u.hi = fputil::multiply_add(kd, -PI_OVER_128_DD.lo, y_hi); - u.lo = y_hi - u.hi; // Exact; - u.lo = fputil::multiply_add(kd, -PI_OVER_128_DD.lo, u.lo); - // Error bound: - // For |x| < 2^32: - // |x * high part of 128/pi| < 2^32 * 2^6 = 2^38 - // So |k| = |round(x * high part of 128/pi)| < 2^38 - // And hence, - // |(x mod pi/128) - (u.hi + u.lo)| <= ulp(2 * kd * PI_OVER_128_DD.lo) - // < 2 * 2^38 * 2^-59 * 2^-52 - // = 2^-72 - // Note: if we limit the input exponent to the same as in non-FMA version, - // i.e., |x| < 2^-23, then the output errors can be bounded by 2^-81, similar - // to the large range reduction bound. - return static_cast(static_cast(kd)); -} - -} // namespace fma - } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_FMA_H diff --git a/libc/src/math/generic/range_reduction_double_nofma.h b/libc/src/math/generic/range_reduction_double_nofma.h index 445a45d3f9796a..56407329477989 100644 --- a/libc/src/math/generic/range_reduction_double_nofma.h +++ b/libc/src/math/generic/range_reduction_double_nofma.h @@ -15,174 +15,63 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/math/generic/range_reduction_double_common.h" namespace LIBC_NAMESPACE_DECL { -namespace nofma { - using fputil::DoubleDouble; -LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 23; +LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); -// Digits of 2^(16*i) / pi, generated by Sollya with: -// For [2..62]: -// > for i from 3 to 63 do { -// pi_inv = 2^(16*(i - 3)) / pi; -// pn = nearestint(pi_inv); -// pi_frac = pi_inv - pn; -// a = round(pi_frac, 51, RN); -// b = round(pi_frac - a, 51, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, 
",", 2^7 * d, "},"); -// }; -// For [0..1]: -// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that -// the conditions for the algorithms are still satisfied, and one of those -// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be -// discarded. -// for i from 0 to 2 do { -// pi_frac = 0.25 + 2^(16*(i - 3)) / pi; -// a = round(pi_frac, 51, RN); -// b = round(pi_frac - a, 51, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); -// }; -// For The fast pass using double-double, we only need 3 parts (a, b, c), but -// for the accurate pass using Float128, instead of using another table of -// Float128s, we simply add the fourth path (a, b, c, d), which simplify the -// implementation a bit and saving some memory. -LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { - {0x1.0000000000014p5, 0x1.7cc1b727220a8p-49, 0x1.4fe13abe8fa9ap-101, - 0x1.bb81b6c52b328p-155}, - {0x1.0000000145f3p5, 0x1.b727220a94fep-49, 0x1.3abe8fa9a6eep-101, - 0x1.b6c52b3278872p-155}, - {0x1.000145f306dc8p5, 0x1.c882a53f84ebp-47, -0x1.70565911f924fp-101, - 0x1.2b3278872084p-155}, - {0x1.45f306dc9c884p5, -0x1.5ac07b1505c14p-47, -0x1.96447e493ad4dp-99, - 0x1.3c439041fe516p-154}, - {-0x1.f246c6efab58p4, -0x1.ec5417056591p-49, -0x1.f924eb53361dep-101, - -0x1.bef806ba71508p-156}, - {0x1.391054a7f09d4p4, 0x1.f47d4d377036cp-48, 0x1.8a5664f10e41p-100, - 0x1.fe5163abdebbcp-154}, - {0x1.529fc2757d1f4p2, 0x1.34ddc0db62958p-50, 0x1.93c439041fe51p-102, - 0x1.8eaf7aef1586ep-156}, - {-0x1.ec5417056591p-1, -0x1.f924eb53361ep-53, 0x1.c820ff28b1d5fp-105, - -0x1.443a9e48db91cp-162}, - {-0x1.505c1596447e4p5, -0x1.275a99b0ef1cp-48, 0x1.07f9458eaf7afp-100, - -0x1.d4f246dc8e2dfp-157}, - {-0x1.596447e493ad4p1, -0x1.9b0ef1bef806cp-52, 0x1.63abdebbc561bp-106, - 0x1.c91b8e909374cp-160}, - {0x1.bb81b6c52b328p5, -0x1.de37df00d74e4p-49, 0x1.5ef5de2b0db92p-101, - 
0x1.b8e909374b802p-156}, - {0x1.b6c52b3278874p5, -0x1.f7c035d38a844p-47, 0x1.778ac36e48dc7p-99, - 0x1.2126e97003249p-153}, - {0x1.2b3278872084p5, -0x1.ae9c5421443a8p-50, -0x1.e48db91c5bdb2p-102, - -0x1.68ffcdb688afbp-157}, - {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db8p-49, 0x1.2371d2126e97p-101, - 0x1.924bba8274648p-160}, - {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dbp-104, - -0x1.115f62e6de302p-158}, - {-0x1.ae9c5421443a8p-2, -0x1.e48db91c5bdb4p-54, 0x1.d2e006492eea1p-106, - -0x1.8b9b78c078854p-160}, - {-0x1.38a84288753c8p5, -0x1.1b7238b7b645cp-47, 0x1.c00c925dd413ap-99, - 0x1.921cfe1deb1cbp-154}, - {-0x1.0a21d4f246dc8p3, -0x1.c5bdb22d1ff9cp-50, 0x1.25dd413a3243ap-103, - -0x1.e214e34ed658cp-162}, - {-0x1.d4f246dc8e2ep3, 0x1.26e9700324978p-49, -0x1.5f62e6de301e2p-102, - -0x1.4e34ed658c117p-158}, - {-0x1.236e4716f6c8cp4, 0x1.700324977505p-49, -0x1.736f180f10a72p-101, - 0x1.62534e7dd1047p-155}, - {0x1.b8e909374b8p4, 0x1.924bba8274648p-48, 0x1.cfe1deb1cb12ap-102, - -0x1.63045df7282b4p-156}, - {0x1.09374b801924cp4, -0x1.15f62e6de302p-50, 0x1.deb1cb129a73fp-102, - -0x1.77dca0ad144bbp-158}, - {-0x1.68ffcdb688afcp3, 0x1.d1921cfe1debp-50, 0x1.cb129a73ee882p-102, - 0x1.afa975da24275p-157}, - {0x1.924bba8274648p0, 0x1.cfe1deb1cb128p-54, 0x1.a73ee88235f53p-106, - -0x1.44bb7b16638fep-162}, - {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, - 0x1.213a671c09ad1p-160}, - {0x1.3a32439fc3bd8p1, -0x1.c69dacb1822fp-51, 0x1.1afa975da2427p-105, - 0x1.338e04d68befdp-159}, - {-0x1.b78c0788538d4p4, 0x1.29a73ee88236p-50, -0x1.5a28976f62cc7p-103, - -0x1.fb29741037d8dp-159}, - {0x1.fc3bd63962534p5, 0x1.cfba208d7d4bcp-48, -0x1.12edec598e3f6p-100, - -0x1.4ba081bec66e3p-154}, - {-0x1.4e34ed658c118p2, 0x1.046bea5d7689p-51, 0x1.3a671c09ad17ep-104, - -0x1.bec66e29c67cbp-162}, - {0x1.62534e7dd1048p5, -0x1.415a28976f62cp-47, -0x1.8e3f652e8207p-100, - 0x1.3991d63983534p-154}, - {-0x1.63045df7282b4p4, -0x1.44bb7b16638fcp-50, -0x1.94ba081bec66ep-102, - 
-0x1.4e33e566305b2p-157}, - {0x1.d1046bea5d768p5, 0x1.213a671c09adp-48, 0x1.7df904e64758ep-100, - 0x1.835339f49c846p-154}, - {0x1.afa975da24274p3, 0x1.9c7026b45f7e4p-50, 0x1.3991d63983534p-106, - -0x1.82d8dee81d108p-160}, - {-0x1.a28976f62cc7p5, -0x1.fb29741037d8cp-47, -0x1.b8a719f2b3183p-100, - 0x1.3908bf177bf25p-155}, - {-0x1.76f62cc71fb28p5, -0x1.741037d8cdc54p-47, 0x1.cc1a99cfa4e42p-101, - 0x1.7e2ef7e4a0ec8p-156}, - {0x1.d338e04d68bfp5, -0x1.bec66e29c67ccp-50, 0x1.339f49c845f8cp-102, - -0x1.081b5f13801dap-156}, - {0x1.c09ad17df905p4, -0x1.9b8a719f2b318p-48, -0x1.6c6f740e8840ep-103, - 0x1.41d8ffc4bffefp-157}, - {0x1.68befc827323cp5, -0x1.38cf9598c16c8p-47, 0x1.08bf177bf2507p-99, - 0x1.8ffc4bffef02dp-153}, - {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ff1p-103, - 0x1.7ffde05980fefp-158}, - {-0x1.8cdc538cf9598p5, -0x1.82d8dee81d108p-48, -0x1.b5f13801da001p-104, - 0x1.e05980fef2f12p-158}, - {-0x1.4e33e566305bp3, -0x1.bdd03a21036cp-49, 0x1.d8ffc4bffef03p-101, - -0x1.9fc04343b9d29p-156}, - {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f138p-52, -0x1.da00087e99fcp-104, - -0x1.0d0ee74a5f593p-158}, - {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086877p-109, - -0x1.d297d64b824b2p-164}, - {0x1.3908bf177bf24p5, 0x1.0763ff12fffbcp-47, 0x1.6603fbcbc462dp-104, - 0x1.a0a6d1f6d367fp-158}, - {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f593p-110, - 0x1.f6d367ecf27cbp-166}, - {-0x1.081b5f13801dcp4, 0x1.fff7816603fbcp-48, 0x1.788c5ad05369p-101, - -0x1.25930261b069fp-155}, - {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee8p-50, 0x1.6b414da3eda6dp-103, - -0x1.30d834f648b0cp-162}, - {-0x1.c00ed00043f4cp5, -0x1.fc04343b9d298p-48, 0x1.4da3eda6cfd9ep-103, - 0x1.3e584dba7a32p-157}, - {0x1.2fffbc0b301fcp5, 0x1.e5e2316b414dcp-47, -0x1.c125930261b07p-99, - 0x1.84dba7a31fb35p-153}, - {-0x1.0fd33f8086878p3, 0x1.8b5a0a6d1f6d4p-50, -0x1.30261b069ec91p-103, - -0x1.85ce04cb0d00bp-157}, - {-0x1.9fc04343b9d28p4, -0x1.7d64b824b2604p-48, -0x1.86c1a7b24585dp-101, - 
0x1.fb34f2ff516bbp-157}, - {-0x1.0d0ee74a5f594p2, 0x1.1f6d367ecf27cp-50, 0x1.6136e9e8c7ecdp-103, - 0x1.e5fea2d7527bbp-158}, - {-0x1.dce94beb25c14p5, 0x1.a6cfd9e4f9614p-47, -0x1.22c2e70265868p-100, - -0x1.5d28ad8453814p-158}, - {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b6p-104, - -0x1.5b08a7028341dp-159}, - {0x1.b47db4d9fb3c8p4, 0x1.f2c26dd3d18fcp-48, 0x1.9a797fa8b5d4ap-100, - -0x1.14e050683a131p-156}, - {-0x1.25930261b06ap5, 0x1.36e9e8c7ecd3cp-47, 0x1.7fa8b5d49eeb2p-100, - -0x1.41a0e84c2f8c6p-158}, - {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bcp-51, 0x1.aea4f758fd7ccp-103, - -0x1.d0985f18c10ebp-159}, - {-0x1.b069ec9161738p5, -0x1.32c3402ba515cp-51, 0x1.eeb1faf97c5edp-104, - -0x1.7c63043ad6b69p-161}, - {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41dp-104, - -0x1.821d6b5b4565p-160}, - {-0x1.61738132c3404p5, 0x1.45aea4f758fd8p-47, -0x1.a0e84c2f8c608p-102, - -0x1.d6b5b45650128p-156}, - {0x1.fb34f2ff516bcp3, -0x1.6c229c0a0d074p-49, -0x1.30be31821d6b6p-104, - 0x1.2ea6bfb5fb12p-158}, - {0x1.3cbfd45aea4f8p5, -0x1.4e050683a130cp-48, 0x1.ce7de294a4baap-104, - -0x1.404a04ee072a3p-158}, - {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, - -0x1.3b81ca8bdea7fp-164}, - {-0x1.15b08a702834p5, -0x1.d0985f18c10ecp-47, 0x1.4a4ba9afed7ecp-100, - 0x1.1f8d5d0856033p-154}, -}; + int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62); + idx = static_cast((x_e_m62 >> 4) + 3); + // Scale x down by 2^(-(16 * (idx - 3)) + xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62); + // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 + x_reduced = xbits.get_val(); + // x * c_hi = ph.hi + ph.lo exactly. + DoubleDouble x_split = fputil::split(x_reduced); + DoubleDouble ph = fputil::exact_mult(x_split, x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + // x * c_mid = pm.hi + pm.lo exactly. + DoubleDouble pm = fputil::exact_mult(x_split, x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + // x * c_lo = pl.hi + pl.lo exactly. 
+ DoubleDouble pl = fputil::exact_mult(x_split, x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][2]); + // Extract integral parts and fractional parts of (ph.lo + pm.hi). + double sum_hi = ph.lo + pm.hi; + double kd = fputil::nearest_integer(sum_hi); + + // x * 128/pi mod 1 ~ y_hi + y_mid + y_lo + y_hi = (ph.lo - kd) + pm.hi; // Exact + y_mid = fputil::exact_add(pm.lo, pl.hi); + y_lo = pl.lo; + + // y_l = x * c_lo_2 + pl.lo + double y_l = + fputil::multiply_add(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][3], y_lo); + DoubleDouble y = fputil::exact_add(y_hi, y_mid.hi); + y.lo += (y_mid.lo + y_l); + + // Digits of pi/128, generated by Sollya with: + // > a = round(pi/128, D, RN); + // > b = round(pi/128 - a, D, RN); + constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, + 0x1.921fb54442d18p-6}; + + // Error bound: with {a} denote the fractional part of a, i.e.: + // {a} = a - round(a) + // Then, + // | {x * 128/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-105 + // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-105 = 2^-110 + u = fputil::quick_mult(y, PI_OVER_128_DD); + + return static_cast(static_cast(kd)); +} // Lookup table for sin(k * pi / 128) with k = 0, ..., 255. 
// Table is generated with Sollya as follow: @@ -258,6 +147,7 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {0x1.e3a843d1db55fp-53, 0x1.ff621e3796d7cp-1}, {0x1.765595d548d9ap-54, 0x1.ffd886084cd0cp-1}, {0, 1}, +#ifndef LIBC_MATH_HAS_SMALL_TABLES {0x1.765595d548d9ap-54, 0x1.ffd886084cd0cp-1}, {0x1.e3a843d1db55fp-53, 0x1.ff621e3796d7cp-1}, {-0x1.eade132f3981dp-53, 0x1.fe9cdad01883cp-1}, @@ -449,46 +339,9 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {-0x1.ccbeeeae8129ap-56, -0x1.2d52092ce19f4p-4}, {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, {-0x1.f938a73db97fbp-58, -0x1.92155f7a3667cp-6}, +#endif // !LIBC_MATH_HAS_SMALL_TABLES }; -LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { - constexpr double ONE_TWENTY_EIGHT_OVER_PI = 0x1.45f306dc9c883p5; - - // Digits of -pi/128, generated by Sollya with: - // > a = round(-pi/128, 25, RN); - // > b = round(-pi/128 - a, 23, RN); - // > c = round(-pi/128 - a - b, 25, RN); - // > d = round(-pi/128 - a - b - c, D, RN); - // -pi/128 ~ a + b + c + d - // The precisions of the parts are chosen so that: - // 1) k * a, k * b, k * c are exact in double precision - // 2) k * b + (x - (k * a)) is exact in double precsion - constexpr double MPI_OVER_128[4] = {-0x1.921fb5p-6, -0x1.110b48p-32, - +0x1.ee59dap-56, -0x1.98a2e03707345p-83}; - - double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI; - double kd = fputil::nearest_integer(prod_hi); - - // With -pi/128 ~ a + b + c + d as in MPI_OVER_128 description: - // t = x + k * a - double t = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact - // y_hi = t + k * b = (x + k * a) + k * b - double y_hi = fputil::multiply_add(kd, MPI_OVER_128[1], t); // Exact - // y_lo ~ k * c + k * d - double y_lo = fputil::multiply_add(kd, MPI_OVER_128[2], kd * MPI_OVER_128[3]); - // u.hi + u.lo ~ x + k * (a + b + c + d) - u = fputil::exact_add(y_hi, y_lo); - // Error bound: For |x| < 2^-23, - // |(x mod pi/128) - (u_hi + u_lo)| < ulp(y_lo) - // <= ulp(2 
* x * c) - // <= ulp(2^24 * 2^-56) - // = 2^(24 - 56 - 52) - // = 2^-84 - return static_cast(static_cast(kd)); -} - -} // namespace nofma - } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_NOFMA_H diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp index da3d1e94b5f645..2e1d3ffd5f37d8 100644 --- a/libc/src/math/generic/sin.cpp +++ b/libc/src/math/generic/sin.cpp @@ -18,17 +18,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. -#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_SIN_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -43,33 +40,39 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-26 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) - return x; + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-26, |sin(x) - x| < ulp(x)/2. + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) { + // Signed zeros. 
+ if (LIBC_UNLIKELY(x == 0.0)) + return x; - // For |x| < 2^-26, |sin(x) - x| < ulp(x)/2. #ifdef LIBC_TARGET_CPU_HAS_FMA - return fputil::multiply_add(x, -0x1.0p-54, x); + return fputil::multiply_add(x, -0x1.0p-54, x); #else - if (LIBC_UNLIKELY(x_e < 4)) { - int rounding_mode = fputil::quick_get_round(); - if (rounding_mode == FE_TOWARDZERO || - (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || - (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) - return FPBits(xbits.uintval() - 1).get_val(); - } - return fputil::multiply_add(x, -0x1.0p-54, x); + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TOWARDZERO || + (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) + return FPBits(xbits.uintval() - 1).get_val(); + } + return fputil::multiply_add(x, -0x1.0p-54, x); #endif // LIBC_TARGET_CPU_HAS_FMA + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -82,69 +85,51 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { } // Large range reduction. 
- k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } DoubleDouble sin_y, cos_y; - generic::sincos_eval(y, sin_y, cos_y); + [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast(k & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble sin_k = get_idx_dd(k); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table. + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble sin_k = get_idx_dd(k); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). 
DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. // Then sin(x) = sin((k * pi/128 + y) // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) - DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); - DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); + DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); + DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); DoubleDouble rr = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; -#ifdef LIBC_MATH_SIN_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS return rr.hi + rr.lo; #else // Accurate test and pass for correctly rounded implementation. -#ifdef LIBC_TARGET_CPU_HAS_FMA - constexpr double ERR = 0x1.0p-70; -#else - // TODO: Improve non-FMA fast pass accuracy. - constexpr double ERR = 0x1.0p-66; -#endif // LIBC_TARGET_CPU_HAS_FMA - - double rlp = rr.lo + ERR; - double rlm = rr.lo - ERR; + double rlp = rr.lo + err; + double rlm = rr.lo - err; double r_upper = rr.hi + rlp; // (rr.lo + ERR); double r_lower = rr.hi + rlm; // (rr.lo - ERR); @@ -155,7 +140,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { Float128 u_f128, sin_u, cos_u; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -163,7 +148,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 
64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -182,7 +167,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { // https://github.com/llvm/llvm-project/issues/96452. return static_cast(r); -#endif // !LIBC_MATH_SIN_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/sincos.cpp b/libc/src/math/generic/sincos.cpp index 1af0ee7b0eb2c8..166ce466031409 100644 --- a/libc/src/math/generic/sincos.cpp +++ b/libc/src/math/generic/sincos.cpp @@ -19,17 +19,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. 
-#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -44,40 +41,47 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-27 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) { - *sin_x = x; - *cos_x = 1.0; - return; - } - - // For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2. + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-27 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) { + *sin_x = x; + *cos_x = 1.0; + return; + } + + // For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2. 
#ifdef LIBC_TARGET_CPU_HAS_FMA - *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); - *cos_x = fputil::multiply_add(x, -x, 1.0); + *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); + *cos_x = fputil::multiply_add(x, -x, 1.0); #else - *cos_x = fputil::round_result_slightly_down(1.0); - - if (LIBC_UNLIKELY(x_e < 4)) { - int rounding_mode = fputil::quick_get_round(); - if (rounding_mode == FE_TOWARDZERO || - (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || - (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) - *sin_x = FPBits(xbits.uintval() - 1).get_val(); - } - *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); + *cos_x = fputil::round_result_slightly_down(1.0); + + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TOWARDZERO || + (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) + *sin_x = FPBits(xbits.uintval() - 1).get_val(); + } + *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); #endif // LIBC_TARGET_CPU_HAS_FMA - return; + return; + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -91,56 +95,46 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { } // Large range reduction. 
- k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } DoubleDouble sin_y, cos_y; - generic::sincos_eval(y, sin_y, cos_y); + [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast(k & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble sin_k = get_idx_dd(k); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table. + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble sin_k = get_idx_dd(k); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). 
DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // LIBC_MATH_HAS_SMALL_TABLES + DoubleDouble msin_k{-sin_k.lo, -sin_k.hi}; // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. // Then sin(x) = sin((k * pi/128 + y) // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) - DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); - DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); + DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); + DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); // cos(x) = cos((k * pi/128 + y) // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) - DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); - DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); + DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); + DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); DoubleDouble sin_dd = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); @@ -149,24 +143,17 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { sin_dd.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; cos_dd.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; -#ifdef LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS *sin_x = sin_dd.hi + sin_dd.lo; *cos_x = cos_dd.hi + cos_dd.lo; return; #else // Accurate test and pass for correctly rounded implementation. -#ifdef LIBC_TARGET_CPU_HAS_FMA - constexpr double ERR = 0x1.0p-70; -#else - // TODO: Improve non-FMA fast pass accuracy. 
- constexpr double ERR = 0x1.0p-66; -#endif // LIBC_TARGET_CPU_HAS_FMA - - double sin_lp = sin_dd.lo + ERR; - double sin_lm = sin_dd.lo - ERR; - double cos_lp = cos_dd.lo + ERR; - double cos_lm = cos_dd.lo - ERR; + double sin_lp = sin_dd.lo + err; + double sin_lm = sin_dd.lo - err; + double cos_lp = cos_dd.lo + err; + double cos_lm = cos_dd.lo - err; double sin_upper = sin_dd.hi + sin_lp; double sin_lower = sin_dd.hi + sin_lm; @@ -182,7 +169,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { Float128 u_f128, sin_u, cos_u; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -190,7 +177,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -222,7 +209,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u), fputil::quick_mul(msin_k_f128, sin_u))); -#endif // !LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/sincos_eval.h b/libc/src/math/generic/sincos_eval.h index e491467c5663fd..6cd1da4721bf57 100644 --- a/libc/src/math/generic/sincos_eval.h +++ b/libc/src/math/generic/sincos_eval.h @@ -23,8 +23,8 @@ namespace generic { using fputil::DoubleDouble; using Float128 = fputil::DyadicFloat<128>; -LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, - DoubleDouble &cos_u) { +LIBC_INLINE double sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, + DoubleDouble &cos_u) { // Evaluate sin(y) = sin(x - k * 
(pi/128)) // We use the degree-7 Taylor approximation: // sin(y) ~ y - y^3/3! + y^5/5! - y^7/7! @@ -61,9 +61,19 @@ LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, // + u_hi u_lo (-1 + u_hi^2/6) // We compute 1 - u_hi^2 accurately: // v_hi + v_lo ~ 1 - u_hi^2/2 - double v_hi = fputil::multiply_add(u.hi, u.hi * (-0.5), 1.0); - double v_lo = 1.0 - v_hi; // Exact - v_lo = fputil::multiply_add(u.hi, u.hi * (-0.5), v_lo); + // with error <= 2^-105. + double u_hi_neg_half = (-0.5) * u.hi; + DoubleDouble v; + +#ifdef LIBC_TARGET_CPU_HAS_FMA + v.hi = fputil::multiply_add(u.hi, u_hi_neg_half, 1.0); + v.lo = 1.0 - v.hi; // Exact + v.lo = fputil::multiply_add(u.hi, u_hi_neg_half, v.lo); +#else + DoubleDouble u_hi_sq_neg_half = fputil::exact_mult(u.hi, u_hi_neg_half); + v = fputil::exact_add(1.0, u_hi_sq_neg_half.hi); + v.lo += u_hi_sq_neg_half.lo; +#endif // LIBC_TARGET_CPU_HAS_FMA // r1 ~ -1/720 + u_hi^2 / 40320 double r1 = fputil::multiply_add(u_hi_sq, 0x1.a01a01a01a01ap-16, @@ -75,12 +85,15 @@ LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, // r2 ~ 1/24 + u_hi^2 (-1/720 + u_hi^2 / 40320) double r2 = fputil::multiply_add(u_hi_sq, r1, 0x1.5555555555555p-5); // s2 ~ v_lo + u_hi * u_lo * (-1 + u_hi^2 / 6) - double s2 = fputil::multiply_add(u_hi_u_lo, s1, v_lo); + double s2 = fputil::multiply_add(u_hi_u_lo, s1, v.lo); double cos_lo = fputil::multiply_add(u_hi_4, r2, s2); // Overall, |cos(y) - (v_hi + cos_lo)| < 2*ulp(u_hi^4) < 2^-75. 
sin_u = fputil::exact_add(u.hi, sin_lo); - cos_u = fputil::exact_add(v_hi, cos_lo); + cos_u = fputil::exact_add(v.hi, cos_lo); + + return fputil::multiply_add(fputil::FPBits(u_hi_3).abs().get_val(), + 0x1.0p-51, 0x1.0p-105); } LIBC_INLINE void sincos_eval(const Float128 &u, Float128 &sin_u, diff --git a/libc/src/math/generic/tan.cpp b/libc/src/math/generic/tan.cpp index 45fd8bb9156be0..f9be25ed866e1d 100644 --- a/libc/src/math/generic/tan.cpp +++ b/libc/src/math/generic/tan.cpp @@ -20,16 +20,13 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. -#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_TAN_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -38,7 +35,7 @@ using Float128 = typename fputil::DyadicFloat<128>; namespace { -LIBC_INLINE DoubleDouble tan_eval(const DoubleDouble &u) { +LIBC_INLINE double tan_eval(const DoubleDouble &u, DoubleDouble &result) { // Evaluate tan(y) = tan(x - k * (pi/128)) // We use the degree-9 Taylor approximation: // tan(y) ~ P(y) = y + y^3/3 + 2*y^5/15 + 17*y^7/315 + 62*y^9/2835 @@ -69,10 +66,12 @@ LIBC_INLINE DoubleDouble tan_eval(const DoubleDouble &u) { // Overall, |tan(y) - (u_hi + tan_lo)| < ulp(u_hi^3) <= 2^-71. 
// And the relative errors is: // |(tan(y) - (u_hi + tan_lo)) / tan(y) | <= 2*ulp(u_hi^2) < 2^-64 - - return fputil::exact_add(u.hi, tan_lo); + result = fputil::exact_add(u.hi, tan_lo); + return fputil::multiply_add(fputil::FPBits(u_hi_3).abs().get_val(), + 0x1.0p-51, 0x1.0p-102); } +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Accurate evaluation of tan for small u. [[maybe_unused]] Float128 tan_eval(const Float128 &u) { Float128 u_sq = fputil::quick_mul(u, u); @@ -117,6 +116,7 @@ LIBC_INLINE DoubleDouble tan_eval(const DoubleDouble &u) { fputil::quick_mul(q1, fputil::quick_add(TWO, fputil::quick_mul(b, q1))); return fputil::quick_mul(a, q2); } +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } // anonymous namespace @@ -128,33 +128,38 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-27 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) - return x; + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-27, |tan(x) - x| < ulp(x)/2. + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) + return x; - // For |x| < 2^-27, |tan(x) - x| < ulp(x)/2. 
#ifdef LIBC_TARGET_CPU_HAS_FMA - return fputil::multiply_add(x, 0x1.0p-54, x); + return fputil::multiply_add(x, 0x1.0p-54, x); #else - if (LIBC_UNLIKELY(x_e < 4)) { - int rounding_mode = fputil::quick_get_round(); - if (rounding_mode == FE_TOWARDZERO || - (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || - (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) - return FPBits(xbits.uintval() + 1).get_val(); - } - return fputil::multiply_add(x, 0x1.0p-54, x); + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if ((xbits.sign() == Sign::POS && rounding_mode == FE_UPWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_DOWNWARD)) + return FPBits(xbits.uintval() + 1).get_val(); + } + return fputil::multiply_add(x, 0x1.0p-54, x); #endif // LIBC_TARGET_CPU_HAS_FMA + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -167,42 +172,32 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { } // Large range reduction. 
- k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } - DoubleDouble tan_y = tan_eval(y); + DoubleDouble tan_y; + [[maybe_unused]] double err = tan_eval(y, tan_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast(k & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble msin_k = get_idx_dd(k + 128); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table: + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble msin_k = get_idx_dd(k + 128); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). 
DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128) & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // LIBC_MATH_HAS_SMALL_TABLES // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. @@ -212,8 +207,8 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { // / (cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128)) // = (sin(k*pi/128) + tan(y) * cos(k*pi/128)) / // / (cos(k*pi/128) - tan(y) * sin(k*pi/128)) - DoubleDouble cos_k_tan_y = fputil::quick_mult(tan_y, cos_k); - DoubleDouble msin_k_tan_y = fputil::quick_mult(tan_y, msin_k); + DoubleDouble cos_k_tan_y = fputil::quick_mult(tan_y, cos_k); + DoubleDouble msin_k_tan_y = fputil::quick_mult(tan_y, msin_k); // num_dd = sin(k*pi/128) + tan(y) * cos(k*pi/128) DoubleDouble num_dd = fputil::exact_add(cos_k_tan_y.hi, -msin_k.hi); @@ -222,7 +217,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { num_dd.lo += cos_k_tan_y.lo - msin_k.lo; den_dd.lo += msin_k_tan_y.lo + cos_k.lo; -#ifdef LIBC_MATH_TAN_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS double tan_x = (num_dd.hi + num_dd.lo) / (den_dd.hi + den_dd.lo); return tan_x; #else @@ -231,18 +226,16 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { // Accurate double-double division DoubleDouble tan_x = fputil::div(num_dd, den_dd); - // Relative errors for k != 0 mod 64 is: - // absolute errors / min(sin(k*pi/128), cos(k*pi/128)) <= 2^-71 / 2^-7 - // = 2^-64. - // For k = 0 mod 64, the relative errors is bounded by: - // 2^-71 / 2^(exponent of x). - constexpr int ERR = 64; + // Simple error bound: |1 / den_dd| < 2^(1 + floor(-log2(den_dd)))). 
+ uint64_t den_inv = (static_cast(FPBits::EXP_BIAS + 1) + << (FPBits::FRACTION_LEN + 1)) - + (FPBits(den_dd.hi).uintval() & FPBits::EXP_MASK); - int y_exp = 7 + FPBits(y.hi).get_exponent(); - int rel_err_exp = ERR + static_cast((k & 63) == 0) * y_exp; - int64_t tan_x_err = static_cast(FPBits(tan_x.hi).uintval()) - - (static_cast(rel_err_exp) << 52); - double tan_err = FPBits(static_cast(tan_x_err)).get_val(); + // For tan_x = (num_dd + err) / (den_dd + err), the error is bounded by: + // | tan_x - num_dd / den_dd | <= err * ( 1 + | tan_x * den_dd | ). + double tan_err = + err * fputil::multiply_add(FPBits(den_inv).get_val(), + FPBits(tan_x.hi).abs().get_val(), 1.0); double err_higher = tan_x.lo + tan_err; double err_lower = tan_x.lo - tan_err; @@ -256,7 +249,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { Float128 u_f128; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -264,7 +257,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -292,7 +285,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { // https://github.com/llvm/llvm-project/issues/96452. 
return static_cast(result); -#endif // !LIBC_MATH_TAN_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp index 484d47fd3e96c4..e2d47917e545e0 100644 --- a/libc/test/src/math/cos_test.cpp +++ b/libc/test/src/math/cos_test.cpp @@ -50,8 +50,7 @@ TEST_F(LlvmLibcCosTest, TrickyInputs) { 0x1.2b5fe88a9d8d5p+903, 0x1.f6d7518808571p+1023, -0x1.a880417b7b119p+1023, 0x1.00a33764a0a83p-7, 0x1.fe81868fc47fep+1, 0x1.0da8cc189b47dp-10, - 0x1.da1838053b866p+5, - + 0x1.da1838053b866p+5, 0x1.ffffffffe854bp199, }; constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]); diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index 60f6ef5c844630..d4c6bd416a4099 100644 --- a/libc/test/src/math/sin_test.cpp +++ b/libc/test/src/math/sin_test.cpp @@ -20,11 +20,13 @@ using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcSinTest, TrickyInputs) { constexpr double INPUTS[] = { - 0x1.940c877fb7dacp-7, 0x1.fffffffffdb6p24, 0x1.fd4da4ef37075p29, - 0x1.b951f1572eba5p+31, 0x1.55202aefde314p+31, 0x1.85fc0f04c0128p101, - 0x1.7776c2343ba4ep101, 0x1.678309fa50d58p110, 0x1.fffffffffef4ep199, - -0x1.ab514bfc61c76p+7, -0x1.f7898d5a756ddp+2, -0x1.f42fb19b5b9b2p-6, - 0x1.5f09cad750ab1p+3, -0x1.14823229799c2p+7, -0x1.0285070f9f1bcp-5, + 0x1.5f09cad750ab1p+3, 0x1.fff781921b61fp15, -0x1.f635b70b92407p-21, + -0x1.3ecf146c39c0cp-20, 0x1.6ac5b262ca1ffp849, 0x1.6c6cbc45dc8dep5, + 0x1.921fb5443p-7, 0x1.940c877fb7dacp-7, 0x1.fffffffffdb6p24, + 0x1.fd4da4ef37075p29, 0x1.b951f1572eba5p+31, 0x1.55202aefde314p+31, + 0x1.85fc0f04c0128p101, 0x1.7776c2343ba4ep101, 0x1.678309fa50d58p110, + 0x1.fffffffffef4ep199, -0x1.ab514bfc61c76p+7, -0x1.f7898d5a756ddp+2, + -0x1.f42fb19b5b9b2p-6, -0x1.14823229799c2p+7, -0x1.0285070f9f1bcp-5, 0x1.23f40dccdef72p+0, 0x1.43cf16358c9d7p+0, 0x1.addf3b9722265p+0, 0x1.48ff1782ca91dp+8, 0x1.a211877de55dbp+4, 0x1.dcbfda0c7559ep+8, 0x1.1ffb509f3db15p+5, 
0x1.2345d1e090529p+5, 0x1.ae945054939c2p+10, diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp index 1ca67afdaddf25..12dfc02bac111a 100644 --- a/libc/test/src/math/tan_test.cpp +++ b/libc/test/src/math/tan_test.cpp @@ -20,17 +20,20 @@ using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcTanTest, TrickyInputs) { constexpr double INPUTS[] = { - 0x1.d130383d17321p-27, 0x1.8000000000009p-23, 0x1.8000000000024p-22, - 0x1.800000000009p-21, 0x1.20000000000f3p-20, 0x1.800000000024p-20, - 0x1.e0000000001c2p-20, 0x1.00452f0e0134dp-13, 0x1.0da8cc189b47dp-10, - 0x1.00a33764a0a83p-7, 0x1.911a18779813fp-7, 0x1.940c877fb7dacp-7, - 0x1.f42fb19b5b9b2p-6, 0x1.0285070f9f1bcp-5, 0x1.89f0f5241255bp-2, + 0x0.0000000000001p-1022, 0x1.d130383d17321p-27, 0x1.8000000000009p-23, + 0x1.8000000000024p-22, 0x1.800000000009p-21, 0x1.20000000000f3p-20, + 0x1.800000000024p-20, 0x1.e0000000001c2p-20, 0x1.00452f0e0134dp-13, + 0x1.0da8cc189b47dp-10, 0x1.00a33764a0a83p-7, 0x1.911a18779813fp-7, + 0x1.940c877fb7dacp-7, 0x1.f42fb19b5b9b2p-6, 0x1.0285070f9f1bcp-5, + 0x1.90e833c6969c7p-4, 0x1.91d4b77c527eap-3, 0x1.89f0f5241255bp-2, 0x1.6ca9ef729af76p-1, 0x1.23f40dccdef72p+0, 0x1.43cf16358c9d7p+0, + 0x1.90f422b49115ep+0, 0x1.9220efee9fc7ep+0, 0x1.a224411cdebcep+0, 0x1.addf3b9722265p+0, 0x1.ae78d360afa15p+0, 0x1.fe81868fc47fep+1, - 0x1.e31b55306f22cp+2, 0x1.e639103a05997p+2, 0x1.f7898d5a756ddp+2, - 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, 0x1.aaf85537ea4c7p+3, - 0x1.4f2b874135d27p+4, 0x1.13114266f9764p+4, 0x1.a211877de55dbp+4, - 0x1.a5eece87e8606p+4, 0x1.a65d441ea6dcep+4, 0x1.045457ae3994p+5, + 0x1.e31b55306f22cp+2, 0x1.e639103a05997p+2, 0x1.f69d074a3358fp+2, + 0x1.f7898d5a756ddp+2, 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, + 0x1.aaf85537ea4c7p+3, 0x1.c50ddc4f513b4p+3, 0x1.13114266f9764p+4, + 0x1.4f2b874135d27p+4, 0x1.a211877de55dbp+4, 0x1.a5eece87e8606p+4, + 0x1.a65d441ea6dcep+4, 0x1.ab8c2f8ab5b7p+4, 0x1.045457ae3994p+5, 0x1.1ffb509f3db15p+5, 0x1.2345d1e090529p+5, 
0x1.c96e28eb679f8p+5, 0x1.da1838053b866p+5, 0x1.be886d9c2324dp+6, 0x1.ab514bfc61c76p+7, 0x1.14823229799c2p+7, 0x1.48ff1782ca91dp+8, 0x1.dcbfda0c7559ep+8, From e01ae3920dd98779f2e58aa8f103ae3b6c6b5499 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 20:56:25 -0700 Subject: [PATCH 116/177] [NFC][sanitizer] Use tid_t instead of int in ThreadLister (#111941) --- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 4 ++-- compiler-rt/lib/sanitizer_common/sanitizer_linux.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index a4e58133c79f08..31750cf65ab6eb 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1086,7 +1086,7 @@ ThreadLister::Result ThreadLister::ListThreads( } } -const char *ThreadLister::LoadStatus(int tid) { +const char *ThreadLister::LoadStatus(tid_t tid) { auto cleanup = at_scope_exit([&] { // Resize back to capacity if it is downsized by `ReadFileToVector`. buffer_.resize(buffer_.capacity()); @@ -1097,7 +1097,7 @@ const char *ThreadLister::LoadStatus(int tid) { return buffer_.data(); } -bool ThreadLister::IsAlive(int tid) { +bool ThreadLister::IsAlive(tid_t tid) { // /proc/%d/task/%d/status uses same call to detect alive threads as // proc_task_readdir. See task_state implementation in Linux. 
static const char kPrefix[] = "\nPPid:"; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index 07d9528813b3fe..8b7874bb5a3494 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -103,10 +103,10 @@ class ThreadLister { Ok, }; Result ListThreads(InternalMmapVector *threads); - const char *LoadStatus(int tid); + const char *LoadStatus(tid_t tid); private: - bool IsAlive(int tid); + bool IsAlive(tid_t tid); InternalScopedString task_path_; InternalScopedString status_path_; From 59b2945c705671a676806b8985c3ade8d6088ab1 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 20:57:34 -0700 Subject: [PATCH 117/177] [sanitizer] Fix ThreadLister::IsAlive (#111942) 'status_path_' must include `tid`. Regression from #111909. --- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 3 ++- .../lib/sanitizer_common/tests/sanitizer_linux_test.cpp | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 31750cf65ab6eb..33107eb0b42993 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1027,7 +1027,6 @@ bool internal_sigismember(__sanitizer_sigset_t *set, int signum) { // ThreadLister implementation. ThreadLister::ThreadLister(pid_t pid) : buffer_(4096) { task_path_.AppendF("/proc/%d/task", pid); - status_path_.AppendF("%s/status", task_path_.data()); } ThreadLister::Result ThreadLister::ListThreads( @@ -1087,6 +1086,8 @@ ThreadLister::Result ThreadLister::ListThreads( } const char *ThreadLister::LoadStatus(tid_t tid) { + status_path_.clear(); + status_path_.AppendF("%s/%llu/status", task_path_.data(), tid); auto cleanup = at_scope_exit([&] { // Resize back to capacity if it is downsized by `ReadFileToVector`. 
buffer_.resize(buffer_.capacity()); diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp index b286ab72a5c795..ce4a40444cd496 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp @@ -143,6 +143,9 @@ TEST_F(ThreadListerTest, ThreadListerSeesAllSpawnedThreads) { std::vector listed_tids = ReadTidsToVector(&thread_lister); ASSERT_TRUE(HasElement(listed_tids, self_tid)); ASSERT_TRUE(Includes(listed_tids, tids_)); + + ASSERT_NE(nullptr, thread_lister.LoadStatus(self_tid)); + for (auto tid : tids_) ASSERT_NE(nullptr, thread_lister.LoadStatus(tid)); } TEST_F(ThreadListerTest, DoNotForgetThreads) { From 36b07077673b6c639804160c6b31ce57718e13db Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 05:58:25 +0200 Subject: [PATCH 118/177] [clang][bytecode] Return an lvalue path for dummy pointers (#111862) Not doing this is wrong in general and we need to reject expressions where it would matter differently. --- clang/lib/AST/ByteCode/Compiler.cpp | 16 ++++++++++------ clang/lib/AST/ByteCode/Pointer.cpp | 5 ----- clang/test/AST/ByteCode/cxx1z.cpp | 3 +++ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index ba4c5600d613b0..0a3b38b0dc6e57 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6006,6 +6006,9 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { return this->emitGetPtrParam(It->second.Offset, E); } + + if (D->getType()->isReferenceType()) + return false; // FIXME: Do we need to emit InvalidDeclRef? } // In case we need to re-visit a declaration. 
@@ -6042,9 +6045,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { const auto typeShouldBeVisited = [&](QualType T) -> bool { if (T.isConstant(Ctx.getASTContext())) return true; - if (const auto *RT = T->getAs()) - return RT->getPointeeType().isConstQualified(); - return false; + return T->isReferenceType(); }; // DecompositionDecls are just proxies for us. @@ -6060,9 +6061,12 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { // other words, we're evaluating the initializer, just to know if we can // evaluate the initializer. if (VD->isLocalVarDecl() && typeShouldBeVisited(VD->getType()) && - VD->getInit() && !VD->getInit()->isValueDependent() && - VD->evaluateValue()) - return revisit(VD); + VD->getInit() && !VD->getInit()->isValueDependent()) { + + if (VD->evaluateValue()) + return revisit(VD); + return this->emitInvalidDeclRef(cast(E), E); + } } } else { if (const auto *VD = dyn_cast(D); diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index a52f0e336ef298..75b00dcb2ab242 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -253,11 +253,6 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const { } } - // FIXME(perf): We compute the lvalue path above, but we can't supply it - // for dummy pointers (that causes crashes later in CheckConstantExpression). - if (isDummy()) - Path.clear(); - // We assemble the LValuePath starting from the innermost pointer to the // outermost one. SO in a.b.c, the first element in Path will refer to // the field 'c', while later code expects it to refer to 'a'. 
diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp index 2b5d215f016548..1a06597fa348fe 100644 --- a/clang/test/AST/ByteCode/cxx1z.cpp +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -10,3 +10,6 @@ namespace Temp { A c; // both-error {{reference to subobject of temporary object}} A d; // both-error {{pointer to subobject of temporary object}} } + +char arr[3]; +A d; // both-error {{refers to subobject '&arr[1]'}} From 374886a360424d5f1c38359378a504408a9f64ed Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 20:59:48 -0700 Subject: [PATCH 119/177] [NFC][sanitizer] Check suspended threads outside `ThreadSuspender::SuspendThread` (#111943) Allows to distinguish failure from stopped threads. --- .../sanitizer_stoptheworld_linux_libcdep.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index 6ebca965f6a334..ebe7b6f2ee8cc2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -137,10 +137,6 @@ class ThreadSuspender { }; bool ThreadSuspender::SuspendThread(tid_t tid) { - // Are we already attached to this thread? - // Currently this check takes linear time, however the number of threads is - // usually small. - if (suspended_threads_list_.ContainsTid(tid)) return false; int pterrno; if (internal_iserror(internal_ptrace(PTRACE_ATTACH, tid, nullptr, nullptr), &pterrno)) { @@ -226,6 +222,11 @@ bool ThreadSuspender::SuspendAllThreads() { break; } for (tid_t tid : threads) { + // Are we already attached to this thread? + // Currently this check takes linear time, however the number of threads + // is usually small. 
+ if (suspended_threads_list_.ContainsTid(tid)) + continue; if (SuspendThread(tid)) retry = true; else From 36639af8adcd302e12f2962fd2b917d41323e5ae Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:01:07 -0700 Subject: [PATCH 120/177] [NFC][sanitizer] VReport incomplete list (#111944) --- .../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index ebe7b6f2ee8cc2..945da99d41f4ea 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -216,6 +216,7 @@ bool ThreadSuspender::SuspendAllThreads() { VReport(1, "Failed to list threads\n"); return false; case ThreadLister::Incomplete: + VReport(1, "Incomplete list\n"); retry = true; break; case ThreadLister::Ok: From e556f0787cb9675a120fcfc91156edcd27047772 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:03:29 -0700 Subject: [PATCH 121/177] [NFC][asan] Cleanup AsanThreadIdAndName ctor/init (#111923) Co-authored-by: YunQiang Su --- compiler-rt/lib/asan/asan_descriptions.cpp | 26 +++++++++------------- compiler-rt/lib/asan/asan_descriptions.h | 2 -- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 674fe9c1e90be0..db87789aea86a0 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -20,24 +20,20 @@ namespace __asan { AsanThreadIdAndName::AsanThreadIdAndName(AsanThreadContext *t) { - Init(t->tid, t->name); -} - -AsanThreadIdAndName::AsanThreadIdAndName(u32 tid) { - if (tid == kInvalidTid) { - Init(tid, ""); - } else { - asanThreadRegistry().CheckLocked(); - AsanThreadContext *t = 
GetThreadContextByTidLocked(tid); - Init(tid, t->name); + if (!t) { + internal_snprintf(name, sizeof(name), "T-1"); + return; } + int len = internal_snprintf(name, sizeof(name), "T%d", t->tid); + CHECK(((unsigned int)len) < sizeof(name)); + if (internal_strlen(t->name)) + internal_snprintf(&name[len], sizeof(name) - len, " (%s)", t->name); } -void AsanThreadIdAndName::Init(u32 tid, const char *tname) { - int len = internal_snprintf(name, sizeof(name), "T%d", tid); - CHECK(((unsigned int)len) < sizeof(name)); - if (tname[0] != '\0') - internal_snprintf(&name[len], sizeof(name) - len, " (%s)", tname); +AsanThreadIdAndName::AsanThreadIdAndName(u32 tid) + : AsanThreadIdAndName( + tid == kInvalidTid ? nullptr : GetThreadContextByTidLocked(tid)) { + asanThreadRegistry().CheckLocked(); } void DescribeThread(AsanThreadContext *context) { diff --git a/compiler-rt/lib/asan/asan_descriptions.h b/compiler-rt/lib/asan/asan_descriptions.h index 650e2eb9173ad5..a614f47d461bbd 100644 --- a/compiler-rt/lib/asan/asan_descriptions.h +++ b/compiler-rt/lib/asan/asan_descriptions.h @@ -35,8 +35,6 @@ class AsanThreadIdAndName { const char *c_str() const { return &name[0]; } private: - void Init(u32 tid, const char *tname); - char name[128]; }; From df4c91342577cd9a74f168ad8c98380538d5e7c4 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:04:25 -0700 Subject: [PATCH 122/177] [asan] Print `unique_id` instead of `tid` (#111925) Before the first reuse, after 2^32 threads they are equal. 
--- compiler-rt/lib/asan/asan_descriptions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index db87789aea86a0..caec79313e22ff 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -24,7 +24,7 @@ AsanThreadIdAndName::AsanThreadIdAndName(AsanThreadContext *t) { internal_snprintf(name, sizeof(name), "T-1"); return; } - int len = internal_snprintf(name, sizeof(name), "T%d", t->tid); + int len = internal_snprintf(name, sizeof(name), "T%llu", t->unique_id); CHECK(((unsigned int)len) < sizeof(name)); if (internal_strlen(t->name)) internal_snprintf(&name[len], sizeof(name) - len, " (%s)", t->name); From 3cb4d20d5bcefd98454d0e181cd89f8ee6f16498 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:19:49 -0700 Subject: [PATCH 123/177] [NFC][sanitizer] Simplify GetThreadLocked Now we can pass `invalid tid`. --- compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h index 2c7e5c276fa1c7..bf492c17f7e107 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h @@ -101,7 +101,7 @@ class SANITIZER_MUTEX ThreadRegistry { // Should be guarded by ThreadRegistryLock. ThreadContextBase *GetThreadLocked(u32 tid) { - return threads_.empty() ? nullptr : threads_[tid]; + return tid < threads_.size() ? 
threads_[tid] : nullptr; } u32 NumThreadsLocked() const { return threads_.size(); } From bf81bd800fbcf1d11f149d897f55409e27ec59fb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 21:36:51 -0700 Subject: [PATCH 124/177] [ELF] Pass Ctx & --- lld/ELF/Arch/ARM.cpp | 8 ++++++-- lld/ELF/Arch/LoongArch.cpp | 6 +++--- lld/ELF/Arch/X86.cpp | 18 ++++++++++++++---- lld/ELF/Arch/X86_64.cpp | 18 ++++++++++++++---- lld/ELF/LinkerScript.cpp | 15 +++++++++------ 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 43fbbc8d49131a..013e90cde6f995 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -48,6 +48,10 @@ class ARM final : public TargetInfo { bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; + +private: +void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, + int group, bool check) const; }; enum class CodeState { Data = 0, Thumb = 2, Arm = 4 }; } // namespace @@ -534,8 +538,8 @@ static std::pair getRemAndLZForGroup(unsigned group, return {rem, lz}; } -static void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, - int group, bool check) { +void ARM::encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, + int group, bool check) const { // ADD/SUB (immediate) add = bit23, sub = bit22 // immediate field carries is a 12-bit modified immediate, made up of a 4-bit // even rotate right and an 8-bit immediate. 
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index eca1d2fdc08caf..f16f8f0c8d5ce4 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -159,7 +159,7 @@ static bool isJirl(uint32_t insn) { return (insn & 0xfc000000) == JIRL; } -static void handleUleb128(uint8_t *loc, uint64_t val) { +static void handleUleb128(Ctx &ctx, uint8_t *loc, uint64_t val) { const uint32_t maxcount = 1 + 64 / 7; uint32_t count; const char *error = nullptr; @@ -700,7 +700,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, write64le(loc, read64le(loc) + val); return; case R_LARCH_ADD_ULEB128: - handleUleb128(loc, val); + handleUleb128(ctx, loc, val); return; case R_LARCH_SUB6: *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f); @@ -718,7 +718,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, write64le(loc, read64le(loc) - val); return; case R_LARCH_SUB_ULEB128: - handleUleb128(loc, -val); + handleUleb128(ctx, loc, -val); return; case R_LARCH_MARK_LA: diff --git a/lld/ELF/Arch/X86.cpp b/lld/ELF/Arch/X86.cpp index 3314dcfc172f8c..4e574a520f1ff1 100644 --- a/lld/ELF/Arch/X86.cpp +++ b/lld/ELF/Arch/X86.cpp @@ -39,6 +39,12 @@ class X86 : public TargetInfo { RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + +private: + void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; }; } // namespace @@ -344,7 +350,8 @@ void X86::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { } } -static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == 
R_386_TLS_GD) { // Convert (loc[-2] == 0x04) // leal x@tlsgd(, %ebx, 1), %eax @@ -379,7 +386,8 @@ static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { } } -static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_386_TLS_GD) { // Convert (loc[-2] == 0x04) // leal x@tlsgd(, %ebx, 1), %eax @@ -413,7 +421,8 @@ static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { // In some conditions, relocations can be optimized to avoid using GOT. // This function does that for Initial Exec to Local Exec case. -static void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { // Ulrich's document section 6.2 says that @gotntpoff can // be used with MOVL or ADDL instructions. // @indntpoff is similar to @gotntpoff, but for use in @@ -450,7 +459,8 @@ static void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { write32le(loc, val); } -static void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_386_TLS_LDO_32) { write32le(loc, val); return; diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index fbf1076fc71e52..121b7d9929b209 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -50,6 +50,12 @@ class X86_64 : public TargetInfo { bool deleteFallThruJmpInsn(InputSection &is, InputFile *file, InputSection *nextIS) const override; bool relaxOnce(int pass) const override; + +private: + void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void 
relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; }; } // namespace @@ -460,7 +466,8 @@ RelType X86_64::getDynRel(RelType type) const { return R_X86_64_NONE; } -static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86_64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_X86_64_TLSGD) { // Convert // .byte 0x66 @@ -500,7 +507,8 @@ static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { } } -static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86_64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_X86_64_TLSGD) { // Convert // .byte 0x66 @@ -541,7 +549,8 @@ static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { // In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to // R_X86_64_TPOFF32 so that it does not use GOT. -static void relaxTlsIeToLe(uint8_t *loc, const Relocation &, uint64_t val) { +void X86_64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { uint8_t *inst = loc - 3; uint8_t reg = loc[-1] >> 3; uint8_t *regSlot = loc - 1; @@ -582,7 +591,8 @@ static void relaxTlsIeToLe(uint8_t *loc, const Relocation &, uint64_t val) { write32le(loc, val + 4); } -static void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86_64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { const uint8_t inst[] = { 0x66, 0x66, // .word 0x6666 0x66, // .byte 0x66 diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index f3f95ec589bd82..e9a637bac4e9bd 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -479,7 +479,7 @@ static void sortSections(MutableArrayRef vec, // --sort-section is handled as an inner SORT command. // 3. If one SORT command is given, and if it is SORT_NONE, don't sort. // 4. 
If no SORT command is given, sort according to --sort-section. -static void sortInputSections(MutableArrayRef vec, +static void sortInputSections(Ctx &ctx, MutableArrayRef vec, SortSectionPolicy outer, SortSectionPolicy inner) { if (outer == SortSectionPolicy::None) @@ -517,6 +517,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd, for (size_t i = begin; i != end; ++i) ret[i] = sections[indexes[i]]; sortInputSections( + ctx, MutableArrayRef(ret).slice(begin, end - begin), ctx.arg.sortSection, SortSectionPolicy::None); }; @@ -584,6 +585,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd, // ret[sizeBeforeCurrPat,ret.size()) are already in the input order, so we // just sort by sortOuter and sortInner. sortInputSections( + ctx, MutableArrayRef(ret).slice(sizeBeforeCurrPat), pat.sortOuter, pat.sortInner); sizeAfterPrevSort = ret.size(); @@ -865,7 +867,8 @@ static OutputDesc *createSection(InputSectionBase *isec, StringRef outsecName) { return osd; } -static OutputDesc *addInputSec(StringMap> &map, +static OutputDesc *addInputSec(Ctx &ctx, + StringMap> &map, InputSectionBase *isec, StringRef outsecName) { // Sections with SHT_GROUP or SHF_GROUP attributes reach here only when the -r // option is given. 
A section with SHT_GROUP defines a "section group", and @@ -983,7 +986,7 @@ void LinkerScript::addOrphanSections() { } else if (OutputSection *sec = findByName(sectionCommands, name)) { sec->recordSection(s); } else { - if (OutputDesc *osd = addInputSec(map, s, name)) + if (OutputDesc *osd = addInputSec(ctx, map, s, name)) v.push_back(osd); assert(isa(s) || s->getOutputSection()->sectionIndex == UINT32_MAX); @@ -1114,7 +1117,7 @@ LinkerScript::findMemoryRegion(OutputSection *sec, MemoryRegion *hint) { return {nullptr, nullptr}; } -static OutputSection *findFirstSection(PhdrEntry *load) { +static OutputSection *findFirstSection(Ctx &ctx, PhdrEntry *load) { for (OutputSection *sec : ctx.outputSections) if (sec->ptLoad == load) return sec; @@ -1187,7 +1190,7 @@ bool LinkerScript::assignOffsets(OutputSection *sec) { // Propagate state->lmaOffset to the first "non-header" section. if (PhdrEntry *l = sec->ptLoad) - if (sec == findFirstSection(l)) + if (sec == findFirstSection(ctx, l)) l->lmaOffset = state->lmaOffset; // We can call this method multiple times during the creation of @@ -1462,7 +1465,7 @@ void LinkerScript::allocateHeaders(SmallVector &phdrs) { ctx.out.elfHeader->ptLoad = nullptr; ctx.out.programHeaders->ptLoad = nullptr; - firstPTLoad->firstSec = findFirstSection(firstPTLoad); + firstPTLoad->firstSec = findFirstSection(ctx, firstPTLoad); llvm::erase_if(phdrs, [](const PhdrEntry *e) { return e->p_type == PT_PHDR; }); From 15de239406bfc0a1dfbd0640490c4bd5d1e0ac33 Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Fri, 11 Oct 2024 12:09:10 +0700 Subject: [PATCH 125/177] [IR] Allow MDString in operand bundles (#110805) This change implements support of metadata strings in operand bundle values. It makes possible calls like: call void @some_func(i32 %x) [ "foo"(i32 42, metadata !"abc") ] It requires some extension of the bitcode serialization. 
As SSA values and metadata are stored in different tables, there must be a way to distinguish them during deserialization. It is implemented by putting a special marker before the metadata index. The marker cannot be treated as a reference to any SSA value, so it unambiguously identifies metadata. It allows extending the bitcode serialization without breaking compatibility. Metadata as operand bundle values are intended to be used in floating-point function calls. They would represent the same information as now is passed by the constrained intrinsic arguments. --- llvm/docs/LangRef.rst | 6 +++--- llvm/docs/ReleaseNotes.md | 2 ++ llvm/include/llvm/Bitcode/LLVMBitCodes.h | 3 +++ llvm/lib/AsmParser/LLParser.cpp | 8 +++++++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 21 ++++++++++++++++++-- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 17 +++++++++++++++- llvm/test/Bitcode/compatibility.ll | 8 ++++++++ llvm/test/Bitcode/operand-bundles.ll | 24 +++++++++++++++++++++++ 8 files changed, 82 insertions(+), 7 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 0c7279de06cd68..a330b804930326 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2666,8 +2666,8 @@ are grouped into a single :ref:`attribute group `. Operand Bundles --------------- -Operand bundles are tagged sets of SSA values that can be associated -with certain LLVM instructions (currently only ``call`` s and +Operand bundles are tagged sets of SSA values or metadata strings that can be +associated with certain LLVM instructions (currently only ``call`` s and ``invoke`` s). In a way they are like metadata, but dropping them is incorrect and will change program semantics. 
@@ -2675,7 +2675,7 @@ Syntax:: operand bundle set ::= '[' operand bundle (, operand bundle )* ']' operand bundle ::= tag '(' [ bundle operand ] (, bundle operand )* ')' - bundle operand ::= SSA value + bundle operand ::= SSA value | metadata string tag ::= string constant Operand bundles are **not** part of a function's signature, and a diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 8ac5900a7e532e..dcdd7a25c7fbee 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -88,6 +88,8 @@ Changes to the LLVM IR * `llvm.nvvm.ptr.shared.to.gen` * `llvm.nvvm.ptr.constant.to.gen` * `llvm.nvvm.ptr.local.to.gen` + +* Operand bundle values can now be metadata strings. Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index cbd92fd52fc75a..ba2efee9414218 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -529,6 +529,9 @@ enum PossiblyExactOperatorOptionalFlags { PEO_EXACT = 0 }; /// PossiblyDisjointInst's SubclassOptionalData contents. enum PossiblyDisjointInstOptionalFlags { PDI_DISJOINT = 0 }; +/// Mark to distinguish metadata from value in an operator bundle. +enum MetadataOperandBundleValueMarker { OB_METADATA = 0x80000000 }; + /// GetElementPtrOptionalFlags - Flags for serializing /// GEPOperator's SubclassOptionalData contents. 
enum GetElementPtrOptionalFlags { diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 9f2ef2e6a9311e..c3b4a8235ce637 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3202,8 +3202,14 @@ bool LLParser::parseOptionalOperandBundles( Type *Ty = nullptr; Value *Input = nullptr; - if (parseType(Ty) || parseValue(Ty, Input, PFS)) + if (parseType(Ty)) return true; + if (Ty->isMetadataTy()) { + if (parseMetadataAsValue(Input, PFS)) + return true; + } else if (parseValue(Ty, Input, PFS)) { + return true; + } Inputs.push_back(Input); } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 6f997510b03609..8ee93253bc2447 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -792,6 +792,24 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { return ResVal == nullptr; } + bool getValueOrMetadata(const SmallVectorImpl &Record, + unsigned &Slot, unsigned InstNum, Value *&ResVal, + BasicBlock *ConstExprInsertBB) { + if (Slot == Record.size()) + return true; + unsigned ValID = Record[Slot++]; + if (ValID != bitc::OB_METADATA) { + unsigned TypeId; + return getValueTypePair(Record, --Slot, InstNum, ResVal, TypeId, + ConstExprInsertBB); + } + if (Slot == Record.size()) + return true; + unsigned ValNo = InstNum - (unsigned)Record[Slot++]; + ResVal = MetadataAsValue::get(Context, getFnMetadataByID(ValNo)); + return false; + } + /// Read a value out of the specified record from slot 'Slot'. Increment Slot /// past the number of slots used by the value in the record. Return true if /// there is an error. 
@@ -6767,8 +6785,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 1; while (OpNum != Record.size()) { Value *Op; - unsigned OpTypeID; - if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) + if (getValueOrMetadata(Record, OpNum, NextValueNo, Op, CurBB)) return error("Invalid record"); Inputs.push_back(Op); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index d9086bfebbd2a9..bec0caef58afa8 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -395,6 +395,8 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase { void writeModuleConstants(); bool pushValueAndType(const Value *V, unsigned InstID, SmallVectorImpl &Vals); + bool pushValueOrMetadata(const Value *V, unsigned InstID, + SmallVectorImpl &Vals); void writeOperandBundles(const CallBase &CB, unsigned InstID); void pushValue(const Value *V, unsigned InstID, SmallVectorImpl &Vals); @@ -2931,6 +2933,19 @@ bool ModuleBitcodeWriter::pushValueAndType(const Value *V, unsigned InstID, return false; } +bool ModuleBitcodeWriter::pushValueOrMetadata(const Value *V, unsigned InstID, + SmallVectorImpl &Vals) { + bool IsMetadata = V->getType()->isMetadataTy(); + if (IsMetadata) { + Vals.push_back(bitc::OB_METADATA); + Metadata *MD = cast(V)->getMetadata(); + unsigned ValID = VE.getMetadataID(MD); + Vals.push_back(InstID - ValID); + return false; + } + return pushValueAndType(V, InstID, Vals); +} + void ModuleBitcodeWriter::writeOperandBundles(const CallBase &CS, unsigned InstID) { SmallVector Record; @@ -2941,7 +2956,7 @@ void ModuleBitcodeWriter::writeOperandBundles(const CallBase &CS, Record.push_back(C.getOperandBundleTagID(Bundle.getTagName())); for (auto &Input : Bundle.Inputs) - pushValueAndType(Input, InstID, Record); + pushValueOrMetadata(Input, InstID, Record); Stream.EmitRecord(bitc::FUNC_CODE_OPERAND_BUNDLE, Record); Record.clear(); diff --git 
a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index a1b2370a87b821..280c3a99d7535f 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1327,6 +1327,14 @@ continue: ret i32 0 } +declare void @instructions.bundles.callee(i32) +define void @instructions.bundles.metadata(i32 %x) { +entry: + call void @instructions.bundles.callee(i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] +; CHECK: call void @instructions.bundles.callee(i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + ret void +} + ; Instructions -- Unary Operations define void @instructions.unops(double %op1) { fneg double %op1 diff --git a/llvm/test/Bitcode/operand-bundles.ll b/llvm/test/Bitcode/operand-bundles.ll index ab28cffd84aa29..a8e086f784c6cf 100644 --- a/llvm/test/Bitcode/operand-bundles.ll +++ b/llvm/test/Bitcode/operand-bundles.ll @@ -56,6 +56,13 @@ define void @f4(i32* %ptr) { ret void } +define void @f5(i32 %x) { +entry: + call void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] +; CHECK: call void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + ret void +} + ; Invoke versions of the above tests: @@ -150,3 +157,20 @@ exception: normal: ret void } + +define void @g5(ptr %ptr) personality i8 3 { +entry: + %l = load i32, ptr %ptr, align 4 + %x = add i32 42, 1 + invoke void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + to label %normal unwind label %exception +; CHECK: invoke void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + +exception: ; preds = %entry + %cleanup = landingpad i8 + cleanup + br label %normal + +normal: ; preds = %exception, %entry + ret void +} From c22588c7cdc5a82afd825ce90f21f922dedee98b Mon Sep 
17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 22:15:10 -0700 Subject: [PATCH 126/177] [ELF] Move InputSectionBase::file to SectionBase ... and add getCtx (file->ctx). This allows InputSectionBase and OutputSection to access ctx without taking an extra function argument. --- lld/ELF/EhFrame.cpp | 2 +- lld/ELF/InputFiles.h | 4 +++- lld/ELF/InputSection.cpp | 10 ++++++---- lld/ELF/InputSection.h | 22 ++++++++++++---------- lld/ELF/OutputSections.cpp | 5 +++-- lld/ELF/OutputSections.h | 3 ++- 6 files changed, 27 insertions(+), 19 deletions(-) diff --git a/lld/ELF/EhFrame.cpp b/lld/ELF/EhFrame.cpp index d2d0e62e97ec45..f4c788fe610ae6 100644 --- a/lld/ELF/EhFrame.cpp +++ b/lld/ELF/EhFrame.cpp @@ -119,7 +119,7 @@ void EhReader::skipAugP() { uint8_t enc = readByte(); if ((enc & 0xf0) == DW_EH_PE_aligned) failOn(d.data() - 1, "DW_EH_PE_aligned encoding is not supported"); - size_t size = getAugPSize(ctx, enc); + size_t size = getAugPSize(isec->getCtx(), enc); if (size == 0) failOn(d.data() - 1, "unknown FDE encoding"); if (size >= d.size()) diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 0b54f92d1a2669..f80413b215047d 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -48,8 +48,10 @@ void parseFiles(Ctx &, const std::vector &files); // The root class of input files. 
class InputFile { -protected: +public: Ctx &ctx; + +protected: std::unique_ptr symbols; uint32_t numSymbols = 0; SmallVector sections; diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 0885815a22a14a..90716f4f3675cc 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -52,9 +52,9 @@ InputSectionBase::InputSectionBase(InputFile *file, uint64_t flags, uint32_t link, uint32_t info, uint32_t addralign, ArrayRef data, StringRef name, Kind sectionKind) - : SectionBase(sectionKind, name, flags, entsize, addralign, type, info, - link), - file(file), content_(data.data()), size(data.size()) { + : SectionBase(sectionKind, file, name, flags, entsize, addralign, type, + info, link), + content_(data.data()), size(data.size()) { // In order to reduce memory allocation, we assume that mergeable // sections are smaller than 4 GiB, which is not an unreasonable // assumption as of 2017. @@ -88,7 +88,7 @@ template InputSectionBase::InputSectionBase(ObjFile &file, const typename ELFT::Shdr &hdr, StringRef name, Kind sectionKind) - : InputSectionBase(&file, getFlags(ctx, hdr.sh_flags), hdr.sh_type, + : InputSectionBase(&file, getFlags(file.ctx, hdr.sh_flags), hdr.sh_type, hdr.sh_entsize, hdr.sh_link, hdr.sh_info, hdr.sh_addralign, getSectionContents(file, hdr), name, sectionKind) { @@ -185,6 +185,8 @@ RelsOrRelas InputSectionBase::relsOrRelas(bool supportsCrel) const { return ret; } +Ctx &SectionBase::getCtx() const { return file->ctx; } + uint64_t SectionBase::getOffset(uint64_t offset) const { switch (kind()) { case Output: { diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index bff9ec324d9bc5..8f69a957e11d7a 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -78,6 +78,12 @@ class SectionBase { uint8_t partition = 1; uint32_t type; + + // The file which contains this section. For InputSectionBase, its dynamic + // type is usually ObjFile, but may be an InputFile of InternalKind + // (for a synthetic section). 
+ InputFile *file; + StringRef name; // The 1-indexed partition that this section is assigned to by the garbage @@ -92,6 +98,7 @@ class SectionBase { uint32_t link; uint32_t info; + Ctx &getCtx() const; OutputSection *getOutputSection(); const OutputSection *getOutputSection() const { return const_cast(this)->getOutputSection(); @@ -108,12 +115,12 @@ class SectionBase { void markDead() { partition = 0; } protected: - constexpr SectionBase(Kind sectionKind, StringRef name, uint64_t flags, - uint32_t entsize, uint32_t addralign, uint32_t type, - uint32_t info, uint32_t link) + constexpr SectionBase(Kind sectionKind, InputFile *file, StringRef name, + uint64_t flags, uint32_t entsize, uint32_t addralign, + uint32_t type, uint32_t info, uint32_t link) : sectionKind(sectionKind), bss(false), keepUnique(false), type(type), - name(name), flags(flags), addralign(addralign), entsize(entsize), - link(link), info(info) {} + file(file), name(name), flags(flags), addralign(addralign), + entsize(entsize), link(link), info(info) {} }; struct SymbolAnchor { @@ -150,11 +157,6 @@ class InputSectionBase : public SectionBase { return s->kind() != Output && s->kind() != Class; } - // The file which contains this section. Its dynamic type is usually - // ObjFile, but may be an InputFile of InternalKind (for a synthetic - // section). - InputFile *file; - // Input sections are part of an output section. Special sections // like .eh_frame and merge sections are first combined into a // synthetic section that is then added to an output section. 
In all diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 408dbdc43d5481..864c30ca790508 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -66,8 +66,9 @@ void OutputSection::writeHeaderTo(typename ELFT::Shdr *shdr) { } OutputSection::OutputSection(StringRef name, uint32_t type, uint64_t flags) - : SectionBase(Output, name, flags, /*Entsize*/ 0, /*Alignment*/ 1, type, - /*Info*/ 0, /*Link*/ 0) {} + : SectionBase(Output, ctx.internalFile, name, flags, /*entsize=*/0, + /*addralign=*/1, type, + /*info=*/0, /*link=*/0) {} // We allow sections of types listed below to merged into a // single progbits section. This is typically done by linker diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h index 904206b20bc1cb..11977507e9268e 100644 --- a/lld/ELF/OutputSections.h +++ b/lld/ELF/OutputSections.h @@ -150,7 +150,8 @@ struct SectionClass final : public SectionBase { SmallVector commands; bool assigned = false; - SectionClass(StringRef name) : SectionBase(Class, name, 0, 0, 0, 0, 0, 0) {} + SectionClass(StringRef name) + : SectionBase(Class, nullptr, name, 0, 0, 0, 0, 0, 0) {} static bool classof(const SectionBase *s) { return s->kind() == Class; } }; From e018f550d0c40bd99294cdd943c23bbec3804ace Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 22:22:25 -0700 Subject: [PATCH 127/177] [ELF] Pass Ctx & --- lld/ELF/ARMErrataFix.cpp | 2 +- lld/ELF/Arch/Mips.cpp | 4 ++-- lld/ELF/DWARF.cpp | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 839ab2b074bdd1..5818772bf19d14 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -417,7 +417,7 @@ void ARMErr657417Patcher::insertPatches( // isec so the branch we are patching always goes forwards. 
static void implementPatch(ScanResult sr, InputSection *isec, std::vector &patches) { - + Ctx &ctx = isec->getCtx(); log("detected cortex-a8-657419 erratum sequence starting at " + utohexstr(isec->getVA(sr.off)) + " in unpatched output."); Patch657417Section *psec; diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index 975fa9ead762d7..6313ac8ca4fb9a 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -480,7 +480,7 @@ int64_t MIPS::getImplicitAddend(const uint8_t *buf, RelType type) const { } static std::pair -calculateMipsRelChain(uint8_t *loc, RelType type, uint64_t val) { +calculateMipsRelChain(Ctx &ctx, uint8_t *loc, RelType type, uint64_t val) { // MIPS N64 ABI packs multiple relocations into the single relocation // record. In general, all up to three relocations can have arbitrary // types. In fact, Clang and GCC uses only a few combinations. For now, @@ -572,7 +572,7 @@ void MIPS::relocate(uint8_t *loc, const Relocation &rel, RelType type = rel.type; if (ELFT::Is64Bits || ctx.arg.mipsN32Abi) - std::tie(type, val) = calculateMipsRelChain(loc, type, val); + std::tie(type, val) = calculateMipsRelChain(ctx, loc, type, val); // Detect cross-mode jump/branch and fix instruction. 
val = fixupCrossModeJump(loc, type, val); diff --git a/lld/ELF/DWARF.cpp b/lld/ELF/DWARF.cpp index 133e66baabe2de..8e4740919a481d 100644 --- a/lld/ELF/DWARF.cpp +++ b/lld/ELF/DWARF.cpp @@ -112,6 +112,7 @@ LLDDwarfObj::findAux(const InputSectionBase &sec, uint64_t pos, const RelTy &rel = *it; const ObjFile *file = sec.getFile(); + Ctx &ctx = sec.getCtx(); uint32_t symIndex = rel.getSymbol(ctx.arg.isMips64EL); const typename ELFT::Sym &sym = file->template getELFSyms()[symIndex]; uint32_t secIndex = file->getSectionIndex(sym); From 25cda9e069bc5948f38dde0d2e07814a7bf3fc71 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 23:07:02 -0700 Subject: [PATCH 128/177] [ELF] Pass Ctx & to SyntheticSection --- lld/ELF/AArch64ErrataFix.cpp | 2 +- lld/ELF/ARMErrataFix.cpp | 2 +- lld/ELF/Arch/ARM.cpp | 9 +- lld/ELF/Arch/RISCV.cpp | 7 +- lld/ELF/Driver.cpp | 2 +- lld/ELF/InputSection.h | 2 +- lld/ELF/OutputSections.cpp | 4 +- lld/ELF/Relocations.cpp | 6 +- lld/ELF/SyntheticSections.cpp | 305 ++++++++++++++++++---------------- lld/ELF/SyntheticSections.h | 121 +++++++------- 10 files changed, 240 insertions(+), 220 deletions(-) diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp index 19db4295d46ed5..a5129c58da13d9 100644 --- a/lld/ELF/AArch64ErrataFix.cpp +++ b/lld/ELF/AArch64ErrataFix.cpp @@ -393,7 +393,7 @@ class elf::Patch843419Section final : public SyntheticSection { }; Patch843419Section::Patch843419Section(Ctx &ctx, InputSection *p, uint64_t off) - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, ".text.patch"), patchee(p), patcheeOffset(off) { this->parent = p->getParent(); diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 5818772bf19d14..57df542e57ec48 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -136,7 +136,7 @@ static bool is32bitBranch(uint32_t instr) { Patch657417Section::Patch657417Section(Ctx &ctx, 
InputSection *p, uint64_t off, uint32_t instr, bool isARM) - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, ".text.patch"), patchee(p), patcheeOffset(off), instr(instr), isARM(isARM) { parent = p->getParent(); diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 013e90cde6f995..ecf293602ac99d 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1331,7 +1331,7 @@ class elf::ArmCmseSGVeneer { }; ArmCmseSGSection::ArmCmseSGSection(Ctx &ctx) - : SyntheticSection(llvm::ELF::SHF_ALLOC | llvm::ELF::SHF_EXECINSTR, + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC | llvm::ELF::SHF_EXECINSTR, llvm::ELF::SHT_PROGBITS, /*alignment=*/32, ".gnu.sgstubs"), ctx(ctx) { @@ -1446,10 +1446,11 @@ void ArmCmseSGSection::finalizeContents(Ctx &) { // https://developer.arm.com/documentation/ecm0359818/latest template void elf::writeARMCmseImportLib(Ctx &ctx) { StringTableSection *shstrtab = - make(".shstrtab", /*dynamic=*/false); + make(ctx, ".shstrtab", /*dynamic=*/false); StringTableSection *strtab = - make(".strtab", /*dynamic=*/false); - SymbolTableBaseSection *impSymTab = make>(*strtab); + make(ctx, ".strtab", /*dynamic=*/false); + SymbolTableBaseSection *impSymTab = + make>(ctx, *strtab); SmallVector, 0> osIsPairs; osIsPairs.emplace_back(make(strtab->name, 0, 0), strtab); diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 57cc26b3f0a3ff..351cca025b3864 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -1044,8 +1044,9 @@ namespace { // extension. 
class RISCVAttributesSection final : public SyntheticSection { public: - RISCVAttributesSection() - : SyntheticSection(0, SHT_RISCV_ATTRIBUTES, 1, ".riscv.attributes") {} + RISCVAttributesSection(Ctx &ctx) + : SyntheticSection(ctx, 0, SHT_RISCV_ATTRIBUTES, 1, ".riscv.attributes") { + } size_t getSize(Ctx &) const override { return size; } void writeTo(Ctx &, uint8_t *buf) override; @@ -1179,7 +1180,7 @@ mergeAttributesSection(Ctx &ctx, unsigned firstStackAlignValue = 0, xlen = 0; bool hasArch = false; - ctx.in.riscvAttributes = std::make_unique(); + ctx.in.riscvAttributes = std::make_unique(ctx); auto &merged = static_cast(*ctx.in.riscvAttributes); // Collect all tags values from attributes section. diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 43f75bab12775f..019388c9bd2e2c 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2370,7 +2370,7 @@ static void replaceCommonSymbols(Ctx &ctx) { if (!s) continue; - auto *bss = make("COMMON", s->size, s->alignment); + auto *bss = make(ctx, "COMMON", s->size, s->alignment); bss->file = s->file; ctx.inputSections.push_back(bss); Defined(s->file, StringRef(), s->binding, s->stOther, s->type, diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 8f69a957e11d7a..543ff4db3c3270 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -472,7 +472,7 @@ static_assert(sizeof(InputSection) <= 160, "InputSection is too big"); class SyntheticSection : public InputSection { public: - SyntheticSection(uint64_t flags, uint32_t type, uint32_t addralign, + SyntheticSection(Ctx &ctx, uint64_t flags, uint32_t type, uint32_t addralign, StringRef name) : InputSection(ctx.internalFile, flags, type, addralign, {}, name, InputSectionBase::Synthetic) {} diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 864c30ca790508..3f3b80830d80d5 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -178,8 +178,8 @@ static MergeSyntheticSection *createMergeSynthetic(Ctx 
&ctx, StringRef name, uint64_t flags, uint32_t addralign) { if ((flags & SHF_STRINGS) && ctx.arg.optimize >= 2) - return make(name, type, flags, addralign); - return make(name, type, flags, addralign); + return make(ctx, name, type, flags, addralign); + return make(ctx, name, type, flags, addralign); } // This function scans over the InputSectionBase list sectionBases to create diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index ba2d493c28213f..5d81d0cccb78e5 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -381,8 +381,8 @@ template static void addCopyRelSymbol(Ctx &ctx, SharedSymbol &ss) { // See if this symbol is in a read-only segment. If so, preserve the symbol's // memory protection by reserving space in the .bss.rel.ro section. bool isRO = isReadOnly(ss); - BssSection *sec = - make(isRO ? ".bss.rel.ro" : ".bss", symSize, ss.alignment); + BssSection *sec = make(ctx, isRO ? ".bss.rel.ro" : ".bss", + symSize, ss.alignment); OutputSection *osec = (isRO ? ctx.in.bssRelRo : ctx.in.bss)->getParent(); // At this point, sectionBases has been migrated to sections. Append sec to @@ -2185,7 +2185,7 @@ void ThunkCreator::createInitialThunkSections( ThunkSection *ThunkCreator::addThunkSection(OutputSection *os, InputSectionDescription *isd, uint64_t off) { - auto *ts = make(os, off); + auto *ts = make(ctx, os, off); ts->partition = os->partition; if ((ctx.arg.fixCortexA53Errata843419 || ctx.arg.fixCortexA8) && !isd->sections.empty()) { diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 88f0ccf1c4b730..a65c137762ce63 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -96,8 +96,9 @@ MergeInputSection *elf::createCommentSection() { // .MIPS.abiflags section. 
template -MipsAbiFlagsSection::MipsAbiFlagsSection(Elf_Mips_ABIFlags flags) - : SyntheticSection(SHF_ALLOC, SHT_MIPS_ABIFLAGS, 8, ".MIPS.abiflags"), +MipsAbiFlagsSection::MipsAbiFlagsSection(Ctx &ctx, + Elf_Mips_ABIFlags flags) + : SyntheticSection(ctx, SHF_ALLOC, SHT_MIPS_ABIFLAGS, 8, ".MIPS.abiflags"), flags(flags) { this->entsize = sizeof(Elf_Mips_ABIFlags); } @@ -152,14 +153,14 @@ MipsAbiFlagsSection::create(Ctx &ctx) { }; if (create) - return std::make_unique>(flags); + return std::make_unique>(ctx, flags); return nullptr; } // .MIPS.options section. template -MipsOptionsSection::MipsOptionsSection(Elf_Mips_RegInfo reginfo) - : SyntheticSection(SHF_ALLOC, SHT_MIPS_OPTIONS, 8, ".MIPS.options"), +MipsOptionsSection::MipsOptionsSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) + : SyntheticSection(ctx, SHF_ALLOC, SHT_MIPS_OPTIONS, 8, ".MIPS.options"), reginfo(reginfo) { this->entsize = sizeof(Elf_Mips_Options) + sizeof(Elf_Mips_RegInfo); } @@ -216,13 +217,13 @@ MipsOptionsSection::create(Ctx &ctx) { } }; - return std::make_unique>(reginfo); + return std::make_unique>(ctx, reginfo); } // MIPS .reginfo section. template -MipsReginfoSection::MipsReginfoSection(Elf_Mips_RegInfo reginfo) - : SyntheticSection(SHF_ALLOC, SHT_MIPS_REGINFO, 4, ".reginfo"), +MipsReginfoSection::MipsReginfoSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) + : SyntheticSection(ctx, SHF_ALLOC, SHT_MIPS_REGINFO, 4, ".reginfo"), reginfo(reginfo) { this->entsize = sizeof(Elf_Mips_RegInfo); } @@ -263,7 +264,7 @@ MipsReginfoSection::create(Ctx &ctx) { sec->getFile()->mipsGp0 = r->ri_gp_value; }; - return std::make_unique>(reginfo); + return std::make_unique>(ctx, reginfo); } InputSection *elf::createInterpSection() { @@ -319,8 +320,8 @@ static size_t getHashSize() { // If the flag is zero (which indicates that the intersection of the feature // sets is empty, or some input files didn't have .note.gnu.property sections), // we don't create this section. 
-GnuPropertySection::GnuPropertySection() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, +GnuPropertySection::GnuPropertySection(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, ctx.arg.wordsize, ".note.gnu.property") {} void GnuPropertySection::writeTo(Ctx &ctx, uint8_t *buf) { @@ -361,8 +362,8 @@ size_t GnuPropertySection::getSize(Ctx &ctx) const { return contentSize + 16; } -BuildIdSection::BuildIdSection() - : SyntheticSection(SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), +BuildIdSection::BuildIdSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), hashSize(getHashSize()) {} void BuildIdSection::writeTo(Ctx &ctx, uint8_t *buf) { @@ -378,14 +379,16 @@ void BuildIdSection::writeBuildId(ArrayRef buf) { memcpy(hashBuf, buf.data(), hashSize); } -BssSection::BssSection(StringRef name, uint64_t size, uint32_t alignment) - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_NOBITS, alignment, name) { +BssSection::BssSection(Ctx &ctx, StringRef name, uint64_t size, + uint32_t alignment) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_NOBITS, alignment, + name) { this->bss = true; this->size = size; } -EhFrameSection::EhFrameSection() - : SyntheticSection(SHF_ALLOC, SHT_PROGBITS, 1, ".eh_frame") {} +EhFrameSection::EhFrameSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 1, ".eh_frame") {} // Search for an existing CIE record or create a new one. 
// CIE records from input object files are uniquified by their contents @@ -653,8 +656,8 @@ void EhFrameSection::writeTo(Ctx &ctx, uint8_t *buf) { getPartition().ehFrameHdr->write(); } -GotSection::GotSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, +GotSection::GotSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, ctx.target->gotEntrySize, ".got") { numEntries = ctx.target->gotHeaderEntriesNum; } @@ -737,9 +740,9 @@ static uint64_t getMipsPageCount(uint64_t size) { return (size + 0xfffe) / 0xffff + 1; } -MipsGotSection::MipsGotSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE | SHF_MIPS_GPREL, SHT_PROGBITS, 16, - ".got") {} +MipsGotSection::MipsGotSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE | SHF_MIPS_GPREL, + SHT_PROGBITS, 16, ".got") {} void MipsGotSection::addEntry(InputFile &file, Symbol &sym, int64_t addend, RelExpr expr) { @@ -1169,9 +1172,9 @@ void MipsGotSection::writeTo(Ctx &ctx, uint8_t *buf) { // instead of the .got.plt, and the type is SHT_NOBITS similar to a .bss // section. I don't know why we have a BSS style type for the section but it is // consistent across both 64-bit PowerPC ABIs as well as the 32-bit PowerPC ABI. -GotPltSection::GotPltSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, ctx.arg.wordsize, - ".got.plt") { +GotPltSection::GotPltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, + ctx.arg.wordsize, ".got.plt") { if (ctx.arg.emachine == EM_PPC) { name = ".plt"; } else if (ctx.arg.emachine == EM_PPC64) { @@ -1221,8 +1224,8 @@ static StringRef getIgotPltName() { // On PowerPC64 the GotPltSection type is SHT_NOBITS so we have to follow suit // with the IgotPltSection. -IgotPltSection::IgotPltSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, +IgotPltSection::IgotPltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, ctx.arg.emachine == EM_PPC64 ? 
SHT_NOBITS : SHT_PROGBITS, ctx.target->gotEntrySize, getIgotPltName()) {} @@ -1242,8 +1245,9 @@ void IgotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -StringTableSection::StringTableSection(StringRef name, bool dynamic) - : SyntheticSection(dynamic ? (uint64_t)SHF_ALLOC : 0, SHT_STRTAB, 1, name), +StringTableSection::StringTableSection(Ctx &ctx, StringRef name, bool dynamic) + : SyntheticSection(ctx, dynamic ? (uint64_t)SHF_ALLOC : 0, SHT_STRTAB, 1, + name), dynamic(dynamic) { // ELF string tables start with a NUL byte. strings.push_back(""); @@ -1283,9 +1287,9 @@ void StringTableSection::writeTo(Ctx &ctx, uint8_t *buf) { static unsigned getVerDefNum() { return namedVersionDefs(ctx).size() + 1; } template -DynamicSection::DynamicSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_DYNAMIC, ctx.arg.wordsize, - ".dynamic") { +DynamicSection::DynamicSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_DYNAMIC, + ctx.arg.wordsize, ".dynamic") { this->entsize = ELFT::Is64Bits ? 16 : 8; // .dynamic section is not writable on MIPS and on Fuchsia OS @@ -1632,12 +1636,12 @@ uint32_t DynamicReloc::getSymIndex(SymbolTableBaseSection *symTab) const { return index; } -RelocationBaseSection::RelocationBaseSection(StringRef name, uint32_t type, - int32_t dynamicTag, +RelocationBaseSection::RelocationBaseSection(Ctx &ctx, StringRef name, + uint32_t type, int32_t dynamicTag, int32_t sizeDynamicTag, bool combreloc, unsigned concurrency) - : SyntheticSection(SHF_ALLOC, type, ctx.arg.wordsize, name), + : SyntheticSection(ctx, SHF_ALLOC, type, ctx.arg.wordsize, name), dynamicTag(dynamicTag), sizeDynamicTag(sizeDynamicTag), relocsVec(concurrency), combreloc(combreloc) {} @@ -1728,9 +1732,9 @@ void RelocationBaseSection::computeRels() { } template -RelocationSection::RelocationSection(StringRef name, bool combreloc, - unsigned concurrency) - : RelocationBaseSection(name, ctx.arg.isRela ? 
SHT_RELA : SHT_REL, +RelocationSection::RelocationSection(Ctx &ctx, StringRef name, + bool combreloc, unsigned concurrency) + : RelocationBaseSection(ctx, name, ctx.arg.isRela ? SHT_RELA : SHT_REL, ctx.arg.isRela ? DT_RELA : DT_REL, ctx.arg.isRela ? DT_RELASZ : DT_RELSZ, combreloc, concurrency) { @@ -1750,9 +1754,10 @@ void RelocationSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -RelrBaseSection::RelrBaseSection(unsigned concurrency, bool isAArch64Auth) +RelrBaseSection::RelrBaseSection(Ctx &ctx, unsigned concurrency, + bool isAArch64Auth) : SyntheticSection( - SHF_ALLOC, + ctx, SHF_ALLOC, isAArch64Auth ? SHT_AARCH64_AUTH_RELR : (ctx.arg.useAndroidRelrTags ? SHT_ANDROID_RELR : SHT_RELR), @@ -1771,9 +1776,9 @@ void RelrBaseSection::mergeRels() { template AndroidPackedRelocationSection::AndroidPackedRelocationSection( - StringRef name, unsigned concurrency) + Ctx &ctx, StringRef name, unsigned concurrency) : RelocationBaseSection( - name, ctx.arg.isRela ? SHT_ANDROID_RELA : SHT_ANDROID_REL, + ctx, name, ctx.arg.isRela ? SHT_ANDROID_RELA : SHT_ANDROID_REL, ctx.arg.isRela ? DT_ANDROID_RELA : DT_ANDROID_REL, ctx.arg.isRela ? DT_ANDROID_RELASZ : DT_ANDROID_RELSZ, /*combreloc=*/false, concurrency) { @@ -2024,8 +2029,9 @@ bool AndroidPackedRelocationSection::updateAllocSize(Ctx &ctx) { } template -RelrSection::RelrSection(unsigned concurrency, bool isAArch64Auth) - : RelrBaseSection(concurrency, isAArch64Auth) { +RelrSection::RelrSection(Ctx &ctx, unsigned concurrency, + bool isAArch64Auth) + : RelrBaseSection(ctx, concurrency, isAArch64Auth) { this->entsize = ctx.arg.wordsize; } @@ -2110,8 +2116,9 @@ template bool RelrSection::updateAllocSize(Ctx &ctx) { return relrRelocs.size() != oldSize; } -SymbolTableBaseSection::SymbolTableBaseSection(StringTableSection &strTabSec) - : SyntheticSection(strTabSec.isDynamic() ? 
(uint64_t)SHF_ALLOC : 0, +SymbolTableBaseSection::SymbolTableBaseSection(Ctx &ctx, + StringTableSection &strTabSec) + : SyntheticSection(ctx, strTabSec.isDynamic() ? (uint64_t)SHF_ALLOC : 0, strTabSec.isDynamic() ? SHT_DYNSYM : SHT_SYMTAB, ctx.arg.wordsize, strTabSec.isDynamic() ? ".dynsym" : ".symtab"), @@ -2226,8 +2233,9 @@ size_t SymbolTableBaseSection::getSymbolIndex(const Symbol &sym) { } template -SymbolTableSection::SymbolTableSection(StringTableSection &strTabSec) - : SymbolTableBaseSection(strTabSec) { +SymbolTableSection::SymbolTableSection(Ctx &ctx, + StringTableSection &strTabSec) + : SymbolTableBaseSection(ctx, strTabSec) { this->entsize = sizeof(Elf_Sym); } @@ -2327,8 +2335,8 @@ void SymbolTableSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -SymtabShndxSection::SymtabShndxSection() - : SyntheticSection(0, SHT_SYMTAB_SHNDX, 4, ".symtab_shndx") { +SymtabShndxSection::SymtabShndxSection(Ctx &ctx) + : SyntheticSection(ctx, 0, SHT_SYMTAB_SHNDX, 4, ".symtab_shndx") { this->entsize = 4; } @@ -2396,9 +2404,9 @@ size_t SymtabShndxSection::getSize(Ctx &ctx) const { // DSOs very quickly. If you are sure that your dynamic linker knows // about .gnu.hash, you want to specify --hash-style=gnu. Otherwise, a // safe bet is to specify --hash-style=both for backward compatibility. 
-GnuHashTableSection::GnuHashTableSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_HASH, ctx.arg.wordsize, ".gnu.hash") { -} +GnuHashTableSection::GnuHashTableSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_HASH, ctx.arg.wordsize, + ".gnu.hash") {} void GnuHashTableSection::finalizeContents(Ctx &) { if (OutputSection *sec = getPartition().dynSymTab->getParent()) @@ -2505,8 +2513,8 @@ void GnuHashTableSection::addSymbols(SmallVectorImpl &v) { v.push_back({ent.sym, ent.strTabOffset}); } -HashTableSection::HashTableSection() - : SyntheticSection(SHF_ALLOC, SHT_HASH, 4, ".hash") { +HashTableSection::HashTableSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_HASH, 4, ".hash") { this->entsize = 4; } @@ -2545,8 +2553,9 @@ void HashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -PltSection::PltSection() - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt"), +PltSection::PltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, + ".plt"), headerSize(ctx.target->pltHeaderSize) { // On PowerPC, this section contains lazy symbol resolvers. if (ctx.arg.emachine == EM_PPC64) { @@ -2606,8 +2615,9 @@ void PltSection::addSymbols() { } } -IpltSection::IpltSection() - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".iplt") { +IpltSection::IpltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, + ".iplt") { if (ctx.arg.emachine == EM_PPC || ctx.arg.emachine == EM_PPC64) { name = ".glink"; addralign = 4; @@ -2641,7 +2651,7 @@ void IpltSection::addSymbols() { } } -PPC32GlinkSection::PPC32GlinkSection() { +PPC32GlinkSection::PPC32GlinkSection(Ctx &ctx) : PltSection(ctx) { name = ".glink"; addralign = 4; } @@ -2712,8 +2722,9 @@ size_t PPC32GlinkSection::getSize(Ctx &ctx) const { // // That said, the 2-PLT scheme is a part of the ABI, debuggers and other tools // depend on it, so we implement the ABI. 
-IBTPltSection::IBTPltSection() - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt") {} +IBTPltSection::IBTPltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, + ".plt") {} void IBTPltSection::writeTo(Ctx &ctx, uint8_t *buf) { ctx.target->writeIBTPlt(buf, ctx.in.plt->getNumEntries()); @@ -2728,9 +2739,9 @@ bool IBTPltSection::isNeeded(Ctx &ctx) const { return ctx.in.plt->getNumEntries() > 0; } -RelroPaddingSection::RelroPaddingSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_NOBITS, 1, ".relro_padding") { -} +RelroPaddingSection::RelroPaddingSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_NOBITS, 1, + ".relro_padding") {} // The string hash function for .gdb_index. static uint32_t computeGdbHash(StringRef s) { @@ -2742,8 +2753,8 @@ static uint32_t computeGdbHash(StringRef s) { // 4-byte alignment ensures that values in the hash lookup table and the name // table are aligned. -DebugNamesBaseSection::DebugNamesBaseSection() - : SyntheticSection(0, SHT_PROGBITS, 4, ".debug_names") {} +DebugNamesBaseSection::DebugNamesBaseSection(Ctx &ctx) + : SyntheticSection(ctx, 0, SHT_PROGBITS, 4, ".debug_names") {} // Get the size of the .debug_names section header in bytes for DWARF32: static uint32_t getDebugNamesHeaderSize(uint32_t augmentationStringSize) { @@ -3173,7 +3184,9 @@ void DebugNamesBaseSection::init( hdr.UnitLength = size - 4; } -template DebugNamesSection::DebugNamesSection() { +template +DebugNamesSection::DebugNamesSection(Ctx &ctx) + : DebugNamesBaseSection(ctx) { init([](InputFile *f, InputChunk &inputChunk, OutputChunk &chunk) { auto *file = cast>(f); DWARFContext dwarf(std::make_unique>(file)); @@ -3337,8 +3350,8 @@ void DebugNamesSection::writeTo(Ctx &ctx, uint8_t *buf) { assert(uint64_t(buf - beginBuf) == size); } -GdbIndexSection::GdbIndexSection() - : SyntheticSection(0, SHT_PROGBITS, 1, ".gdb_index") {} +GdbIndexSection::GdbIndexSection(Ctx &ctx) + : 
SyntheticSection(ctx, 0, SHT_PROGBITS, 1, ".gdb_index") {} // Returns the desired size of an on-disk hash table for a .gdb_index section. // There's a tradeoff between size and collision rate. We aim 75% utilization. @@ -3500,7 +3513,7 @@ createSymbols( // Returns a newly-created .gdb_index section. template -std::unique_ptr GdbIndexSection::create() { +std::unique_ptr GdbIndexSection::create(Ctx &) { llvm::TimeTraceScope timeScope("Create gdb index"); // Collect InputFiles with .debug_info. See the comment in @@ -3546,7 +3559,7 @@ std::unique_ptr GdbIndexSection::create() { nameAttrs[i] = readPubNamesAndTypes(dobj, chunks[i].compilationUnits); }); - auto ret = std::make_unique(); + auto ret = std::make_unique(ctx); ret->chunks = std::move(chunks); std::tie(ret->symbols, ret->size) = createSymbols(nameAttrs, ret->chunks); @@ -3630,8 +3643,8 @@ void GdbIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { bool GdbIndexSection::isNeeded(Ctx &) const { return !chunks.empty(); } -EhFrameHeader::EhFrameHeader() - : SyntheticSection(SHF_ALLOC, SHT_PROGBITS, 4, ".eh_frame_hdr") {} +EhFrameHeader::EhFrameHeader(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".eh_frame_hdr") {} void EhFrameHeader::writeTo(Ctx &ctx, uint8_t *buf) { // Unlike most sections, the EhFrameHeader section is written while writing @@ -3675,8 +3688,8 @@ bool EhFrameHeader::isNeeded(Ctx &ctx) const { return isLive() && getPartition().ehFrame->isNeeded(ctx); } -VersionDefinitionSection::VersionDefinitionSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_verdef, sizeof(uint32_t), +VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_verdef, sizeof(uint32_t), ".gnu.version_d") {} StringRef VersionDefinitionSection::getFileDefName() { @@ -3737,8 +3750,8 @@ size_t VersionDefinitionSection::getSize(Ctx &ctx) const { } // .gnu.version is a table where each entry is 2 byte long. 
-VersionTableSection::VersionTableSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_versym, sizeof(uint16_t), +VersionTableSection::VersionTableSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_versym, sizeof(uint16_t), ".gnu.version") { this->entsize = 2; } @@ -3788,8 +3801,8 @@ void elf::addVerneed(Symbol *ss) { } template -VersionNeedSection::VersionNeedSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_verneed, sizeof(uint32_t), +VersionNeedSection::VersionNeedSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_verneed, sizeof(uint32_t), ".gnu.version_r") {} template void VersionNeedSection::finalizeContents(Ctx &) { @@ -3872,9 +3885,9 @@ void MergeSyntheticSection::addSection(MergeInputSection *ms) { addralign = std::max(addralign, ms->addralign); } -MergeTailSection::MergeTailSection(StringRef name, uint32_t type, +MergeTailSection::MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t alignment) - : MergeSyntheticSection(name, type, flags, alignment), + : MergeSyntheticSection(ctx, name, type, flags, alignment), builder(StringTableBuilder::RAW, llvm::Align(alignment)) {} size_t MergeTailSection::getSize(Ctx &) const { return builder.getSize(); } @@ -3997,12 +4010,12 @@ void elf::combineEhSections(Ctx &ctx) { }); } -MipsRldMapSection::MipsRldMapSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, ctx.arg.wordsize, - ".rld_map") {} +MipsRldMapSection::MipsRldMapSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, + ctx.arg.wordsize, ".rld_map") {} -ARMExidxSyntheticSection::ARMExidxSyntheticSection() - : SyntheticSection(SHF_ALLOC | SHF_LINK_ORDER, SHT_ARM_EXIDX, +ARMExidxSyntheticSection::ARMExidxSyntheticSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_LINK_ORDER, SHT_ARM_EXIDX, ctx.arg.wordsize, ".ARM.exidx") {} static InputSection *findExidxSection(InputSection *isec) { @@ -4225,8 +4238,8 @@ bool ARMExidxSyntheticSection::isNeeded(Ctx &) const { 
[](InputSection *isec) { return isec->isLive(); }); } -ThunkSection::ThunkSection(OutputSection *os, uint64_t off) - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, +ThunkSection::ThunkSection(Ctx &ctx, OutputSection *os, uint64_t off) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, ctx.arg.emachine == EM_PPC64 ? 16 : 4, ".text.thunk") { this->parent = os; this->outSecOff = off; @@ -4269,8 +4282,8 @@ bool ThunkSection::assignOffsets() { return changed; } -PPC32Got2Section::PPC32Got2Section() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 4, ".got2") {} +PPC32Got2Section::PPC32Got2Section(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 4, ".got2") {} bool PPC32Got2Section::isNeeded(Ctx &) const { // See the comment below. This is not needed if there is no other @@ -4302,8 +4315,8 @@ void PPC32Got2Section::finalizeContents(Ctx &) { // directly in the binary so the section has type SHT_PROGBITS. If linking // position-independent code the section has type SHT_NOBITS since it will be // allocated and filled in by the dynamic linker. -PPC64LongBranchTargetSection::PPC64LongBranchTargetSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, +PPC64LongBranchTargetSection::PPC64LongBranchTargetSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, ctx.arg.isPic ? 
SHT_NOBITS : SHT_PROGBITS, 8, ".branch_lt") {} @@ -4415,8 +4428,8 @@ template void elf::writePhdrs(uint8_t *buf, Partition &part) { } template -PartitionElfHeaderSection::PartitionElfHeaderSection() - : SyntheticSection(SHF_ALLOC, SHT_LLVM_PART_EHDR, 1, "") {} +PartitionElfHeaderSection::PartitionElfHeaderSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_EHDR, 1, "") {} template size_t PartitionElfHeaderSection::getSize(Ctx &ctx) const { @@ -4433,8 +4446,8 @@ void PartitionElfHeaderSection::writeTo(Ctx &ctx, uint8_t *buf) { } template -PartitionProgramHeadersSection::PartitionProgramHeadersSection() - : SyntheticSection(SHF_ALLOC, SHT_LLVM_PART_PHDR, 1, ".phdrs") {} +PartitionProgramHeadersSection::PartitionProgramHeadersSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_PHDR, 1, ".phdrs") {} template size_t PartitionProgramHeadersSection::getSize(Ctx &ctx) const { @@ -4446,8 +4459,8 @@ void PartitionProgramHeadersSection::writeTo(Ctx &ctx, uint8_t *buf) { writePhdrs(buf, getPartition()); } -PartitionIndexSection::PartitionIndexSection() - : SyntheticSection(SHF_ALLOC, SHT_PROGBITS, 4, ".rodata") {} +PartitionIndexSection::PartitionIndexSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".rodata") {} size_t PartitionIndexSection::getSize(Ctx &ctx) const { return 12 * (ctx.partitions.size() - 1); @@ -4680,18 +4693,20 @@ template void elf::createSyntheticSections(Ctx &ctx) { auto add = [&](SyntheticSection &sec) { ctx.inputSections.push_back(&sec); }; if (ctx.arg.zSectionHeader) - ctx.in.shStrTab = std::make_unique(".shstrtab", false); + ctx.in.shStrTab = + std::make_unique(ctx, ".shstrtab", false); ctx.out.programHeaders = make("", 0, SHF_ALLOC); ctx.out.programHeaders->addralign = ctx.arg.wordsize; if (ctx.arg.strip != StripPolicy::All) { - ctx.in.strTab = std::make_unique(".strtab", false); - ctx.in.symTab = std::make_unique>(*ctx.in.strTab); - ctx.in.symTabShndx = std::make_unique(); + ctx.in.strTab = 
std::make_unique(ctx, ".strtab", false); + ctx.in.symTab = + std::make_unique>(ctx, *ctx.in.strTab); + ctx.in.symTabShndx = std::make_unique(ctx); } - ctx.in.bss = std::make_unique(".bss", 0, 1); + ctx.in.bss = std::make_unique(ctx, ".bss", 0, 1); add(*ctx.in.bss); // If there is a SECTIONS command and a .data.rel.ro section name use name @@ -4700,13 +4715,13 @@ template void elf::createSyntheticSections(Ctx &ctx) { bool hasDataRelRo = ctx.script->hasSectionsCommand && findSection(".data.rel.ro"); ctx.in.bssRelRo = std::make_unique( - hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1); + ctx, hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1); add(*ctx.in.bssRelRo); // Add MIPS-specific sections. if (ctx.arg.emachine == EM_MIPS) { if (!ctx.arg.shared && ctx.arg.hasDynSymTab) { - ctx.in.mipsRldMap = std::make_unique(); + ctx.in.mipsRldMap = std::make_unique(ctx); add(*ctx.in.mipsRldMap); } if ((ctx.in.mipsAbiFlags = MipsAbiFlagsSection::create(ctx))) @@ -4727,68 +4742,68 @@ template void elf::createSyntheticSections(Ctx &ctx) { }; if (!part.name.empty()) { - part.elfHeader = std::make_unique>(); + part.elfHeader = std::make_unique>(ctx); part.elfHeader->name = part.name; add(*part.elfHeader); part.programHeaders = - std::make_unique>(); + std::make_unique>(ctx); add(*part.programHeaders); } if (ctx.arg.buildId != BuildIdKind::None) { - part.buildId = std::make_unique(); + part.buildId = std::make_unique(ctx); add(*part.buildId); } // dynSymTab is always present to simplify sym->includeInDynsym() in // finalizeSections. 
- part.dynStrTab = std::make_unique(".dynstr", true); + part.dynStrTab = std::make_unique(ctx, ".dynstr", true); part.dynSymTab = - std::make_unique>(*part.dynStrTab); + std::make_unique>(ctx, *part.dynStrTab); if (ctx.arg.relocatable) continue; - part.dynamic = std::make_unique>(); + part.dynamic = std::make_unique>(ctx); if (hasMemtag()) { - part.memtagAndroidNote = std::make_unique(); + part.memtagAndroidNote = std::make_unique(ctx); add(*part.memtagAndroidNote); if (canHaveMemtagGlobals()) { part.memtagGlobalDescriptors = - std::make_unique(); + std::make_unique(ctx); add(*part.memtagGlobalDescriptors); } } if (ctx.arg.androidPackDynRelocs) part.relaDyn = std::make_unique>( - relaDynName, threadCount); + ctx, relaDynName, threadCount); else part.relaDyn = std::make_unique>( - relaDynName, ctx.arg.zCombreloc, threadCount); + ctx, relaDynName, ctx.arg.zCombreloc, threadCount); if (ctx.arg.hasDynSymTab) { add(*part.dynSymTab); - part.verSym = std::make_unique(); + part.verSym = std::make_unique(ctx); add(*part.verSym); if (!namedVersionDefs(ctx).empty()) { - part.verDef = std::make_unique(); + part.verDef = std::make_unique(ctx); add(*part.verDef); } - part.verNeed = std::make_unique>(); + part.verNeed = std::make_unique>(ctx); add(*part.verNeed); if (ctx.arg.gnuHash) { - part.gnuHashTab = std::make_unique(); + part.gnuHashTab = std::make_unique(ctx); add(*part.gnuHashTab); } if (ctx.arg.sysvHash) { - part.hashTab = std::make_unique(); + part.hashTab = std::make_unique(ctx); add(*part.hashTab); } @@ -4798,28 +4813,28 @@ template void elf::createSyntheticSections(Ctx &ctx) { add(*part.relaDyn); if (ctx.arg.relrPackDynRelocs) { - part.relrDyn = std::make_unique>(threadCount); + part.relrDyn = std::make_unique>(ctx, threadCount); add(*part.relrDyn); part.relrAuthDyn = std::make_unique>( - threadCount, /*isAArch64Auth=*/true); + ctx, threadCount, /*isAArch64Auth=*/true); add(*part.relrAuthDyn); } if (ctx.arg.ehFrameHdr) { - part.ehFrameHdr = std::make_unique(); + 
part.ehFrameHdr = std::make_unique(ctx); add(*part.ehFrameHdr); } - part.ehFrame = std::make_unique(); + part.ehFrame = std::make_unique(ctx); add(*part.ehFrame); if (ctx.arg.emachine == EM_ARM) { // This section replaces all the individual .ARM.exidx InputSections. - part.armExidx = std::make_unique(); + part.armExidx = std::make_unique(ctx); add(*part.armExidx); } if (!ctx.arg.packageMetadata.empty()) { - part.packageMetadataNote = std::make_unique(); + part.packageMetadataNote = std::make_unique(ctx); add(*part.packageMetadataNote); } } @@ -4829,11 +4844,11 @@ template void elf::createSyntheticSections(Ctx &ctx) { // so that it is sorted after all other partitions. It also has other // special handling (see createPhdrs() and combineEhSections()). ctx.in.partEnd = - std::make_unique(".part.end", ctx.arg.maxPageSize, 1); + std::make_unique(ctx, ".part.end", ctx.arg.maxPageSize, 1); ctx.in.partEnd->partition = 255; add(*ctx.in.partEnd); - ctx.in.partIndex = std::make_unique(); + ctx.in.partIndex = std::make_unique(ctx); addOptionalRegular("__part_index_begin", ctx.in.partIndex.get(), 0); addOptionalRegular("__part_index_end", ctx.in.partIndex.get(), ctx.in.partIndex->getSize(ctx)); @@ -4843,34 +4858,34 @@ template void elf::createSyntheticSections(Ctx &ctx) { // Add .got. MIPS' .got is so different from the other archs, // it has its own class. 
if (ctx.arg.emachine == EM_MIPS) { - ctx.in.mipsGot = std::make_unique(); + ctx.in.mipsGot = std::make_unique(ctx); add(*ctx.in.mipsGot); } else { - ctx.in.got = std::make_unique(); + ctx.in.got = std::make_unique(ctx); add(*ctx.in.got); } if (ctx.arg.emachine == EM_PPC) { - ctx.in.ppc32Got2 = std::make_unique(); + ctx.in.ppc32Got2 = std::make_unique(ctx); add(*ctx.in.ppc32Got2); } if (ctx.arg.emachine == EM_PPC64) { ctx.in.ppc64LongBranchTarget = - std::make_unique(); + std::make_unique(ctx); add(*ctx.in.ppc64LongBranchTarget); } - ctx.in.gotPlt = std::make_unique(); + ctx.in.gotPlt = std::make_unique(ctx); add(*ctx.in.gotPlt); - ctx.in.igotPlt = std::make_unique(); + ctx.in.igotPlt = std::make_unique(ctx); add(*ctx.in.igotPlt); // Add .relro_padding if DATA_SEGMENT_RELRO_END is used; otherwise, add the // section in the absence of PHDRS/SECTIONS commands. if (ctx.arg.zRelro && ((ctx.script->phdrsCommands.empty() && !ctx.script->hasSectionsCommand) || ctx.script->seenRelroEnd)) { - ctx.in.relroPadding = std::make_unique(); + ctx.in.relroPadding = std::make_unique(ctx); add(*ctx.in.relroPadding); } @@ -4891,34 +4906,34 @@ template void elf::createSyntheticSections(Ctx &ctx) { // We always need to add rel[a].plt to output if it has entries. // Even for static linking it can contain R_[*]_IRELATIVE relocations. ctx.in.relaPlt = std::make_unique>( - ctx.arg.isRela ? ".rela.plt" : ".rel.plt", /*sort=*/false, + ctx, ctx.arg.isRela ? 
".rela.plt" : ".rel.plt", /*sort=*/false, /*threadCount=*/1); add(*ctx.in.relaPlt); if ((ctx.arg.emachine == EM_386 || ctx.arg.emachine == EM_X86_64) && (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)) { - ctx.in.ibtPlt = std::make_unique(); + ctx.in.ibtPlt = std::make_unique(ctx); add(*ctx.in.ibtPlt); } if (ctx.arg.emachine == EM_PPC) - ctx.in.plt = std::make_unique(); + ctx.in.plt = std::make_unique(ctx); else - ctx.in.plt = std::make_unique(); + ctx.in.plt = std::make_unique(ctx); add(*ctx.in.plt); - ctx.in.iplt = std::make_unique(); + ctx.in.iplt = std::make_unique(ctx); add(*ctx.in.iplt); if (ctx.arg.andFeatures || !ctx.aarch64PauthAbiCoreInfo.empty()) - add(*make()); + add(*make(ctx)); if (ctx.arg.debugNames) { - ctx.in.debugNames = std::make_unique>(); + ctx.in.debugNames = std::make_unique>(ctx); add(*ctx.in.debugNames); } if (ctx.arg.gdbIndex) { - ctx.in.gdbIndex = GdbIndexSection::create(); + ctx.in.gdbIndex = GdbIndexSection::create(ctx); add(*ctx.in.gdbIndex); } @@ -4928,7 +4943,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { // is irrelevant these days. Stack area should always be non-executable // by default. So we emit this section unconditionally. if (ctx.arg.relocatable) - add(*make()); + add(*make(ctx)); if (ctx.in.symTab) add(*ctx.in.symTab); diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index a40e091500545a..b89860dd21371a 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -50,7 +50,7 @@ struct CieRecord { // Section for .eh_frame. 
class EhFrameSection final : public SyntheticSection { public: - EhFrameSection(); + EhFrameSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; void finalizeContents(Ctx &) override; bool isNeeded(Ctx &) const override { return !sections.empty(); } @@ -104,7 +104,7 @@ class EhFrameSection final : public SyntheticSection { class GotSection final : public SyntheticSection { public: - GotSection(); + GotSection(Ctx &); size_t getSize(Ctx &ctx) const override { return size; } void finalizeContents(Ctx &) override; bool isNeeded(Ctx &) const override; @@ -136,15 +136,16 @@ class GotSection final : public SyntheticSection { // .note.GNU-stack section. class GnuStackSection : public SyntheticSection { public: - GnuStackSection() - : SyntheticSection(0, llvm::ELF::SHT_PROGBITS, 1, ".note.GNU-stack") {} + GnuStackSection(Ctx &ctx) + : SyntheticSection(ctx, 0, llvm::ELF::SHT_PROGBITS, 1, + ".note.GNU-stack") {} void writeTo(Ctx &, uint8_t *buf) override {} size_t getSize(Ctx &ctx) const override { return 0; } }; class GnuPropertySection final : public SyntheticSection { public: - GnuPropertySection(); + GnuPropertySection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; }; @@ -156,7 +157,7 @@ class BuildIdSection : public SyntheticSection { public: const size_t hashSize; - BuildIdSection(); + BuildIdSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return headerSize + hashSize; } void writeBuildId(llvm::ArrayRef buf); @@ -171,7 +172,7 @@ class BuildIdSection : public SyntheticSection { // respectively. 
class BssSection final : public SyntheticSection { public: - BssSection(StringRef name, uint64_t size, uint32_t addralign); + BssSection(Ctx &, StringRef name, uint64_t size, uint32_t addralign); void writeTo(Ctx &, uint8_t *) override {} bool isNeeded(Ctx &) const override { return size != 0; } size_t getSize(Ctx &ctx) const override { return size; } @@ -182,7 +183,7 @@ class BssSection final : public SyntheticSection { class MipsGotSection final : public SyntheticSection { public: - MipsGotSection(); + MipsGotSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } bool updateAllocSize(Ctx &) override; @@ -359,7 +360,7 @@ class MipsGotSection final : public SyntheticSection { class GotPltSection final : public SyntheticSection { public: - GotPltSection(); + GotPltSection(Ctx &); void addEntry(Symbol &sym); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -379,7 +380,7 @@ class GotPltSection final : public SyntheticSection { // on ARM the IgotPltSection will immediately follow the GotSection. 
class IgotPltSection final : public SyntheticSection { public: - IgotPltSection(); + IgotPltSection(Ctx &); void addEntry(Symbol &sym); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -391,7 +392,7 @@ class IgotPltSection final : public SyntheticSection { class StringTableSection final : public SyntheticSection { public: - StringTableSection(StringRef name, bool dynamic); + StringTableSection(Ctx &, StringRef name, bool dynamic); unsigned addString(StringRef s, bool hashIt = true); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -483,7 +484,7 @@ template class DynamicSection final : public SyntheticSection { LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) public: - DynamicSection(); + DynamicSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -495,9 +496,9 @@ template class DynamicSection final : public SyntheticSection { class RelocationBaseSection : public SyntheticSection { public: - RelocationBaseSection(StringRef name, uint32_t type, int32_t dynamicTag, - int32_t sizeDynamicTag, bool combreloc, - unsigned concurrency); + RelocationBaseSection(Ctx &, StringRef name, uint32_t type, + int32_t dynamicTag, int32_t sizeDynamicTag, + bool combreloc, unsigned concurrency); /// Add a dynamic relocation without writing an addend to the output section. /// This overload can be used if the addends are written directly instead of /// using relocations on the input section (e.g. MipsGotSection::writeTo()). 
@@ -578,7 +579,8 @@ class RelocationSection final : public RelocationBaseSection { using Elf_Rela = typename ELFT::Rela; public: - RelocationSection(StringRef name, bool combreloc, unsigned concurrency); + RelocationSection(Ctx &, StringRef name, bool combreloc, + unsigned concurrency); void writeTo(Ctx &, uint8_t *buf) override; }; @@ -588,7 +590,7 @@ class AndroidPackedRelocationSection final : public RelocationBaseSection { using Elf_Rela = typename ELFT::Rela; public: - AndroidPackedRelocationSection(StringRef name, unsigned concurrency); + AndroidPackedRelocationSection(Ctx &, StringRef name, unsigned concurrency); bool updateAllocSize(Ctx &) override; size_t getSize(Ctx &ctx) const override { return relocData.size(); } @@ -611,7 +613,7 @@ struct RelativeReloc { class RelrBaseSection : public SyntheticSection { public: - RelrBaseSection(unsigned concurrency, bool isAArch64Auth = false); + RelrBaseSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); void mergeRels(); bool isNeeded(Ctx &) const override { return !relocs.empty() || @@ -629,7 +631,7 @@ template class RelrSection final : public RelrBaseSection { using Elf_Relr = typename ELFT::Relr; public: - RelrSection(unsigned concurrency, bool isAArch64Auth = false); + RelrSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); bool updateAllocSize(Ctx &) override; size_t getSize(Ctx &ctx) const override { @@ -650,7 +652,7 @@ struct SymbolTableEntry { class SymbolTableBaseSection : public SyntheticSection { public: - SymbolTableBaseSection(StringTableSection &strTabSec); + SymbolTableBaseSection(Ctx &ctx, StringTableSection &strTabSec); void finalizeContents(Ctx &) override; size_t getSize(Ctx &ctx) const override { return getNumSymbols() * entsize; } void addSymbol(Symbol *sym); @@ -676,13 +678,13 @@ class SymbolTableSection final : public SymbolTableBaseSection { using Elf_Sym = typename ELFT::Sym; public: - SymbolTableSection(StringTableSection &strTabSec); + SymbolTableSection(Ctx &, 
StringTableSection &strTabSec); void writeTo(Ctx &, uint8_t *buf) override; }; class SymtabShndxSection final : public SyntheticSection { public: - SymtabShndxSection(); + SymtabShndxSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -694,7 +696,7 @@ class SymtabShndxSection final : public SyntheticSection { // https://blogs.oracle.com/ali/entry/gnu_hash_elf_sections class GnuHashTableSection final : public SyntheticSection { public: - GnuHashTableSection(); + GnuHashTableSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -722,7 +724,7 @@ class GnuHashTableSection final : public SyntheticSection { class HashTableSection final : public SyntheticSection { public: - HashTableSection(); + HashTableSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -744,7 +746,7 @@ class HashTableSection final : public SyntheticSection { // target (BIND_NOW) or a .plt entry. class PltSection : public SyntheticSection { public: - PltSection(); + PltSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; bool isNeeded(Ctx &) const override; @@ -765,7 +767,7 @@ class IpltSection final : public SyntheticSection { SmallVector entries; public: - IpltSection(); + IpltSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; bool isNeeded(Ctx &) const override { return !entries.empty(); } @@ -775,7 +777,7 @@ class IpltSection final : public SyntheticSection { class PPC32GlinkSection : public PltSection { public: - PPC32GlinkSection(); + PPC32GlinkSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -786,7 +788,7 @@ class PPC32GlinkSection : public PltSection { // This is x86-only. 
class IBTPltSection : public SyntheticSection { public: - IBTPltSection(); + IBTPltSection(Ctx &); void writeTo(Ctx &, uint8_t *Buf) override; bool isNeeded(Ctx &) const override; size_t getSize(Ctx &) const override; @@ -797,7 +799,7 @@ class IBTPltSection : public SyntheticSection { // pages in the PT_LOAD segment is covered by at least one section. class RelroPaddingSection final : public SyntheticSection { public: - RelroPaddingSection(); + RelroPaddingSection(Ctx &); size_t getSize(Ctx &ctx) const override { return 0; } void writeTo(Ctx &, uint8_t *buf) override {} }; @@ -872,7 +874,7 @@ class DebugNamesBaseSection : public SyntheticSection { SmallVector compUnits; }; - DebugNamesBaseSection(); + DebugNamesBaseSection(Ctx &); size_t getSize(Ctx &ctx) const override { return size; } bool isNeeded(Ctx &) const override { return numChunks > 0; } @@ -916,7 +918,7 @@ class DebugNamesBaseSection : public SyntheticSection { template class DebugNamesSection final : public DebugNamesBaseSection { public: - DebugNamesSection(); + DebugNamesSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; @@ -963,8 +965,9 @@ class GdbIndexSection final : public SyntheticSection { uint32_t cuVectorOff; }; - GdbIndexSection(); - template static std::unique_ptr create(); + GdbIndexSection(Ctx &); + template + static std::unique_ptr create(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } bool isNeeded(Ctx &) const override; @@ -1002,7 +1005,7 @@ class GdbIndexSection final : public SyntheticSection { // http://www.airs.com/blog/archives/462 (".eh_frame_hdr") class EhFrameHeader final : public SyntheticSection { public: - EhFrameHeader(); + EhFrameHeader(Ctx &); void write(); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1019,7 +1022,7 @@ class EhFrameHeader final : public SyntheticSection { // followed by an array of Elf_Verdaux structures. 
class VersionDefinitionSection final : public SyntheticSection { public: - VersionDefinitionSection(); + VersionDefinitionSection(Ctx &); void finalizeContents(Ctx &) override; size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -1041,7 +1044,7 @@ class VersionDefinitionSection final : public SyntheticSection { // the own object or in any of the dependencies. class VersionTableSection final : public SyntheticSection { public: - VersionTableSection(); + VersionTableSection(Ctx &); void finalizeContents(Ctx &) override; size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -1072,7 +1075,7 @@ class VersionNeedSection final : public SyntheticSection { SmallVector verneeds; public: - VersionNeedSection(); + VersionNeedSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1089,14 +1092,14 @@ class MergeSyntheticSection : public SyntheticSection { SmallVector sections; protected: - MergeSyntheticSection(StringRef name, uint32_t type, uint64_t flags, + MergeSyntheticSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign) - : SyntheticSection(flags, type, addralign, name) {} + : SyntheticSection(ctx, flags, type, addralign, name) {} }; class MergeTailSection final : public MergeSyntheticSection { public: - MergeTailSection(StringRef name, uint32_t type, uint64_t flags, + MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign); size_t getSize(Ctx &) const override; @@ -1109,9 +1112,9 @@ class MergeTailSection final : public MergeSyntheticSection { class MergeNoTailSection final : public MergeSyntheticSection { public: - MergeNoTailSection(StringRef name, uint32_t type, uint64_t flags, + MergeNoTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign) - : MergeSyntheticSection(name, type, flags, addralign) {} + : 
MergeSyntheticSection(ctx, name, type, flags, addralign) {} size_t getSize(Ctx &ctx) const override { return size; } void writeTo(Ctx &, uint8_t *buf) override; @@ -1145,7 +1148,7 @@ class MipsAbiFlagsSection final : public SyntheticSection { public: static std::unique_ptr create(Ctx &); - MipsAbiFlagsSection(Elf_Mips_ABIFlags flags); + MipsAbiFlagsSection(Ctx &, Elf_Mips_ABIFlags flags); size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_ABIFlags); } void writeTo(Ctx &, uint8_t *buf) override; @@ -1161,7 +1164,7 @@ template class MipsOptionsSection final : public SyntheticSection { public: static std::unique_ptr> create(Ctx &); - MipsOptionsSection(Elf_Mips_RegInfo reginfo); + MipsOptionsSection(Ctx &, Elf_Mips_RegInfo reginfo); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { @@ -1179,7 +1182,7 @@ template class MipsReginfoSection final : public SyntheticSection { public: static std::unique_ptr create(Ctx &); - MipsReginfoSection(Elf_Mips_RegInfo reginfo); + MipsReginfoSection(Ctx &, Elf_Mips_RegInfo reginfo); size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_RegInfo); } void writeTo(Ctx &, uint8_t *buf) override; @@ -1193,7 +1196,7 @@ template class MipsReginfoSection final : public SyntheticSection { // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf class MipsRldMapSection final : public SyntheticSection { public: - MipsRldMapSection(); + MipsRldMapSection(Ctx &); size_t getSize(Ctx &ctx) const override { return ctx.arg.wordsize; } void writeTo(Ctx &, uint8_t *buf) override {} }; @@ -1234,7 +1237,7 @@ class MipsRldMapSection final : public SyntheticSection { // either find the .ARM.exidx section or know that we need to generate one. class ARMExidxSyntheticSection : public SyntheticSection { public: - ARMExidxSyntheticSection(); + ARMExidxSyntheticSection(Ctx &); // Add an input section to the ARMExidxSyntheticSection. 
Returns whether the // section needs to be removed from the main input section list. @@ -1281,7 +1284,7 @@ class ARMExidxSyntheticSection : public SyntheticSection { class ThunkSection final : public SyntheticSection { public: // ThunkSection in OS, with desired outSecOff of Off - ThunkSection(OutputSection *os, uint64_t off); + ThunkSection(Ctx &, OutputSection *os, uint64_t off); // Add a newly created Thunk to this container: // Thunk is given offset from start of this InputSection @@ -1332,7 +1335,7 @@ class ArmCmseSGSection final : public SyntheticSection { // synthesize PLT entries for PPC32 Secure PLT ABI. class PPC32Got2Section final : public SyntheticSection { public: - PPC32Got2Section(); + PPC32Got2Section(Ctx &); size_t getSize(Ctx &ctx) const override { return 0; } bool isNeeded(Ctx &) const override; void finalizeContents(Ctx &) override; @@ -1346,7 +1349,7 @@ class PPC32Got2Section final : public SyntheticSection { // filled in by the dynamic linker. class PPC64LongBranchTargetSection final : public SyntheticSection { public: - PPC64LongBranchTargetSection(); + PPC64LongBranchTargetSection(Ctx &); uint64_t getEntryVA(const Symbol *sym, int64_t addend); std::optional addEntry(const Symbol *sym, int64_t addend); size_t getSize(Ctx &) const override; @@ -1363,7 +1366,7 @@ class PPC64LongBranchTargetSection final : public SyntheticSection { template class PartitionElfHeaderSection final : public SyntheticSection { public: - PartitionElfHeaderSection(); + PartitionElfHeaderSection(Ctx &); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; }; @@ -1371,14 +1374,14 @@ class PartitionElfHeaderSection final : public SyntheticSection { template class PartitionProgramHeadersSection final : public SyntheticSection { public: - PartitionProgramHeadersSection(); + PartitionProgramHeadersSection(Ctx &); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; }; class PartitionIndexSection final : public 
SyntheticSection { public: - PartitionIndexSection(); + PartitionIndexSection(Ctx &); size_t getSize(Ctx &) const override; void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; @@ -1389,8 +1392,8 @@ class PartitionIndexSection final : public SyntheticSection { // https://cs.android.com/android/platform/superproject/+/master:bionic/libc/bionic/libc_init_static.cpp;drc=9425b16978f9c5aa8f2c50c873db470819480d1d;l=192 class MemtagAndroidNote final : public SyntheticSection { public: - MemtagAndroidNote() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, + MemtagAndroidNote(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.android.memtag") {} void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1398,8 +1401,8 @@ class MemtagAndroidNote final : public SyntheticSection { class PackageMetadataNote final : public SyntheticSection { public: - PackageMetadataNote() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, + PackageMetadataNote(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.package") {} void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1407,8 +1410,8 @@ class PackageMetadataNote final : public SyntheticSection { class MemtagGlobalDescriptors final : public SyntheticSection { public: - MemtagGlobalDescriptors() - : SyntheticSection(llvm::ELF::SHF_ALLOC, + MemtagGlobalDescriptors(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_AARCH64_MEMTAG_GLOBALS_DYNAMIC, /*alignment=*/4, ".memtag.globals.dynamic") {} void writeTo(Ctx &, uint8_t *buf) override; From 1fd79f105da64cec7986807c1d9c4896bd39dafa Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 08:08:28 +0200 Subject: [PATCH 129/177] [clang][bytecode] Check number of addcarry/subborrow args (#111952) Apparently this can fail as well. 
--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 1765193f5bebbc..74e9e1cf629372 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1287,7 +1287,7 @@ static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { - if (!Call->getArg(0)->getType()->isIntegerType() || + if (Call->getNumArgs() != 4 || !Call->getArg(0)->getType()->isIntegerType() || !Call->getArg(1)->getType()->isIntegerType() || !Call->getArg(2)->getType()->isIntegerType()) return false; From d91c103a107ab16b59c1bb67687233a1100d7ecf Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 23:28:25 -0700 Subject: [PATCH 130/177] [ELF] Pass Ctx & to SyntheticSections --- lld/ELF/InputSection.h | 4 +++- lld/ELF/SyntheticSections.cpp | 18 ++++++++++-------- lld/ELF/SyntheticSections.h | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 543ff4db3c3270..bf2cf09f2921b2 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -472,10 +472,12 @@ static_assert(sizeof(InputSection) <= 160, "InputSection is too big"); class SyntheticSection : public InputSection { public: + Ctx &ctx; SyntheticSection(Ctx &ctx, uint64_t flags, uint32_t type, uint32_t addralign, StringRef name) : InputSection(ctx.internalFile, flags, type, addralign, {}, name, - InputSectionBase::Synthetic) {} + InputSectionBase::Synthetic), + ctx(ctx) {} virtual ~SyntheticSection() = default; virtual size_t getSize(Ctx &) const = 0; diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index a65c137762ce63..5d62f089e40848 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -59,7 +59,7 @@ using llvm::support::endian::write64le; 
constexpr size_t MergeNoTailSection::numShards; -static uint64_t readUint(uint8_t *buf) { +static uint64_t readUint(Ctx &ctx, uint8_t *buf) { return ctx.arg.is64 ? read64(buf) : read32(buf); } @@ -267,7 +267,7 @@ MipsReginfoSection::create(Ctx &ctx) { return std::make_unique>(ctx, reginfo); } -InputSection *elf::createInterpSection() { +InputSection *elf::createInterpSection(Ctx &) { // StringSaver guarantees that the returned string ends with '\0'. StringRef s = saver().save(ctx.arg.dynamicLinker); ArrayRef contents = {(const uint8_t *)s.data(), s.size() + 1}; @@ -609,7 +609,7 @@ static uint64_t readFdeAddr(uint8_t *buf, int size) { case DW_EH_PE_sdata8: return read64(buf); case DW_EH_PE_absptr: - return readUint(buf); + return readUint(ctx, buf); } fatal("unknown FDE size encoding"); } @@ -1452,7 +1452,8 @@ DynamicSection::computeContents() { addInSec(DT_PLTGOT, *ctx.in.plt); break; case EM_AARCH64: - if (llvm::find_if(ctx.in.relaPlt->relocs, [](const DynamicReloc &r) { + if (llvm::find_if(ctx.in.relaPlt->relocs, [&ctx = ctx]( + const DynamicReloc &r) { return r.type == ctx.target->pltRel && r.sym->stOther & STO_AARCH64_VARIANT_PCS; }) != ctx.in.relaPlt->relocs.end()) @@ -1460,7 +1461,8 @@ DynamicSection::computeContents() { addInSec(DT_PLTGOT, *ctx.in.gotPlt); break; case EM_RISCV: - if (llvm::any_of(ctx.in.relaPlt->relocs, [](const DynamicReloc &r) { + if (llvm::any_of(ctx.in.relaPlt->relocs, [&ctx = ctx]( + const DynamicReloc &r) { return r.type == ctx.target->pltRel && (r.sym->stOther & STO_RISCV_VARIANT_CC); })) @@ -2441,7 +2443,7 @@ void GnuHashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { // When C = 64, we choose a word with bits [6:...] and set 1 to two bits in // the word using bits [0:5] and [26:31]. 
size_t i = (sym.hash / c) & (maskWords - 1); - uint64_t val = readUint(buf + i * ctx.arg.wordsize); + uint64_t val = readUint(ctx, buf + i * ctx.arg.wordsize); val |= uint64_t(1) << (sym.hash % c); val |= uint64_t(1) << ((sym.hash >> Shift2) % c); writeUint(buf + i * ctx.arg.wordsize, val); @@ -3513,7 +3515,7 @@ createSymbols( // Returns a newly-created .gdb_index section. template -std::unique_ptr GdbIndexSection::create(Ctx &) { +std::unique_ptr GdbIndexSection::create(Ctx &ctx) { llvm::TimeTraceScope timeScope("Create gdb index"); // Collect InputFiles with .debug_info. See the comment in @@ -4684,7 +4686,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { // SyntheticSections coming last. if (needsInterpSection(ctx)) { for (size_t i = 1; i <= ctx.partitions.size(); ++i) { - InputSection *sec = createInterpSection(); + InputSection *sec = createInterpSection(ctx); sec->partition = i; ctx.inputSections.push_back(sec); } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index b89860dd21371a..283b2953449e59 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -554,7 +554,7 @@ class RelocationBaseSection : public SyntheticSection { (d->type == llvm::ELF::SHT_RELA || d->type == llvm::ELF::SHT_REL || d->type == llvm::ELF::SHT_RELR || (d->type == llvm::ELF::SHT_AARCH64_AUTH_RELR && - ctx.arg.emachine == llvm::ELF::EM_AARCH64)); + elf::ctx.arg.emachine == llvm::ELF::EM_AARCH64)); } int32_t dynamicTag, sizeDynamicTag; SmallVector relocs; @@ -1433,7 +1433,7 @@ class MemtagGlobalDescriptors final : public SyntheticSection { }; template void createSyntheticSections(Ctx &); -InputSection *createInterpSection(); +InputSection *createInterpSection(Ctx &); MergeInputSection *createCommentSection(); template void splitSections(Ctx &); void combineEhSections(Ctx &); From d656b2063262d59c3565e63095104c01d1f6a5a3 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 11 Oct 2024 08:37:20 +0200 Subject: [PATCH 131/177] 
[AMDGPU][SplitModule] Cleanup CallsExternal Handling (#106528) - Don't treat inline ASM as indirect calls - Remove call to alias testing, which was broken (only working by pure luck right now) and isn't needed anyway. GlobalOpt should take care of them for us. --- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 85 ++++++++++++++----- .../AMDGPU/indirect-call-inline-asm-debug.ll | 28 ++++++ .../AMDGPU/indirect-call-inline-asm.ll | 30 +++++++ .../AMDGPU/kernels-alias-dependencies.ll | 41 --------- .../AMDGPU/kernels-dependency-indirect.ll | 12 --- 5 files changed, 121 insertions(+), 75 deletions(-) create mode 100644 llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index e97a7f4e075f7f..a62c72d124825e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" @@ -103,6 +104,11 @@ static cl::opt NoExternalizeGlobals( cl::desc("disables externalization of global variable with local linkage; " "may cause globals to be duplicated which increases binary size")); +static cl::opt NoExternalizeOnAddrTaken( + "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden, + cl::desc( + "disables externalization of functions whose addresses are taken")); + static cl::opt ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg", cl::Hidden, @@ -482,6 +488,9 @@ void SplitGraph::buildGraph(CallGraph &CG) { dbgs() << "[build graph] constructing graph representation of the input\n"); + // 
FIXME(?): Is the callgraph really worth using if we have to iterate the + // function again whenever it fails to give us enough information? + // We build the graph by just iterating all functions in the module and // working on their direct callees. At the end, all nodes should be linked // together as expected. @@ -492,29 +501,52 @@ void SplitGraph::buildGraph(CallGraph &CG) { continue; // Look at direct callees and create the necessary edges in the graph. - bool HasIndirectCall = false; - Node &N = getNode(Cache, Fn); + SetVector DirectCallees; + bool CallsExternal = false; for (auto &CGEntry : *CG[&Fn]) { auto *CGNode = CGEntry.second; - auto *Callee = CGNode->getFunction(); - if (!Callee) { - // TODO: Don't consider inline assembly as indirect calls. - if (CGNode == CG.getCallsExternalNode()) - HasIndirectCall = true; - continue; - } - - if (!Callee->isDeclaration()) - createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); + if (auto *Callee = CGNode->getFunction()) { + if (!Callee->isDeclaration()) + DirectCallees.insert(Callee); + } else if (CGNode == CG.getCallsExternalNode()) + CallsExternal = true; } // Keep track of this function if it contains an indirect call and/or if it // can be indirectly called. - if (HasIndirectCall) { - LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n"); - FnsWithIndirectCalls.push_back(&Fn); + if (CallsExternal) { + LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for "; + Fn.printAsOperand(dbgs()); + dbgs() << " - analyzing function\n"); + + bool HasIndirectCall = false; + for (const auto &Inst : instructions(Fn)) { + // look at all calls without a direct callee. + if (const auto *CB = dyn_cast(&Inst); + CB && !CB->getCalledFunction()) { + // inline assembly can be ignored, unless InlineAsmIsIndirectCall is + // true. + if (CB->isInlineAsm()) { + LLVM_DEBUG(dbgs() << " found inline assembly\n"); + continue; + } + + // everything else is handled conservatively. 
+ HasIndirectCall = true; + break; + } + } + + if (HasIndirectCall) { + LLVM_DEBUG(dbgs() << " indirect call found\n"); + FnsWithIndirectCalls.push_back(&Fn); + } } + Node &N = getNode(Cache, Fn); + for (const auto *Callee : DirectCallees) + createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); + if (canBeIndirectlyCalled(Fn)) IndirectlyCallableFns.push_back(&Fn); } @@ -1326,13 +1358,21 @@ static void splitAMDGPUModule( // // Additionally, it guides partitioning to not duplicate this function if it's // called directly at some point. - for (auto &Fn : M) { - if (Fn.hasAddressTaken()) { - if (Fn.hasLocalLinkage()) { - LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName() - << " because its address is taken\n"); + // + // TODO: Could we be smarter about this ? This makes all functions whose + // addresses are taken non-copyable. We should probably model this type of + // constraint in the graph and use it to guide splitting, instead of + // externalizing like this. Maybe non-copyable should really mean "keep one + // visible copy, then internalize all other copies" for some functions? + if (!NoExternalizeOnAddrTaken) { + for (auto &Fn : M) { + // TODO: Should aliases count? Probably not but they're so rare I'm not + // sure it's worth fixing. + if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) { + LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs()); + dbgs() << " because its address is taken\n"); + externalize(Fn); } - externalize(Fn); } } @@ -1368,7 +1408,8 @@ static void splitAMDGPUModule( dbgs() << "[graph] nodes:\n"; for (const SplitGraph::Node *N : SG.nodes()) { dbgs() << " - [" << N->getID() << "]: " << N->getName() << " " - << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n"; + << (N->isGraphEntryPoint() ? "(entry)" : "") << " " + << (N->isNonCopyable() ? 
"(noncopyable)" : "") << "\n"; } }); diff --git a/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll new file mode 100644 index 00000000000000..5b15e740f76b96 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll @@ -0,0 +1,28 @@ +; REQUIRES: asserts + +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken -debug-only=amdgpu-split-module 2>&1 | FileCheck %s + +; CHECK: [!] callgraph is incomplete for ptr @A - analyzing function +; CHECK-NEXT: found inline assembly +; CHECK-NOT: indirect call found + +@addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB] + +define internal void @HelperA() { + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel void @A() { + call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"() + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B(ptr %out) { + call void @HelperB() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll new file mode 100644 index 00000000000000..13c30c9e45e808 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll @@ -0,0 +1,30 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s + +; CHECK0: define internal void @HelperB +; CHECK0: define amdgpu_kernel void @B + +; CHECK1: define internal void @HelperA() +; CHECK1: define amdgpu_kernel void @A() + +@addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB] + +define internal void @HelperA() { + ret void +} + +define internal void @HelperB() { + ret void +} + +define 
amdgpu_kernel void @A() { + call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"() + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B(ptr %out) { + call void @HelperB() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll deleted file mode 100644 index d7e84abd5f968d..00000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s - -; 3 kernels: -; - A calls nothing -; - B calls @PerryThePlatypus -; - C calls @Perry, an alias of @PerryThePlatypus -; -; We should see through the alias and put B/C in the same -; partition. -; -; Additionally, @PerryThePlatypus gets externalized as -; the alias counts as taking its address. 
- -; CHECK0: define amdgpu_kernel void @A - -; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus -; CHECK1: define hidden void @PerryThePlatypus() -; CHECK1: define amdgpu_kernel void @B -; CHECK1: define amdgpu_kernel void @C - -@Perry = internal alias ptr(), ptr @PerryThePlatypus - -define internal void @PerryThePlatypus() { - ret void -} - -define amdgpu_kernel void @A() { - ret void -} - -define amdgpu_kernel void @B() { - call void @PerryThePlatypus() - ret void -} - -define amdgpu_kernel void @C() { - call void @Perry() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll index 5be945bda48bf4..c2acb06d3e72e5 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll @@ -3,18 +3,6 @@ ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s -; We have 4 kernels: -; - Each kernel has an internal helper -; - @A and @B's helpers does an indirect call. -; -; We default to putting A/B in P0, alongside a copy -; of all helpers who have their address taken. -; The other kernels can still go into separate partitions. -; -; Note that dependency discovery shouldn't stop upon finding an -; indirect call. HelperC/D should also end up in P0 as they -; are dependencies of HelperB. - ; CHECK0: define internal void @HelperD ; CHECK0: define amdgpu_kernel void @D From 81bd712f928b3c736d83252df75c1c1bd3374122 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 23:43:21 -0700 Subject: [PATCH 132/177] [ELF] Revert Ctx & parameters from SyntheticSection Since Ctx &ctx is a member variable, 1f391a75af8685e6bba89421443d72ac6a186599 7a5b9ef54eb96abd8415fd893576c42e51fd95db e2f0ec3a3a8a2981be8a1aac2004cfb9064c61e8 can be reverted. 
--- lld/ELF/AArch64ErrataFix.cpp | 8 +- lld/ELF/ARMErrataFix.cpp | 8 +- lld/ELF/Arch/ARM.cpp | 13 +- lld/ELF/Arch/RISCV.cpp | 8 +- lld/ELF/InputSection.cpp | 2 +- lld/ELF/InputSection.h | 8 +- lld/ELF/LinkerScript.cpp | 2 +- lld/ELF/OutputSections.cpp | 4 +- lld/ELF/Relocations.cpp | 9 +- lld/ELF/SyntheticSections.cpp | 229 ++++++++++++++-------------- lld/ELF/SyntheticSections.h | 275 +++++++++++++++++----------------- lld/ELF/Writer.cpp | 18 +-- 12 files changed, 283 insertions(+), 301 deletions(-) diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp index a5129c58da13d9..b1f6c424688b36 100644 --- a/lld/ELF/AArch64ErrataFix.cpp +++ b/lld/ELF/AArch64ErrataFix.cpp @@ -374,9 +374,9 @@ class elf::Patch843419Section final : public SyntheticSection { public: Patch843419Section(Ctx &, InputSection *p, uint64_t off); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; - size_t getSize(Ctx &) const override { return 8; } + size_t getSize() const override { return 8; } uint64_t getLDSTAddr() const; @@ -399,7 +399,7 @@ Patch843419Section::Patch843419Section(Ctx &ctx, InputSection *p, uint64_t off) this->parent = p->getParent(); patchSym = addSyntheticLocal( saver().save("__CortexA53843419_" + utohexstr(getLDSTAddr())), STT_FUNC, - 0, getSize(ctx), *this); + 0, getSize(), *this); addSyntheticLocal(saver().save("$x"), STT_NOTYPE, 0, 0, *this); } @@ -407,7 +407,7 @@ uint64_t Patch843419Section::getLDSTAddr() const { return patchee->getVA(patcheeOffset); } -void Patch843419Section::writeTo(Ctx &ctx, uint8_t *buf) { +void Patch843419Section::writeTo(uint8_t *buf) { // Copy the instruction that we will be replacing with a branch in the // patchee Section. 
write32le(buf, read32le(patchee->content().begin() + patcheeOffset)); diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 57df542e57ec48..6dc6432c40ea5c 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -73,9 +73,9 @@ class elf::Patch657417Section final : public SyntheticSection { Patch657417Section(Ctx &, InputSection *p, uint64_t off, uint32_t instr, bool isARM); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; - size_t getSize(Ctx &) const override { return 4; } + size_t getSize() const override { return 4; } // Get the virtual address of the branch instruction at patcheeOffset. uint64_t getBranchAddr() const; @@ -142,7 +142,7 @@ Patch657417Section::Patch657417Section(Ctx &ctx, InputSection *p, uint64_t off, parent = p->getParent(); patchSym = addSyntheticLocal( saver().save("__CortexA8657417_" + utohexstr(getBranchAddr())), STT_FUNC, - isARM ? 0 : 1, getSize(ctx), *this); + isARM ? 0 : 1, getSize(), *this); addSyntheticLocal(saver().save(isARM ? "$a" : "$t"), STT_NOTYPE, 0, 0, *this); } @@ -176,7 +176,7 @@ static uint64_t getThumbDestAddr(Ctx &ctx, uint64_t sourceAddr, return sourceAddr + offset + 4; } -void Patch657417Section::writeTo(Ctx &ctx, uint8_t *buf) { +void Patch657417Section::writeTo(uint8_t *buf) { // The base instruction of the patch is always a 32-bit unconditional branch. if (isARM) write32le(buf, 0xea000000); diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index ecf293602ac99d..d964994a4b3cff 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1333,8 +1333,7 @@ class elf::ArmCmseSGVeneer { ArmCmseSGSection::ArmCmseSGSection(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC | llvm::ELF::SHF_EXECINSTR, llvm::ELF::SHT_PROGBITS, - /*alignment=*/32, ".gnu.sgstubs"), - ctx(ctx) { + /*alignment=*/32, ".gnu.sgstubs") { entsize = ACLESESYM_SIZE; // The range of addresses used in the CMSE import library should be fixed. 
for (auto &[_, sym] : ctx.symtab->cmseImportLib) { @@ -1384,7 +1383,7 @@ void ArmCmseSGSection::addSGVeneer(Symbol *acleSeSym, Symbol *sym) { sgVeneers.emplace_back(ss); } -void ArmCmseSGSection::writeTo(Ctx &ctx, uint8_t *buf) { +void ArmCmseSGSection::writeTo(uint8_t *buf) { for (ArmCmseSGVeneer *s : sgVeneers) { uint8_t *p = buf + s->offset; write16(p + 0, 0xe97f); // SG @@ -1401,14 +1400,14 @@ void ArmCmseSGSection::addMappingSymbol() { addSyntheticLocal("$t", STT_NOTYPE, /*off=*/0, /*size=*/0, *this); } -size_t ArmCmseSGSection::getSize(Ctx &) const { +size_t ArmCmseSGSection::getSize() const { if (sgVeneers.empty()) return (impLibMaxAddr ? impLibMaxAddr - getVA() : 0) + newEntries * entsize; return entries.size() * entsize; } -void ArmCmseSGSection::finalizeContents(Ctx &) { +void ArmCmseSGSection::finalizeContents() { if (sgVeneers.empty()) return; @@ -1476,8 +1475,8 @@ template void elf::writeARMCmseImportLib(Ctx &ctx) { osec->recordSection(isec); osec->finalizeInputSections(ctx); osec->shName = shstrtab->addString(osec->name); - osec->size = isec->getSize(ctx); - isec->finalizeContents(ctx); + osec->size = isec->getSize(); + isec->finalizeContents(); osec->offset = alignToPowerOf2(off, osec->addralign); off = osec->offset + osec->size; } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 351cca025b3864..d65467f10378be 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -1048,8 +1048,8 @@ class RISCVAttributesSection final : public SyntheticSection { : SyntheticSection(ctx, 0, SHT_RISCV_ATTRIBUTES, 1, ".riscv.attributes") { } - size_t getSize(Ctx &) const override { return size; } - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return size; } + void writeTo(uint8_t *buf) override; static constexpr StringRef vendor = "riscv"; DenseMap intAttr; @@ -1278,8 +1278,8 @@ mergeAttributesSection(Ctx &ctx, return &merged; } -void RISCVAttributesSection::writeTo(Ctx &ctx, uint8_t *buf) { - const size_t 
size = getSize(ctx); +void RISCVAttributesSection::writeTo(uint8_t *buf) { + const size_t size = getSize(); uint8_t *const end = buf + size; *buf = ELFAttrs::Format_Version; write32(buf + 1, size - 1); diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 90716f4f3675cc..082fdb9f5c9ac4 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -101,7 +101,7 @@ InputSectionBase::InputSectionBase(ObjFile &file, size_t InputSectionBase::getSize() const { if (auto *s = dyn_cast(this)) - return s->getSize(ctx); + return s->getSize(); return size - bytesDropped; } diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index bf2cf09f2921b2..2b34047bc0682a 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -480,13 +480,13 @@ class SyntheticSection : public InputSection { ctx(ctx) {} virtual ~SyntheticSection() = default; - virtual size_t getSize(Ctx &) const = 0; + virtual size_t getSize() const = 0; virtual bool updateAllocSize(Ctx &) { return false; } // If the section has the SHF_ALLOC flag and the size may be changed if // thunks are added, update the section size. 
- virtual bool isNeeded(Ctx &) const { return true; } - virtual void finalizeContents(Ctx &) {} - virtual void writeTo(Ctx &, uint8_t *buf) = 0; + virtual bool isNeeded() const { return true; } + virtual void finalizeContents() {} + virtual void writeTo(uint8_t *buf) = 0; static bool classof(const SectionBase *sec) { return sec->kind() == InputSectionBase::Synthetic; diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index e9a637bac4e9bd..b736cb1beef37e 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -1058,7 +1058,7 @@ void LinkerScript::diagnoseOrphanHandling() const { } void LinkerScript::diagnoseMissingSGSectionAddress() const { - if (!ctx.arg.cmseImplib || !ctx.in.armCmseSGSection->isNeeded(ctx)) + if (!ctx.arg.cmseImplib || !ctx.in.armCmseSGSection->isNeeded()) return; OutputSection *sec = findByName(sectionCommands, ".gnu.sgstubs"); diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 3f3b80830d80d5..7a65858a6f8c18 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -248,7 +248,7 @@ void OutputSection::finalizeInputSections(Ctx &ctx) { commitSection(ctx, s); } for (auto *ms : mergeSections) - ms->finalizeContents(ctx); + ms->finalizeContents(); } static void sortByOrder(MutableArrayRef in, @@ -525,7 +525,7 @@ void OutputSection::writeTo(Ctx &ctx, uint8_t *buf, parallel::TaskGroup &tg) { for (size_t i = begin; i != end; ++i) { InputSection *isec = sections[i]; if (auto *s = dyn_cast(isec)) - s->writeTo(ctx, buf + isec->outSecOff); + s->writeTo(buf + isec->outSecOff); else isec->writeTo(buf + isec->outSecOff); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 5d81d0cccb78e5..cb33f35e59e43b 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -2020,15 +2020,14 @@ static void forEachInputSectionDescription( // This may invalidate any output section offsets stored outside of InputSection void ThunkCreator::mergeThunks(ArrayRef outputSections) { 
forEachInputSectionDescription( - outputSections, - [&, &ctx = ctx](OutputSection *os, InputSectionDescription *isd) { + outputSections, [&](OutputSection *os, InputSectionDescription *isd) { if (isd->thunkSections.empty()) return; // Remove any zero sized precreated Thunks. llvm::erase_if(isd->thunkSections, - [&ctx](const std::pair &ts) { - return ts.first->getSize(ctx) == 0; + [](const std::pair &ts) { + return ts.first->getSize() == 0; }); // ISD->ThunkSections contains all created ThunkSections, including @@ -2081,7 +2080,7 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os, for (std::pair tp : isd->thunkSections) { ThunkSection *ts = tp.first; uint64_t tsBase = os->addr + ts->outSecOff - pcBias; - uint64_t tsLimit = tsBase + ts->getSize(ctx); + uint64_t tsLimit = tsBase + ts->getSize(); if (ctx.target->inBranchRange(rel.type, src, (src > tsLimit) ? tsBase : tsLimit)) return ts; diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 5d62f089e40848..ee0e9c513740ac 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -103,8 +103,7 @@ MipsAbiFlagsSection::MipsAbiFlagsSection(Ctx &ctx, this->entsize = sizeof(Elf_Mips_ABIFlags); } -template -void MipsAbiFlagsSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void MipsAbiFlagsSection::writeTo(uint8_t *buf) { memcpy(buf, &flags, sizeof(flags)); } @@ -165,11 +164,10 @@ MipsOptionsSection::MipsOptionsSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) this->entsize = sizeof(Elf_Mips_Options) + sizeof(Elf_Mips_RegInfo); } -template -void MipsOptionsSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void MipsOptionsSection::writeTo(uint8_t *buf) { auto *options = reinterpret_cast(buf); options->kind = ODK_REGINFO; - options->size = getSize(ctx); + options->size = getSize(); if (!ctx.arg.relocatable) reginfo.ri_gp_value = ctx.in.mipsGot->getGp(); @@ -228,8 +226,7 @@ MipsReginfoSection::MipsReginfoSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) this->entsize = 
sizeof(Elf_Mips_RegInfo); } -template -void MipsReginfoSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void MipsReginfoSection::writeTo(uint8_t *buf) { if (!ctx.arg.relocatable) reginfo.ri_gp_value = ctx.in.mipsGot->getGp(); memcpy(buf, ®info, sizeof(reginfo)); @@ -324,9 +321,9 @@ GnuPropertySection::GnuPropertySection(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, ctx.arg.wordsize, ".note.gnu.property") {} -void GnuPropertySection::writeTo(Ctx &ctx, uint8_t *buf) { +void GnuPropertySection::writeTo(uint8_t *buf) { write32(buf, 4); // Name size - write32(buf + 4, getSize(ctx) - 16); // Content size + write32(buf + 4, getSize() - 16); // Content size write32(buf + 8, NT_GNU_PROPERTY_TYPE_0); // Type memcpy(buf + 12, "GNU", 4); // Name string @@ -352,7 +349,7 @@ void GnuPropertySection::writeTo(Ctx &ctx, uint8_t *buf) { } } -size_t GnuPropertySection::getSize(Ctx &ctx) const { +size_t GnuPropertySection::getSize() const { uint32_t contentSize = 0; if (ctx.arg.andFeatures != 0) contentSize += ctx.arg.is64 ? 16 : 12; @@ -366,7 +363,7 @@ BuildIdSection::BuildIdSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), hashSize(getHashSize()) {} -void BuildIdSection::writeTo(Ctx &ctx, uint8_t *buf) { +void BuildIdSection::writeTo(uint8_t *buf) { write32(buf, 4); // Name size write32(buf + 4, hashSize); // Content size write32(buf + 8, NT_GNU_BUILD_ID); // Type @@ -514,7 +511,7 @@ static void writeCieFde(uint8_t *buf, ArrayRef d) { write32(buf, d.size() - 4); } -void EhFrameSection::finalizeContents(Ctx &) { +void EhFrameSection::finalizeContents() { assert(!this->size); // Not finalized. switch (ctx.arg.ekind) { @@ -630,7 +627,7 @@ uint64_t EhFrameSection::getFdePc(uint8_t *buf, size_t fdeOff, fatal("unknown FDE size relative encoding"); } -void EhFrameSection::writeTo(Ctx &ctx, uint8_t *buf) { +void EhFrameSection::writeTo(uint8_t *buf) { // Write CIE and FDE records. 
for (CieRecord *rec : cieRecords) { size_t cieOffset = rec->cie->outputOff; @@ -709,7 +706,7 @@ uint64_t GotSection::getGlobalDynOffset(const Symbol &b) const { return b.getTlsGdIdx(ctx) * ctx.arg.wordsize; } -void GotSection::finalizeContents(Ctx &) { +void GotSection::finalizeContents() { if (ctx.arg.emachine == EM_PPC64 && numEntries <= ctx.target->gotHeaderEntriesNum && !ctx.sym.globalOffsetTable) @@ -718,13 +715,13 @@ void GotSection::finalizeContents(Ctx &) { size = numEntries * ctx.arg.wordsize; } -bool GotSection::isNeeded(Ctx &ctx) const { +bool GotSection::isNeeded() const { // Needed if the GOT symbol is used or the number of entries is more than just // the header. A GOT with just the header may not be needed. return hasGotOffRel || numEntries > ctx.target->gotHeaderEntriesNum; } -void GotSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GotSection::writeTo(uint8_t *buf) { // On PPC64 .got may be needed but empty. Skip the write. if (size == 0) return; @@ -878,7 +875,7 @@ bool MipsGotSection::tryMergeGots(FileGot &dst, FileGot &src, bool isPrimary) { return true; } -void MipsGotSection::finalizeContents(Ctx &) { updateAllocSize(ctx); } +void MipsGotSection::finalizeContents() { updateAllocSize(ctx); } bool MipsGotSection::updateAllocSize(Ctx &ctx) { size = headerEntriesNum * ctx.arg.wordsize; @@ -1090,7 +1087,7 @@ void MipsGotSection::build() { } } -bool MipsGotSection::isNeeded(Ctx &ctx) const { +bool MipsGotSection::isNeeded() const { // We add the .got section to the result for dynamic MIPS target because // its address and properties are mentioned in the .dynamic section. return !ctx.arg.relocatable; @@ -1105,7 +1102,7 @@ uint64_t MipsGotSection::getGp(const InputFile *f) const { return getVA() + gots[f->mipsGotIndex].startIndex * ctx.arg.wordsize + 0x7ff0; } -void MipsGotSection::writeTo(Ctx &ctx, uint8_t *buf) { +void MipsGotSection::writeTo(uint8_t *buf) { // Set the MSB of the second GOT slot. 
This is not required by any // MIPS ABI documentation, though. // @@ -1189,12 +1186,12 @@ void GotPltSection::addEntry(Symbol &sym) { entries.push_back(&sym); } -size_t GotPltSection::getSize(Ctx &ctx) const { +size_t GotPltSection::getSize() const { return (ctx.target->gotPltHeaderEntriesNum + entries.size()) * ctx.target->gotEntrySize; } -void GotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GotPltSection::writeTo(uint8_t *buf) { ctx.target->writeGotPltHeader(buf); buf += ctx.target->gotPltHeaderEntriesNum * ctx.target->gotEntrySize; for (const Symbol *b : entries) { @@ -1203,7 +1200,7 @@ void GotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool GotPltSection::isNeeded(Ctx &) const { +bool GotPltSection::isNeeded() const { // We need to emit GOTPLT even if it's empty if there's a relocation relative // to it. return !entries.empty() || hasGotPltOffRel; @@ -1234,11 +1231,11 @@ void IgotPltSection::addEntry(Symbol &sym) { entries.push_back(&sym); } -size_t IgotPltSection::getSize(Ctx &ctx) const { +size_t IgotPltSection::getSize() const { return entries.size() * ctx.target->gotEntrySize; } -void IgotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void IgotPltSection::writeTo(uint8_t *buf) { for (const Symbol *b : entries) { ctx.target->writeIgotPlt(buf, *b); buf += ctx.target->gotEntrySize; @@ -1273,7 +1270,7 @@ unsigned StringTableSection::addString(StringRef s, bool hashIt) { return ret; } -void StringTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void StringTableSection::writeTo(uint8_t *buf) { for (StringRef s : strings) { memcpy(buf, s.data(), s.size()); buf[s.size()] = '\0'; @@ -1308,9 +1305,9 @@ DynamicSection::DynamicSection(Ctx &ctx) // // DT_RELASZ is the total size of the included sections. 
static uint64_t addRelaSz(const RelocationBaseSection &relaDyn) { - size_t size = relaDyn.getSize(ctx); + size_t size = relaDyn.getSize(); if (ctx.in.relaPlt->getParent() == relaDyn.getParent()) - size += ctx.in.relaPlt->getSize(ctx); + size += ctx.in.relaPlt->getSize(); return size; } @@ -1318,7 +1315,7 @@ static uint64_t addRelaSz(const RelocationBaseSection &relaDyn) { // output section. When this occurs we cannot just use the OutputSection // Size. Moreover the [DT_JMPREL, DT_JMPREL + DT_PLTRELSZ) is permitted to // overlap with the [DT_RELA, DT_RELA + DT_RELASZ). -static uint64_t addPltRelSz() { return ctx.in.relaPlt->getSize(ctx); } +static uint64_t addPltRelSz() { return ctx.in.relaPlt->getSize(); } // Add remaining entries to complete .dynamic contents. template @@ -1405,7 +1402,7 @@ DynamicSection::computeContents() { if (!ctx.arg.shared && !ctx.arg.relocatable && !ctx.arg.zRodynamic) addInt(DT_DEBUG, 0); - if (part.relaDyn->isNeeded(ctx)) { + if (part.relaDyn->isNeeded()) { addInSec(part.relaDyn->dynamicTag, *part.relaDyn); entries.emplace_back(part.relaDyn->sizeDynamicTag, addRelaSz(*part.relaDyn)); @@ -1438,7 +1435,7 @@ DynamicSection::computeContents() { addInt(DT_AARCH64_AUTH_RELRSZ, part.relrAuthDyn->getParent()->size); addInt(DT_AARCH64_AUTH_RELRENT, sizeof(Elf_Relr)); } - if (isMain && ctx.in.relaPlt->isNeeded(ctx)) { + if (isMain && ctx.in.relaPlt->isNeeded()) { addInSec(DT_JMPREL, *ctx.in.relaPlt); entries.emplace_back(DT_PLTRELSZ, addPltRelSz()); switch (ctx.arg.emachine) { @@ -1485,11 +1482,11 @@ DynamicSection::computeContents() { addInt(DT_AARCH64_MEMTAG_MODE, ctx.arg.androidMemtagMode == NT_MEMTAG_LEVEL_ASYNC); addInt(DT_AARCH64_MEMTAG_HEAP, ctx.arg.androidMemtagHeap); addInt(DT_AARCH64_MEMTAG_STACK, ctx.arg.androidMemtagStack); - if (ctx.mainPart->memtagGlobalDescriptors->isNeeded(ctx)) { + if (ctx.mainPart->memtagGlobalDescriptors->isNeeded()) { addInSec(DT_AARCH64_MEMTAG_GLOBALS, *ctx.mainPart->memtagGlobalDescriptors); 
addInt(DT_AARCH64_MEMTAG_GLOBALSSZ, - ctx.mainPart->memtagGlobalDescriptors->getSize(ctx)); + ctx.mainPart->memtagGlobalDescriptors->getSize()); } } } @@ -1497,7 +1494,7 @@ DynamicSection::computeContents() { addInSec(DT_SYMTAB, *part.dynSymTab); addInt(DT_SYMENT, sizeof(Elf_Sym)); addInSec(DT_STRTAB, *part.dynStrTab); - addInt(DT_STRSZ, part.dynStrTab->getSize(ctx)); + addInt(DT_STRSZ, part.dynStrTab->getSize()); if (!ctx.arg.zText) addInt(DT_TEXTREL, 0); if (part.gnuHashTab && part.gnuHashTab->getParent()) @@ -1527,13 +1524,13 @@ DynamicSection::computeContents() { addInt(DT_FINI, b->getVA()); } - if (part.verSym && part.verSym->isNeeded(ctx)) + if (part.verSym && part.verSym->isNeeded()) addInSec(DT_VERSYM, *part.verSym); if (part.verDef && part.verDef->isLive()) { addInSec(DT_VERDEF, *part.verDef); addInt(DT_VERDEFNUM, getVerDefNum()); } - if (part.verNeed && part.verNeed->isNeeded(ctx)) { + if (part.verNeed && part.verNeed->isNeeded()) { addInSec(DT_VERNEED, *part.verNeed); unsigned needNum = 0; for (SharedFile *f : ctx.sharedFiles) @@ -1570,7 +1567,7 @@ DynamicSection::computeContents() { addInSec(DT_PPC_GOT, *ctx.in.got); // Glink dynamic tag is required by the V2 abi if the plt section isn't empty. - if (ctx.arg.emachine == EM_PPC64 && ctx.in.plt->isNeeded(ctx)) { + if (ctx.arg.emachine == EM_PPC64 && ctx.in.plt->isNeeded()) { // The Glink tag points to 32 bytes before the first lazy symbol resolution // stub, which starts directly after the header. 
addInt(DT_PPC64_GLINK, @@ -1584,14 +1581,13 @@ DynamicSection::computeContents() { return entries; } -template void DynamicSection::finalizeContents(Ctx &) { +template void DynamicSection::finalizeContents() { if (OutputSection *sec = getPartition().dynStrTab->getParent()) getParent()->link = sec->sectionIndex; this->size = computeContents().size() * this->entsize; } -template -void DynamicSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void DynamicSection::writeTo(uint8_t *buf) { auto *p = reinterpret_cast(buf); for (std::pair kv : computeContents()) { @@ -1686,7 +1682,7 @@ void RelocationBaseSection::partitionRels() { relocs.begin(); } -void RelocationBaseSection::finalizeContents(Ctx &) { +void RelocationBaseSection::finalizeContents() { SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); // When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE @@ -1743,8 +1739,7 @@ RelocationSection::RelocationSection(Ctx &ctx, StringRef name, this->entsize = ctx.arg.isRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel); } -template -void RelocationSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void RelocationSection::writeTo(uint8_t *buf) { computeRels(); for (const DynamicReloc &rel : relocs) { auto *p = reinterpret_cast(buf); @@ -2142,7 +2137,7 @@ static bool sortMipsSymbols(const SymbolTableEntry &l, return !l.sym->isInGot(ctx); } -void SymbolTableBaseSection::finalizeContents(Ctx &) { +void SymbolTableBaseSection::finalizeContents() { if (OutputSection *sec = strTabSec.getParent()) getParent()->link = sec->sectionIndex; @@ -2259,8 +2254,7 @@ static uint32_t getSymSectionIndex(Symbol *sym) { } // Write the internal symbol table contents to the output symbol table. -template -void SymbolTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void SymbolTableSection::writeTo(uint8_t *buf) { // The first entry is a null entry as per the ELF spec. 
buf += sizeof(Elf_Sym); @@ -2342,7 +2336,7 @@ SymtabShndxSection::SymtabShndxSection(Ctx &ctx) this->entsize = 4; } -void SymtabShndxSection::writeTo(Ctx &ctx, uint8_t *buf) { +void SymtabShndxSection::writeTo(uint8_t *buf) { // We write an array of 32 bit values, where each value has 1:1 association // with an entry in ctx.in.symTab if the corresponding entry contains // SHN_XINDEX, we need to write actual index, otherwise, we must write @@ -2355,7 +2349,7 @@ void SymtabShndxSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool SymtabShndxSection::isNeeded(Ctx &ctx) const { +bool SymtabShndxSection::isNeeded() const { // SHT_SYMTAB can hold symbols with section indices values up to // SHN_LORESERVE. If we need more, we want to use extension SHT_SYMTAB_SHNDX // section. Problem is that we reveal the final section indices a bit too @@ -2368,11 +2362,11 @@ bool SymtabShndxSection::isNeeded(Ctx &ctx) const { return size >= SHN_LORESERVE; } -void SymtabShndxSection::finalizeContents(Ctx &) { +void SymtabShndxSection::finalizeContents() { getParent()->link = ctx.in.symTab->getParent()->sectionIndex; } -size_t SymtabShndxSection::getSize(Ctx &ctx) const { +size_t SymtabShndxSection::getSize() const { return ctx.in.symTab->getNumSymbols() * 4; } @@ -2410,7 +2404,7 @@ GnuHashTableSection::GnuHashTableSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_HASH, ctx.arg.wordsize, ".gnu.hash") {} -void GnuHashTableSection::finalizeContents(Ctx &) { +void GnuHashTableSection::finalizeContents() { if (OutputSection *sec = getPartition().dynSymTab->getParent()) getParent()->link = sec->sectionIndex; @@ -2429,7 +2423,7 @@ void GnuHashTableSection::finalizeContents(Ctx &) { size += symbols.size() * 4; // Hash values } -void GnuHashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GnuHashTableSection::writeTo(uint8_t *buf) { // Write a header. 
write32(buf, nBuckets); write32(buf + 4, getPartition().dynSymTab->getNumSymbols() - symbols.size()); @@ -2520,7 +2514,7 @@ HashTableSection::HashTableSection(Ctx &ctx) this->entsize = 4; } -void HashTableSection::finalizeContents(Ctx &) { +void HashTableSection::finalizeContents() { SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); if (OutputSection *sec = symTab->getParent()) @@ -2534,7 +2528,7 @@ void HashTableSection::finalizeContents(Ctx &) { this->size = numEntries * 4; } -void HashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void HashTableSection::writeTo(uint8_t *buf) { SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); unsigned numSymbols = symTab->getNumSymbols(); @@ -2577,7 +2571,7 @@ PltSection::PltSection(Ctx &ctx) this->flags |= SHF_WRITE; } -void PltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PltSection::writeTo(uint8_t *buf) { // At beginning of PLT, we have code to call the dynamic // linker to resolve dynsyms at runtime. Write such code. ctx.target->writePltHeader(buf); @@ -2595,14 +2589,13 @@ void PltSection::addEntry(Symbol &sym) { entries.push_back(&sym); } -size_t PltSection::getSize(Ctx &ctx) const { +size_t PltSection::getSize() const { return headerSize + entries.size() * ctx.target->pltEntrySize; } -bool PltSection::isNeeded(Ctx &ctx) const { +bool PltSection::isNeeded() const { // For -z retpolineplt, .iplt needs the .plt header. 
- return !entries.empty() || - (ctx.arg.zRetpolineplt && ctx.in.iplt->isNeeded(ctx)); + return !entries.empty() || (ctx.arg.zRetpolineplt && ctx.in.iplt->isNeeded()); } // Used by ARM to add mapping symbols in the PLT section, which aid @@ -2626,7 +2619,7 @@ IpltSection::IpltSection(Ctx &ctx) } } -void IpltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void IpltSection::writeTo(uint8_t *buf) { uint32_t off = 0; for (const Symbol *sym : entries) { ctx.target->writeIplt(buf + off, *sym, getVA() + off); @@ -2634,7 +2627,7 @@ void IpltSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -size_t IpltSection::getSize(Ctx &ctx) const { +size_t IpltSection::getSize() const { return entries.size() * ctx.target->ipltEntrySize; } @@ -2658,11 +2651,11 @@ PPC32GlinkSection::PPC32GlinkSection(Ctx &ctx) : PltSection(ctx) { addralign = 4; } -void PPC32GlinkSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PPC32GlinkSection::writeTo(uint8_t *buf) { writePPC32GlinkSection(ctx, buf, entries.size()); } -size_t PPC32GlinkSection::getSize(Ctx &ctx) const { +size_t PPC32GlinkSection::getSize() const { return headerSize + entries.size() * ctx.target->pltEntrySize + footerSize; } @@ -2728,18 +2721,16 @@ IBTPltSection::IBTPltSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt") {} -void IBTPltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void IBTPltSection::writeTo(uint8_t *buf) { ctx.target->writeIBTPlt(buf, ctx.in.plt->getNumEntries()); } -size_t IBTPltSection::getSize(Ctx &ctx) const { +size_t IBTPltSection::getSize() const { // 16 is the header size of .plt. 
return 16 + ctx.in.plt->getNumEntries() * ctx.target->pltEntrySize; } -bool IBTPltSection::isNeeded(Ctx &ctx) const { - return ctx.in.plt->getNumEntries() > 0; -} +bool IBTPltSection::isNeeded() const { return ctx.in.plt->getNumEntries() > 0; } RelroPaddingSection::RelroPaddingSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_NOBITS, 1, @@ -3242,7 +3233,7 @@ void DebugNamesSection::getNameRelocs( } } -template void DebugNamesSection::finalizeContents(Ctx &) { +template void DebugNamesSection::finalizeContents() { // Get relocations of .debug_names sections. auto relocs = std::make_unique[]>(numChunks); parallelFor(0, numChunks, [&](size_t i) { @@ -3262,8 +3253,7 @@ template void DebugNamesSection::finalizeContents(Ctx &) { }); } -template -void DebugNamesSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void DebugNamesSection::writeTo(uint8_t *buf) { [[maybe_unused]] const uint8_t *const beginBuf = buf; // Write the header. endian::writeNext(buf, hdr.UnitLength); @@ -3574,7 +3564,7 @@ std::unique_ptr GdbIndexSection::create(Ctx &ctx) { return ret; } -void GdbIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GdbIndexSection::writeTo(uint8_t *buf) { // Write the header. auto *hdr = reinterpret_cast(buf); uint8_t *start = buf; @@ -3643,12 +3633,12 @@ void GdbIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool GdbIndexSection::isNeeded(Ctx &) const { return !chunks.empty(); } +bool GdbIndexSection::isNeeded() const { return !chunks.empty(); } EhFrameHeader::EhFrameHeader(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".eh_frame_hdr") {} -void EhFrameHeader::writeTo(Ctx &ctx, uint8_t *buf) { +void EhFrameHeader::writeTo(uint8_t *buf) { // Unlike most sections, the EhFrameHeader section is written while writing // another section, namely EhFrameSection, which calls the write() function // below from its writeTo() function. 
This is necessary because the contents @@ -3681,13 +3671,13 @@ void EhFrameHeader::write() { } } -size_t EhFrameHeader::getSize(Ctx &ctx) const { +size_t EhFrameHeader::getSize() const { // .eh_frame_hdr has a 12 bytes header followed by an array of FDEs. return 12 + getPartition().ehFrame->numFdes * 8; } -bool EhFrameHeader::isNeeded(Ctx &ctx) const { - return isLive() && getPartition().ehFrame->isNeeded(ctx); +bool EhFrameHeader::isNeeded() const { + return isLive() && getPartition().ehFrame->isNeeded(); } VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) @@ -3702,7 +3692,7 @@ StringRef VersionDefinitionSection::getFileDefName() { return ctx.arg.outputFile; } -void VersionDefinitionSection::finalizeContents(Ctx &) { +void VersionDefinitionSection::finalizeContents() { fileDefNameOff = getPartition().dynStrTab->addString(getFileDefName()); for (const VersionDefinition &v : namedVersionDefs(ctx)) verDefNameOffs.push_back(getPartition().dynStrTab->addString(v.name)); @@ -3734,7 +3724,7 @@ void VersionDefinitionSection::writeOne(uint8_t *buf, uint32_t index, write32(buf + 24, 0); // vda_next } -void VersionDefinitionSection::writeTo(Ctx &ctx, uint8_t *buf) { +void VersionDefinitionSection::writeTo(uint8_t *buf) { writeOne(buf, 1, getFileDefName(), fileDefNameOff); auto nameOffIt = verDefNameOffs.begin(); @@ -3747,7 +3737,7 @@ void VersionDefinitionSection::writeTo(Ctx &ctx, uint8_t *buf) { write32(buf + 16, 0); // vd_next } -size_t VersionDefinitionSection::getSize(Ctx &ctx) const { +size_t VersionDefinitionSection::getSize() const { return EntrySize * getVerDefNum(); } @@ -3758,17 +3748,17 @@ VersionTableSection::VersionTableSection(Ctx &ctx) this->entsize = 2; } -void VersionTableSection::finalizeContents(Ctx &) { +void VersionTableSection::finalizeContents() { // At the moment of june 2016 GNU docs does not mention that sh_link field // should be set, but Sun docs do. Also readelf relies on this field. 
getParent()->link = getPartition().dynSymTab->getParent()->sectionIndex; } -size_t VersionTableSection::getSize(Ctx &ctx) const { +size_t VersionTableSection::getSize() const { return (getPartition().dynSymTab->getSymbols().size() + 1) * 2; } -void VersionTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void VersionTableSection::writeTo(uint8_t *buf) { buf += 2; for (const SymbolTableEntry &s : getPartition().dynSymTab->getSymbols()) { // For an unextracted lazy symbol (undefined weak), it must have been @@ -3779,9 +3769,9 @@ void VersionTableSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool VersionTableSection::isNeeded(Ctx &ctx) const { +bool VersionTableSection::isNeeded() const { return isLive() && - (getPartition().verDef || getPartition().verNeed->isNeeded(ctx)); + (getPartition().verDef || getPartition().verNeed->isNeeded()); } void elf::addVerneed(Symbol *ss) { @@ -3807,7 +3797,7 @@ VersionNeedSection::VersionNeedSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_verneed, sizeof(uint32_t), ".gnu.version_r") {} -template void VersionNeedSection::finalizeContents(Ctx &) { +template void VersionNeedSection::finalizeContents() { for (SharedFile *f : ctx.sharedFiles) { if (f->vernauxs.empty()) continue; @@ -3840,8 +3830,7 @@ template void VersionNeedSection::finalizeContents(Ctx &) { getParent()->info = verneeds.size(); } -template -void VersionNeedSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void VersionNeedSection::writeTo(uint8_t *buf) { // The Elf_Verneeds need to appear first, followed by the Elf_Vernauxs. 
auto *verneed = reinterpret_cast(buf); auto *vernaux = reinterpret_cast(verneed + verneeds.size()); @@ -3871,12 +3860,12 @@ void VersionNeedSection::writeTo(Ctx &ctx, uint8_t *buf) { verneed[-1].vn_next = 0; } -template size_t VersionNeedSection::getSize(Ctx &ctx) const { +template size_t VersionNeedSection::getSize() const { return verneeds.size() * sizeof(Elf_Verneed) + SharedFile::vernauxNum * sizeof(Elf_Vernaux); } -template bool VersionNeedSection::isNeeded(Ctx &) const { +template bool VersionNeedSection::isNeeded() const { return isLive() && SharedFile::vernauxNum != 0; } @@ -3892,11 +3881,11 @@ MergeTailSection::MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, : MergeSyntheticSection(ctx, name, type, flags, alignment), builder(StringTableBuilder::RAW, llvm::Align(alignment)) {} -size_t MergeTailSection::getSize(Ctx &) const { return builder.getSize(); } +size_t MergeTailSection::getSize() const { return builder.getSize(); } -void MergeTailSection::writeTo(Ctx &, uint8_t *buf) { builder.write(buf); } +void MergeTailSection::writeTo(uint8_t *buf) { builder.write(buf); } -void MergeTailSection::finalizeContents(Ctx &) { +void MergeTailSection::finalizeContents() { // Add all string pieces to the string table builder to create section // contents. for (MergeInputSection *sec : sections) @@ -3916,7 +3905,7 @@ void MergeTailSection::finalizeContents(Ctx &) { sec->pieces[i].outputOff = builder.getOffset(sec->getData(i)); } -void MergeNoTailSection::writeTo(Ctx &ctx, uint8_t *buf) { +void MergeNoTailSection::writeTo(uint8_t *buf) { parallelFor(0, numShards, [&](size_t i) { shards[i].write(buf + shardOffsets[i]); }); } @@ -3929,7 +3918,7 @@ void MergeNoTailSection::writeTo(Ctx &ctx, uint8_t *buf) { // value is different from T's. If that's the case, we can safely put S and // T into different string builders without worrying about merge misses. // We do it in parallel. 
-void MergeNoTailSection::finalizeContents(Ctx &) { +void MergeNoTailSection::finalizeContents() { // Initializes string table builders. for (size_t i = 0; i < numShards; ++i) shards.emplace_back(StringTableBuilder::RAW, llvm::Align(addralign)); @@ -4111,7 +4100,7 @@ static bool isDuplicateArmExidxSec(InputSection *prev, InputSection *cur) { // must be sorted in ascending order of address, Sentinel is set to the // InputSection with the highest address and any InputSections that have // mergeable .ARM.exidx table entries are removed from it. -void ARMExidxSyntheticSection::finalizeContents(Ctx &) { +void ARMExidxSyntheticSection::finalizeContents() { // Ensure that any fixed-point iterations after the first see the original set // of sections. if (!originalExecutableSections.empty()) @@ -4198,7 +4187,7 @@ InputSection *ARMExidxSyntheticSection::getLinkOrderDep() const { // section is to terminate the address range of the previous entry. // 3.) A trailing EXIDX_CANTUNWIND sentinel section is required at the end of // the table to terminate the address range of the final entry. -void ARMExidxSyntheticSection::writeTo(Ctx &ctx, uint8_t *buf) { +void ARMExidxSyntheticSection::writeTo(uint8_t *buf) { // A linker generated CANTUNWIND entry is made up of two words: // 0x0 with R_ARM_PREL31 relocation to target. 
@@ -4235,7 +4224,7 @@ void ARMExidxSyntheticSection::writeTo(Ctx &ctx, uint8_t *buf) { assert(size == offset + 8); } -bool ARMExidxSyntheticSection::isNeeded(Ctx &) const { +bool ARMExidxSyntheticSection::isNeeded() const { return llvm::any_of(exidxSections, [](InputSection *isec) { return isec->isLive(); }); } @@ -4247,7 +4236,7 @@ ThunkSection::ThunkSection(Ctx &ctx, OutputSection *os, uint64_t off) this->outSecOff = off; } -size_t ThunkSection::getSize(Ctx &) const { +size_t ThunkSection::getSize() const { if (roundUpSizeForErrata) return alignTo(size, 4096); return size; @@ -4258,7 +4247,7 @@ void ThunkSection::addThunk(Thunk *t) { t->addSymbols(*this); } -void ThunkSection::writeTo(Ctx &ctx, uint8_t *buf) { +void ThunkSection::writeTo(uint8_t *buf) { for (Thunk *t : thunks) t->writeTo(buf + t->offset); } @@ -4287,7 +4276,7 @@ bool ThunkSection::assignOffsets() { PPC32Got2Section::PPC32Got2Section(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 4, ".got2") {} -bool PPC32Got2Section::isNeeded(Ctx &) const { +bool PPC32Got2Section::isNeeded() const { // See the comment below. This is not needed if there is no other // InputSection. for (SectionCommand *cmd : getParent()->commands) @@ -4298,7 +4287,7 @@ bool PPC32Got2Section::isNeeded(Ctx &) const { return false; } -void PPC32Got2Section::finalizeContents(Ctx &) { +void PPC32Got2Section::finalizeContents() { // PPC32 may create multiple GOT sections for -fPIC/-fPIE, one per file in // .got2 . This function computes outSecOff of each .got2 to be used in // PPC32PltCallStub::writeTo(). 
The purpose of this empty synthetic section is @@ -4337,11 +4326,11 @@ PPC64LongBranchTargetSection::addEntry(const Symbol *sym, int64_t addend) { return res.first->second; } -size_t PPC64LongBranchTargetSection::getSize(Ctx &ctx) const { +size_t PPC64LongBranchTargetSection::getSize() const { return entries.size() * 8; } -void PPC64LongBranchTargetSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PPC64LongBranchTargetSection::writeTo(uint8_t *buf) { // If linking non-pic we have the final addresses of the targets and they get // written to the table directly. For pic the dynamic linker will allocate // the section and fill it. @@ -4360,7 +4349,7 @@ void PPC64LongBranchTargetSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool PPC64LongBranchTargetSection::isNeeded(Ctx &) const { +bool PPC64LongBranchTargetSection::isNeeded() const { // `removeUnusedSyntheticSections()` is called before thunk allocation which // is too early to determine if this section will be empty or not. We need // Finalized to keep the section alive until after thunk creation. Finalized @@ -4434,12 +4423,12 @@ PartitionElfHeaderSection::PartitionElfHeaderSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_EHDR, 1, "") {} template -size_t PartitionElfHeaderSection::getSize(Ctx &ctx) const { +size_t PartitionElfHeaderSection::getSize() const { return sizeof(typename ELFT::Ehdr); } template -void PartitionElfHeaderSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PartitionElfHeaderSection::writeTo(uint8_t *buf) { writeEhdr(buf, getPartition()); // Loadable partitions are always ET_DYN. 
@@ -4452,29 +4441,29 @@ PartitionProgramHeadersSection::PartitionProgramHeadersSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_PHDR, 1, ".phdrs") {} template -size_t PartitionProgramHeadersSection::getSize(Ctx &ctx) const { +size_t PartitionProgramHeadersSection::getSize() const { return sizeof(typename ELFT::Phdr) * getPartition().phdrs.size(); } template -void PartitionProgramHeadersSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PartitionProgramHeadersSection::writeTo(uint8_t *buf) { writePhdrs(buf, getPartition()); } PartitionIndexSection::PartitionIndexSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".rodata") {} -size_t PartitionIndexSection::getSize(Ctx &ctx) const { +size_t PartitionIndexSection::getSize() const { return 12 * (ctx.partitions.size() - 1); } -void PartitionIndexSection::finalizeContents(Ctx &) { +void PartitionIndexSection::finalizeContents() { for (size_t i = 1; i != ctx.partitions.size(); ++i) ctx.partitions[i].nameStrTab = ctx.mainPart->dynStrTab->addString(ctx.partitions[i].name); } -void PartitionIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PartitionIndexSection::writeTo(uint8_t *buf) { uint64_t va = getVA(); for (size_t i = 1; i != ctx.partitions.size(); ++i) { write32(buf, ctx.mainPart->dynStrTab->getVA() + @@ -4544,7 +4533,7 @@ bool elf::canHaveMemtagGlobals() { } constexpr char kMemtagAndroidNoteName[] = "Android"; -void MemtagAndroidNote::writeTo(Ctx &ctx, uint8_t *buf) { +void MemtagAndroidNote::writeTo(uint8_t *buf) { static_assert( sizeof(kMemtagAndroidNoteName) == 8, "Android 11 & 12 have an ABI that the note name is 8 bytes long. 
Keep it " @@ -4567,13 +4556,13 @@ void MemtagAndroidNote::writeTo(Ctx &ctx, uint8_t *buf) { write32(buf, value); // note value } -size_t MemtagAndroidNote::getSize(Ctx &ctx) const { +size_t MemtagAndroidNote::getSize() const { return sizeof(llvm::ELF::Elf64_Nhdr) + /*namesz=*/alignTo(sizeof(kMemtagAndroidNoteName), 4) + /*descsz=*/sizeof(uint32_t); } -void PackageMetadataNote::writeTo(Ctx &ctx, uint8_t *buf) { +void PackageMetadataNote::writeTo(uint8_t *buf) { write32(buf, 4); write32(buf + 4, ctx.arg.packageMetadata.size() + 1); write32(buf + 8, FDO_PACKAGING_METADATA); @@ -4582,7 +4571,7 @@ void PackageMetadataNote::writeTo(Ctx &ctx, uint8_t *buf) { ctx.arg.packageMetadata.size()); } -size_t PackageMetadataNote::getSize(Ctx &ctx) const { +size_t PackageMetadataNote::getSize() const { return sizeof(llvm::ELF::Elf64_Nhdr) + 4 + alignTo(ctx.arg.packageMetadata.size() + 1, 4); } @@ -4643,19 +4632,19 @@ createMemtagGlobalDescriptors(Ctx &ctx, } bool MemtagGlobalDescriptors::updateAllocSize(Ctx &ctx) { - size_t oldSize = getSize(ctx); + size_t oldSize = getSize(); std::stable_sort(symbols.begin(), symbols.end(), [](const Symbol *s1, const Symbol *s2) { return s1->getVA() < s2->getVA(); }); - return oldSize != getSize(ctx); + return oldSize != getSize(); } -void MemtagGlobalDescriptors::writeTo(Ctx &ctx, uint8_t *buf) { +void MemtagGlobalDescriptors::writeTo(uint8_t *buf) { createMemtagGlobalDescriptors(ctx, symbols, buf); } -size_t MemtagGlobalDescriptors::getSize(Ctx &ctx) const { +size_t MemtagGlobalDescriptors::getSize() const { return createMemtagGlobalDescriptors(ctx, symbols); } @@ -4853,7 +4842,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { ctx.in.partIndex = std::make_unique(ctx); addOptionalRegular("__part_index_begin", ctx.in.partIndex.get(), 0); addOptionalRegular("__part_index_end", ctx.in.partIndex.get(), - ctx.in.partIndex->getSize(ctx)); + ctx.in.partIndex->getSize()); add(*ctx.in.partIndex); } diff --git a/lld/ELF/SyntheticSections.h 
b/lld/ELF/SyntheticSections.h index 283b2953449e59..421ef760ef4a09 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -51,10 +51,10 @@ struct CieRecord { class EhFrameSection final : public SyntheticSection { public: EhFrameSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - void finalizeContents(Ctx &) override; - bool isNeeded(Ctx &) const override { return !sections.empty(); } - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *buf) override; + void finalizeContents() override; + bool isNeeded() const override { return !sections.empty(); } + size_t getSize() const override { return size; } static bool classof(const SectionBase *d) { return SyntheticSection::classof(d) && d->name == ".eh_frame"; @@ -105,10 +105,10 @@ class EhFrameSection final : public SyntheticSection { class GotSection final : public SyntheticSection { public: GotSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return size; } - void finalizeContents(Ctx &) override; - bool isNeeded(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return size; } + void finalizeContents() override; + bool isNeeded() const override; + void writeTo(uint8_t *buf) override; void addConstant(const Relocation &r); void addEntry(const Symbol &sym); @@ -139,15 +139,15 @@ class GnuStackSection : public SyntheticSection { GnuStackSection(Ctx &ctx) : SyntheticSection(ctx, 0, llvm::ELF::SHT_PROGBITS, 1, ".note.GNU-stack") {} - void writeTo(Ctx &, uint8_t *buf) override {} - size_t getSize(Ctx &ctx) const override { return 0; } + void writeTo(uint8_t *buf) override {} + size_t getSize() const override { return 0; } }; class GnuPropertySection final : public SyntheticSection { public: GnuPropertySection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; }; // .note.gnu.build-id 
section. @@ -158,8 +158,8 @@ class BuildIdSection : public SyntheticSection { public: const size_t hashSize; BuildIdSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return headerSize + hashSize; } + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return headerSize + hashSize; } void writeBuildId(llvm::ArrayRef buf); private: @@ -173,9 +173,9 @@ class BuildIdSection : public SyntheticSection { class BssSection final : public SyntheticSection { public: BssSection(Ctx &, StringRef name, uint64_t size, uint32_t addralign); - void writeTo(Ctx &, uint8_t *) override {} - bool isNeeded(Ctx &) const override { return size != 0; } - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *) override {} + bool isNeeded() const override { return size != 0; } + size_t getSize() const override { return size; } static bool classof(const SectionBase *s) { return s->bss; } uint64_t size; @@ -184,11 +184,11 @@ class BssSection final : public SyntheticSection { class MipsGotSection final : public SyntheticSection { public: MipsGotSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } bool updateAllocSize(Ctx &) override; - void finalizeContents(Ctx &) override; - bool isNeeded(Ctx &) const override; + void finalizeContents() override; + bool isNeeded() const override; // Join separate GOTs built for each input file to generate // primary and optional multiple secondary GOTs. 
@@ -362,9 +362,9 @@ class GotPltSection final : public SyntheticSection { public: GotPltSection(Ctx &); void addEntry(Symbol &sym); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; // Flag to force GotPlt to be in output if we have relocations // that relies on its address. @@ -382,9 +382,9 @@ class IgotPltSection final : public SyntheticSection { public: IgotPltSection(Ctx &); void addEntry(Symbol &sym); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override { return !entries.empty(); } + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override { return !entries.empty(); } private: SmallVector entries; @@ -394,8 +394,8 @@ class StringTableSection final : public SyntheticSection { public: StringTableSection(Ctx &, StringRef name, bool dynamic); unsigned addString(StringRef s, bool hashIt = true); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } bool isDynamic() const { return dynamic; } private: @@ -485,9 +485,9 @@ template class DynamicSection final : public SyntheticSection { public: DynamicSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } private: std::vector> computeContents(); @@ -538,17 +538,15 @@ class RelocationBaseSection : public SyntheticSection { sec.addReloc({expr, addendRelType, offsetInSec, addend, &sym}); addReloc({dynType, &sec, offsetInSec, kind, sym, addend, expr}); } - bool 
isNeeded(Ctx &) const override { + bool isNeeded() const override { return !relocs.empty() || llvm::any_of(relocsVec, [](auto &v) { return !v.empty(); }); } - size_t getSize(Ctx &ctx) const override { - return relocs.size() * this->entsize; - } + size_t getSize() const override { return relocs.size() * this->entsize; } size_t getRelativeRelocCount() const { return numRelativeRelocs; } void mergeRels(); void partitionRels(); - void finalizeContents(Ctx &) override; + void finalizeContents() override; static bool classof(const SectionBase *d) { return SyntheticSection::classof(d) && (d->type == llvm::ELF::SHT_RELA || d->type == llvm::ELF::SHT_REL || @@ -581,7 +579,7 @@ class RelocationSection final : public RelocationBaseSection { public: RelocationSection(Ctx &, StringRef name, bool combreloc, unsigned concurrency); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; }; template @@ -593,8 +591,8 @@ class AndroidPackedRelocationSection final : public RelocationBaseSection { AndroidPackedRelocationSection(Ctx &, StringRef name, unsigned concurrency); bool updateAllocSize(Ctx &) override; - size_t getSize(Ctx &ctx) const override { return relocData.size(); } - void writeTo(Ctx &, uint8_t *buf) override { + size_t getSize() const override { return relocData.size(); } + void writeTo(uint8_t *buf) override { memcpy(buf, relocData.data(), relocData.size()); } @@ -615,7 +613,7 @@ class RelrBaseSection : public SyntheticSection { public: RelrBaseSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); void mergeRels(); - bool isNeeded(Ctx &) const override { + bool isNeeded() const override { return !relocs.empty() || llvm::any_of(relocsVec, [](auto &v) { return !v.empty(); }); } @@ -634,11 +632,9 @@ template class RelrSection final : public RelrBaseSection { RelrSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); bool updateAllocSize(Ctx &) override; - size_t getSize(Ctx &ctx) const override { - return 
relrRelocs.size() * this->entsize; - } - void writeTo(Ctx &ctx, uint8_t *buf) override { - memcpy(buf, relrRelocs.data(), getSize(ctx)); + size_t getSize() const override { return relrRelocs.size() * this->entsize; } + void writeTo(uint8_t *buf) override { + memcpy(buf, relrRelocs.data(), getSize()); } private: @@ -653,8 +649,8 @@ struct SymbolTableEntry { class SymbolTableBaseSection : public SyntheticSection { public: SymbolTableBaseSection(Ctx &ctx, StringTableSection &strTabSec); - void finalizeContents(Ctx &) override; - size_t getSize(Ctx &ctx) const override { return getNumSymbols() * entsize; } + void finalizeContents() override; + size_t getSize() const override { return getNumSymbols() * entsize; } void addSymbol(Symbol *sym); unsigned getNumSymbols() const { return symbols.size() + 1; } size_t getSymbolIndex(const Symbol &sym); @@ -679,17 +675,17 @@ class SymbolTableSection final : public SymbolTableBaseSection { public: SymbolTableSection(Ctx &, StringTableSection &strTabSec); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; }; class SymtabShndxSection final : public SyntheticSection { public: SymtabShndxSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; - void finalizeContents(Ctx &) override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; + void finalizeContents() override; }; // Outputs GNU Hash section. 
For detailed explanation see: @@ -697,9 +693,9 @@ class SymtabShndxSection final : public SyntheticSection { class GnuHashTableSection final : public SyntheticSection { public: GnuHashTableSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } // Adds symbols to the hash table. // Sorts the input to satisfy GNU hash section requirements. @@ -725,9 +721,9 @@ class GnuHashTableSection final : public SyntheticSection { class HashTableSection final : public SyntheticSection { public: HashTableSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } private: size_t size = 0; @@ -747,9 +743,9 @@ class HashTableSection final : public SyntheticSection { class PltSection : public SyntheticSection { public: PltSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; void addSymbols(); void addEntry(Symbol &sym); size_t getNumEntries() const { return entries.size(); } @@ -768,9 +764,9 @@ class IpltSection final : public SyntheticSection { public: IpltSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override { return !entries.empty(); } + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override { return !entries.empty(); } void addSymbols(); void addEntry(Symbol &sym); }; @@ -778,8 +774,8 @@ class 
IpltSection final : public SyntheticSection { class PPC32GlinkSection : public PltSection { public: PPC32GlinkSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; SmallVector canonical_plts; static constexpr size_t footerSize = 64; @@ -789,9 +785,9 @@ class PPC32GlinkSection : public PltSection { class IBTPltSection : public SyntheticSection { public: IBTPltSection(Ctx &); - void writeTo(Ctx &, uint8_t *Buf) override; - bool isNeeded(Ctx &) const override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *Buf) override; + bool isNeeded() const override; + size_t getSize() const override; }; // Used to align the end of the PT_GNU_RELRO segment and the associated PT_LOAD @@ -800,8 +796,8 @@ class IBTPltSection : public SyntheticSection { class RelroPaddingSection final : public SyntheticSection { public: RelroPaddingSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return 0; } - void writeTo(Ctx &, uint8_t *buf) override {} + size_t getSize() const override { return 0; } + void writeTo(uint8_t *buf) override {} }; // Used by the merged DWARF32 .debug_names (a per-module index). 
If we @@ -875,8 +871,8 @@ class DebugNamesBaseSection : public SyntheticSection { }; DebugNamesBaseSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return size; } - bool isNeeded(Ctx &) const override { return numChunks > 0; } + size_t getSize() const override { return size; } + bool isNeeded() const override { return numChunks > 0; } protected: void init(llvm::function_ref); @@ -919,8 +915,8 @@ template class DebugNamesSection final : public DebugNamesBaseSection { public: DebugNamesSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; + void finalizeContents() override; + void writeTo(uint8_t *buf) override; template void getNameRelocs(const InputFile &file, @@ -968,9 +964,9 @@ class GdbIndexSection final : public SyntheticSection { GdbIndexSection(Ctx &); template static std::unique_ptr create(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } - bool isNeeded(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } + bool isNeeded() const override; private: struct GdbIndexHeader { @@ -1007,9 +1003,9 @@ class EhFrameHeader final : public SyntheticSection { public: EhFrameHeader(Ctx &); void write(); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; }; // For more information about .gnu.version and .gnu.version_r see: @@ -1023,9 +1019,9 @@ class EhFrameHeader final : public SyntheticSection { class VersionDefinitionSection final : public SyntheticSection { public: VersionDefinitionSection(Ctx &); - void finalizeContents(Ctx &) override; - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + void finalizeContents() override; + size_t getSize() const override; + void 
writeTo(uint8_t *buf) override; private: enum { EntrySize = 28 }; @@ -1045,10 +1041,10 @@ class VersionDefinitionSection final : public SyntheticSection { class VersionTableSection final : public SyntheticSection { public: VersionTableSection(Ctx &); - void finalizeContents(Ctx &) override; - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; + void finalizeContents() override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; }; // The .gnu.version_r section defines the version identifiers used by @@ -1076,10 +1072,10 @@ class VersionNeedSection final : public SyntheticSection { public: VersionNeedSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; }; // MergeSyntheticSection is a class that allows us to put mergeable sections @@ -1102,9 +1098,9 @@ class MergeTailSection final : public MergeSyntheticSection { MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - void finalizeContents(Ctx &) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + void finalizeContents() override; private: llvm::StringTableBuilder builder; @@ -1116,9 +1112,9 @@ class MergeNoTailSection final : public MergeSyntheticSection { uint32_t addralign) : MergeSyntheticSection(ctx, name, type, flags, addralign) {} - size_t getSize(Ctx &ctx) const override { return size; } - void writeTo(Ctx &, uint8_t *buf) override; - void finalizeContents(Ctx &) override; + size_t getSize() const override { return size; } + void writeTo(uint8_t *buf) 
override; + void finalizeContents() override; private: // We use the most significant bits of a hash as a shard ID. @@ -1149,8 +1145,8 @@ class MipsAbiFlagsSection final : public SyntheticSection { static std::unique_ptr create(Ctx &); MipsAbiFlagsSection(Ctx &, Elf_Mips_ABIFlags flags); - size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_ABIFlags); } - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return sizeof(Elf_Mips_ABIFlags); } + void writeTo(uint8_t *buf) override; private: Elf_Mips_ABIFlags flags; @@ -1165,9 +1161,9 @@ template class MipsOptionsSection final : public SyntheticSection { static std::unique_ptr> create(Ctx &); MipsOptionsSection(Ctx &, Elf_Mips_RegInfo reginfo); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { + size_t getSize() const override { return sizeof(Elf_Mips_Options) + sizeof(Elf_Mips_RegInfo); } @@ -1183,8 +1179,8 @@ template class MipsReginfoSection final : public SyntheticSection { static std::unique_ptr create(Ctx &); MipsReginfoSection(Ctx &, Elf_Mips_RegInfo reginfo); - size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_RegInfo); } - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return sizeof(Elf_Mips_RegInfo); } + void writeTo(uint8_t *buf) override; private: Elf_Mips_RegInfo reginfo; @@ -1197,8 +1193,8 @@ template class MipsReginfoSection final : public SyntheticSection { class MipsRldMapSection final : public SyntheticSection { public: MipsRldMapSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return ctx.arg.wordsize; } - void writeTo(Ctx &, uint8_t *buf) override {} + size_t getSize() const override { return ctx.arg.wordsize; } + void writeTo(uint8_t *buf) override {} }; // Representation of the combined .ARM.Exidx input sections. 
We process these @@ -1243,11 +1239,11 @@ class ARMExidxSyntheticSection : public SyntheticSection { // section needs to be removed from the main input section list. bool addSection(InputSection *isec); - size_t getSize(Ctx &ctx) const override { return size; } - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; + size_t getSize() const override { return size; } + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; // Sort and remove duplicate entries. - void finalizeContents(Ctx &) override; + void finalizeContents() override; InputSection *getLinkOrderDep() const; static bool classof(const SectionBase *sec) { @@ -1291,8 +1287,8 @@ class ThunkSection final : public SyntheticSection { // Thunk defines a symbol in this InputSection that can be used as target // of a relocation void addThunk(Thunk *t); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; InputSection *getTargetInputSection() const; bool assignOffsets(); @@ -1315,17 +1311,16 @@ class ArmCmseSGVeneer; class ArmCmseSGSection final : public SyntheticSection { public: ArmCmseSGSection(Ctx &ctx); - bool isNeeded(Ctx &) const override { return !entries.empty(); } - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + bool isNeeded() const override { return !entries.empty(); } + size_t getSize() const override; + void writeTo(uint8_t *buf) override; void addSGVeneer(Symbol *sym, Symbol *ext_sym); void addMappingSymbol(); - void finalizeContents(Ctx &) override; + void finalizeContents() override; void exportEntries(SymbolTableBaseSection *symTab); uint64_t impLibMaxAddr = 0; private: - Ctx &ctx; SmallVector, 0> entries; SmallVector sgVeneers; uint64_t newEntries = 0; @@ -1336,10 +1331,10 @@ class ArmCmseSGSection final : public SyntheticSection { class PPC32Got2Section final : public SyntheticSection { public: 
PPC32Got2Section(Ctx &); - size_t getSize(Ctx &ctx) const override { return 0; } - bool isNeeded(Ctx &) const override; - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override {} + size_t getSize() const override { return 0; } + bool isNeeded() const override; + void finalizeContents() override; + void writeTo(uint8_t *buf) override {} }; // This section is used to store the addresses of functions that are called @@ -1352,10 +1347,10 @@ class PPC64LongBranchTargetSection final : public SyntheticSection { PPC64LongBranchTargetSection(Ctx &); uint64_t getEntryVA(const Symbol *sym, int64_t addend); std::optional addEntry(const Symbol *sym, int64_t addend); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; - void finalizeContents(Ctx &) override { finalized = true; } + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; + void finalizeContents() override { finalized = true; } private: SmallVector, 0> entries; @@ -1367,24 +1362,24 @@ template class PartitionElfHeaderSection final : public SyntheticSection { public: PartitionElfHeaderSection(Ctx &); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; }; template class PartitionProgramHeadersSection final : public SyntheticSection { public: PartitionProgramHeadersSection(Ctx &); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; }; class PartitionIndexSection final : public SyntheticSection { public: PartitionIndexSection(Ctx &); - size_t getSize(Ctx &) const override; - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void finalizeContents() override; + void writeTo(uint8_t *buf) 
override; }; // See the following link for the Android-specific loader code that operates on @@ -1395,8 +1390,8 @@ class MemtagAndroidNote final : public SyntheticSection { MemtagAndroidNote(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.android.memtag") {} - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; }; class PackageMetadataNote final : public SyntheticSection { @@ -1404,8 +1399,8 @@ class PackageMetadataNote final : public SyntheticSection { PackageMetadataNote(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.package") {} - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; }; class MemtagGlobalDescriptors final : public SyntheticSection { @@ -1414,19 +1409,19 @@ class MemtagGlobalDescriptors final : public SyntheticSection { : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_AARCH64_MEMTAG_GLOBALS_DYNAMIC, /*alignment=*/4, ".memtag.globals.dynamic") {} - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; // The size of the section is non-computable until all addresses are // synthetized, because the section's contents contain a sorted // varint-compressed list of pointers to global variables. We only know the // final size after `finalizeAddressDependentContent()`. 
- size_t getSize(Ctx &) const override; + size_t getSize() const override; bool updateAllocSize(Ctx &) override; void addSymbol(const Symbol &sym) { symbols.push_back(&sym); } - bool isNeeded(Ctx &) const override { return !symbols.empty(); } + bool isNeeded() const override { return !symbols.empty(); } private: SmallVector symbols; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index f4a22ea953ec49..f9a21b6745fdd1 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -837,10 +837,10 @@ template void Writer::setReservedSymbolSections() { } // .rela_iplt_{start,end} mark the start and the end of .rel[a].dyn. - if (ctx.sym.relaIpltStart && ctx.mainPart->relaDyn->isNeeded(ctx)) { + if (ctx.sym.relaIpltStart && ctx.mainPart->relaDyn->isNeeded()) { ctx.sym.relaIpltStart->section = ctx.mainPart->relaDyn.get(); ctx.sym.relaIpltEnd->section = ctx.mainPart->relaDyn.get(); - ctx.sym.relaIpltEnd->value = ctx.mainPart->relaDyn->getSize(ctx); + ctx.sym.relaIpltEnd->value = ctx.mainPart->relaDyn->getSize(); } PhdrEntry *last = nullptr; @@ -1425,9 +1425,9 @@ template void Writer::resolveShfLinkOrder() { } static void finalizeSynthetic(Ctx &ctx, SyntheticSection *sec) { - if (sec && sec->isNeeded(ctx) && sec->getParent()) { + if (sec && sec->isNeeded() && sec->getParent()) { llvm::TimeTraceScope timeScope("Finalize synthetic sections", sec->name); - sec->finalizeContents(ctx); + sec->finalizeContents(); } } @@ -1679,7 +1679,7 @@ static void removeUnusedSyntheticSections(Ctx &ctx) { auto end = std::remove_if(start, ctx.inputSections.end(), [&](InputSectionBase *s) { auto *sec = cast(s); - if (sec->getParent() && sec->isNeeded(ctx)) + if (sec->getParent() && sec->isNeeded()) return false; // .relr.auth.dyn relocations may be moved to .rela.dyn in // finalizeAddressDependentContent, making .rela.dyn no longer empty. 
@@ -1810,9 +1810,9 @@ template void Writer::finalizeSections() { reportUndefinedSymbols(ctx); postScanRelocations(ctx); - if (ctx.in.plt && ctx.in.plt->isNeeded(ctx)) + if (ctx.in.plt && ctx.in.plt->isNeeded()) ctx.in.plt->addSymbols(); - if (ctx.in.iplt && ctx.in.iplt->isNeeded(ctx)) + if (ctx.in.iplt && ctx.in.iplt->isNeeded()) ctx.in.iplt->addSymbols(); if (ctx.arg.unresolvedSymbolsInShlib != UnresolvedPolicy::Ignore) { @@ -2312,7 +2312,7 @@ SmallVector Writer::createPhdrs(Partition &part) { ret.push_back(relRo); // PT_GNU_EH_FRAME is a special section pointing on .eh_frame_hdr. - if (part.ehFrame->isNeeded(ctx) && part.ehFrameHdr && + if (part.ehFrame->isNeeded() && part.ehFrameHdr && part.ehFrame->getParent() && part.ehFrameHdr->getParent()) addHdr(PT_GNU_EH_FRAME, part.ehFrameHdr->getParent()->getPhdrFlags()) ->add(part.ehFrameHdr->getParent()); @@ -2574,7 +2574,7 @@ template void Writer::setPhdrs(Partition &part) { // output section. We always want to describe just the // SyntheticSection. if (part.armExidx && p->p_type == PT_ARM_EXIDX) { - p->p_filesz = part.armExidx->getSize(ctx); + p->p_filesz = part.armExidx->getSize(); p->p_memsz = p->p_filesz; p->p_offset = first->offset + part.armExidx->outSecOff; p->p_vaddr = first->addr + part.armExidx->outSecOff; From 173c68239d1d11f4e36c8af07a28310da67568a7 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 11 Oct 2024 08:50:49 +0200 Subject: [PATCH 133/177] [AMDGPU] Enable unaligned scratch accesses (#110219) This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics. 
Also: Make the use of the FeatureUnalignedScratchAccess feature more consistent: FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas, before, code assumed that the latter implies the former at some places. Part of SWDEV-455845. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 24 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 1037 ++- .../AMDGPU/GlobalISel/legalize-load-flat.mir | 3222 +------- .../GlobalISel/legalize-load-private.mir | 5246 +++++++------ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 28 +- .../test/CodeGen/AMDGPU/flat-address-space.ll | 12 +- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 98 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 2438 +----- .../AMDGPU/memcpy-param-combinations.ll | 6516 +++-------------- .../AMDGPU/memmove-param-combinations.ll | 5196 ++----------- llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 4 +- .../CodeGen/AMDGPU/unaligned-load-store.ll | 28 +- 15 files changed, 6082 insertions(+), 17791 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25117544d6a849..62fac085897ab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, - FeatureVmemWriteVgprInOrder + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder ] >; @@ -1199,9 
+1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureVmemWriteVgprInOrder @@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureVmemWriteVgprInOrder ] @@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, 
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics ] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3f4f42377d56ee..d701bf037fdfa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && - ChainSizeInBytes <= ST->getMaxPrivateElementSize(); + return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 19458126093167..1ea3beb2855d69 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -591,6 +591,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return UnalignedScratchAccess; } + bool hasUnalignedScratchAccessEnabled() const { + return UnalignedScratchAccess && UnalignedAccessMode; + } + bool hasUnalignedAccessMode() const { return UnalignedAccessMode; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3d8e03521e2b90..8c197f23149612 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1824,26 +1824,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Subtarget->hasUnalignedDSAccessEnabled(); } - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; - - return AlignedBy4 || - Subtarget->enableFlatScratch() || - 
Subtarget->hasUnalignedScratchAccess(); - } - // FIXME: We have to be conservative here and assume that flat operations // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. - if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && - !Subtarget->hasUnalignedScratchAccess()) { + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4; + return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); } // So long as they are correct, wide global memory operations perform better diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index ce528467cd35b4..6e2e88f22600a8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -2428,11 +2428,54 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v4, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v3, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v4, off +; 
UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v3, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v8, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,30 +2484,143 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v3, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v3, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt 
vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v4, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v4, off sc0 
sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v8, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
-; UNALIGNED_GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v3, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v3, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,12 +2631,39 @@ define void @store_load_i64_unaligned(ptr 
addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: 
scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2572,59 +2755,293 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, 
off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v12, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v10, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v10, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt 
vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, s2 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, s1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, s0 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX940-NEXT: 
scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; 
UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 
2, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_add_nc_u32 v5, 1, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v10, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; 
UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2635,16 +3052,57 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; 
UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 
v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2742,64 +3200,382 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, 
v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v13, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte 
v14, v6, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v13, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v14, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v16, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v15, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; 
UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v0, v6, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: 
scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v11, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v10, v11, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v8, 4 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v14, v8, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: 
scratch_store_byte v16, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v17, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v10, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v14, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v15, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v16, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v17, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: 
store_load_v4i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; 
UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v13, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v14, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v16, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v15, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: 
scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_add_nc_u32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v8, 4 :: v_dual_add_nc_u32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v3, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
UNALIGNED_GFX11-NEXT: scratch_store_b8 v10, v11, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v14, v8, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v15, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v16, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v17, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v10, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v14, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v15, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v16, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v17, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2810,17 +3586,74 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; 
UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 4 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: 
scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index 
b1d7d36f9912e7..032ca7c0d4fee9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -483,40 +483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s16_align1 ; GFX12: 
liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -664,40 +646,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -798,70 +762,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) 
= G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1247,76 +1163,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: 
[[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1485,130 +1347,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT 
i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from 
unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: 
[[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -2075,87 +1829,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: 
[[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2 @@ -2369,165 +2060,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; 
GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 
7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: 
$vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; 
GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: 
[[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1 @@ -3334,210 +2884,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from 
unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) 
from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX12-LABEL: name: test_load_flat_s128_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], 
[[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1 @@ -4132,133 +3496,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) 
= G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; 
GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - 
; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX12-LABEL: name: test_load_flat_p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; 
GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4662,79 +3915,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from 
unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from 
unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4906,133 +4102,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; 
GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5274,43 +4359,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; 
GFX11PLUS-LABEL: name: test_load_flat_p5_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ 
-5416,73 +4480,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: 
(load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load 
(s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5732,40 +4745,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) 
= G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v2s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: 
test_load_flat_v2s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6158,121 +5153,106 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9PLUS-NEXT: 
[[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; 
GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX11PLUS-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX11PLUS-NEXT: 
[[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX12-LABEL: name: test_load_flat_v3s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = 
G_SHL [[AND3]], [[C5]](s16) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6503,40 +5483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; 
GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6638,70 +5600,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; 
GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7185,40 +6099,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY 
$vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7327,70 +6223,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -8291,36 +7139,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) 
; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8334,36 +7168,22 @@ body: | ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; 
GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8377,36 +7197,22 @@ body: | ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; 
GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8765,70 
+7571,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY 
$vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) 
= G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -9005,124 +7763,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from 
unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - 
; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; 
GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -10686,133 +9342,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 
2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; 
GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: 
[[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; 
GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD 
[[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -11100,235 +9645,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) 
:: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: 
[[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - 
; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; 
GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; 
GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -12078,342 +10410,42 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; 
GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: 
[[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load 
(<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: 
[[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: 
[[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: 
[[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v3s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; 
GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: 
(load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: 
[[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1 @@ -13306,441 +11338,33 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) 
- ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: 
(load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = 
G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], 
[[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX9PLUS-NEXT: 
[[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) 
- ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; 
GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from 
unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) 
from unknown-address + 30) - ; GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v4s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; 
GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: 
[[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: 
[[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1 @@ -14762,210 +12386,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - 
; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; 
GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; 
GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: 
[[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX12-LABEL: name: test_load_flat_v2p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1 @@ -15422,124 +12860,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; 
GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX11PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; 
GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 741f878c86f8b6..6d93112aae1a06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -636,27 +636,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s16_align1 ; GFX11: liveins: $vgpr0 @@ -702,15 +690,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -853,27 +853,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align2 ; GFX11: liveins: $vgpr0 @@ -919,15 +907,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -1012,47 +1012,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from 
unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align1 ; GFX11: liveins: $vgpr0 @@ -1118,15 +1086,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR2]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -1529,39 +1529,27 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX10-LABEL: name: test_load_private_s24_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX11-LABEL: name: test_load_private_s24_align1 ; GFX11: liveins: $vgpr0 @@ -1631,27 +1619,39 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], 
[[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR1]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s24_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load (s24), 
align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -2147,42 +2147,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align2 @@ -2245,15 +2225,51 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; 
UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 
2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2386,78 +2402,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - 
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = 
G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align1 @@ -2556,15 +2516,87 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; 
UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL 
[[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) %0:_(p5) = COPY $vgpr0 
%1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2742,53 +2774,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2796,53 +2789,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], 
[[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2974,16 +2928,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = 
G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -3381,28 +3427,14 @@ body: | ; GFX9: liveins: $vgpr0 ; 
GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x 
s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3410,28 +3442,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; 
GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3513,16 +3531,58 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 5) @@ -3701,53 +3761,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3755,53 +3776,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], 
[[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR 
[[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3933,16 +3915,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + 
; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from 
unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from 
unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -4166,68 +4240,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, 
addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4235,68 +4258,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; 
GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4458,16 +4430,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: 
[[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR 
[[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; 
UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: 
(load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -4928,35 +5022,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4964,35 +5040,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5088,16 +5146,72 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 2, addrspace 5) @@ -5321,68 +5435,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - 
; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; 
GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5390,68 +5453,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; 
GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[BITCAST]](s128) ; @@ -5613,16 +5625,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -5932,42 +6066,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from 
unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align2 @@ -6030,15 +6144,53 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 
4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6171,78 +6323,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; 
GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align1 @@ -6341,15 +6437,89 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 
5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6494,29 +6664,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align2 ; GFX11: liveins: $vgpr0 @@ -6564,15 +6720,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], 
[[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -6660,49 +6830,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align1 ; GFX11: liveins: $vgpr0 @@ -6770,15 +6906,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -6923,29 +7093,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align2 ; GFX11: liveins: $vgpr0 @@ -6993,15 +7149,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -7089,49 +7259,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from 
unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align1 ; GFX11: liveins: $vgpr0 @@ -7199,15 +7335,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -7357,30 +7527,20 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR 
[[OR]](s32), [[LSHR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s8_align1 @@ -7437,20 +7597,30 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; 
UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 5) @@ -7938,81 +8108,71 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = 
G_ZEXT [[OR2]](s16) - ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX10-LABEL: name: test_load_private_v3s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) 
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; 
GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX10-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11-LABEL: name: test_load_private_v3s8_align1 ; GFX11: liveins: $vgpr0 @@ -8168,71 +8328,81 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from 
unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s16) = 
G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR4]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: 
[[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; 
UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 5) %2:_(s24) = G_BITCAST %1 @@ -8658,136 +8828,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; 
GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v16s8_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; 
GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v16s8_align16 @@ -8944,15 +9012,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + 
; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v16s8_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; 
UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; 
UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 1, addrspace 5) %2:_(<4 x s32>) = G_BITCAST %1 @@ -9107,27 +9297,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align2 ; GFX11: liveins: $vgpr0 @@ -9173,15 +9351,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; 
UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -9278,47 +9468,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = 
G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align1 ; GFX11: liveins: $vgpr0 @@ -9384,15 +9542,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -9824,27 +10014,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -9853,27 +10042,26 @@ body: | ; GFX10: liveins: $vgpr0 ; 
GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], 
[[C2]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10215,41 +10403,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 
(s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10258,41 +10431,26 @@ 
body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: 
[[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10445,22 +10603,36 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; 
UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10474,22 +10646,36 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX12-NEXT: 
[[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10827,44 +11013,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD2]](s32) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align2 @@ -10929,15 +11093,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11091,80 +11287,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR 
[[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from 
unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, 
addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align1 @@ -11265,15 +11403,83 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; 
UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11582,42 +11788,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = 
COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; 
GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align2 @@ -11680,15 +11866,43 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11821,78 +12035,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address 
+ 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, 
addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; 
GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY 
[[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align1 @@ -11991,15 +12149,79 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -12174,106 +12396,28 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 
5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; 
GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; 
GFX10-LABEL: name: test_load_private_v3s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) 
from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: 
(load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX11-LABEL: name: test_load_private_v3s32_align16 @@ -12400,15 +12544,107 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; 
UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + 
; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 
5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -12764,136 +13000,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: 
[[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: 
(load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align16 @@ -13050,15 +13184,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + 
; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) 
:: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13493,70 +13749,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), 
[[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align2 @@ -13647,15 +13867,71 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 2, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13875,136 +14151,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from 
unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load 
(s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - 
; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; 
GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align1 @@ -14161,15 +14335,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; 
UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR 
[[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -15262,68 +15558,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load 
(s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; 
GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15331,68 +15576,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - 
; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15552,15 +15746,155 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) 
= G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s64_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -18178,98 +18512,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) 
= G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: 
[[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from 
unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18280,98 +18539,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 
(s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR 
[[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x 
s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, 
addrspace 5) - ; GFX10-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX10-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX10-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX10-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX10-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX10-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX10-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX10-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX10-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX10-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX10-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX10-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX10-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX10-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18616,12 +18800,99 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX11-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX11-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18631,12 +18902,99 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address 
+ 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from 
unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18818,49 +19176,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX9-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18871,49 +19203,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; 
GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; 
GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -19060,12 +19366,50 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR 
[[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -19075,12 +19419,50 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; 
UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], 
[[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab7f..3fc5d0d4b279eb 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 +; FLATSCR-NEXT: s_mov_b32 s0, 
0x5040100 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm @@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: s_clause 0x1 -; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR_GFX10-NEXT: s_endpgm @@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v0, off, off -; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v3, off, off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 0ad53083d0ff3f..12593e3760fd3e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr } ; GCN-LABEL: flat_scratch_unaligned_load: -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} +; GFX9: flat_load_dword +; GFX10PLUS: flat_load_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr @@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { } ; GCN-LABEL: flat_scratch_unaligned_store: -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} +; GFX9: flat_store_dword +; GFX10PLUS: flat_store_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe5e..9d43efbdf07b1f 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v9, s7 ; CHECK-NEXT: v_mov_b32_e32 v8, s6 -; CHECK-NEXT: flat_load_ubyte v10, v[8:9] offset:5 -; CHECK-NEXT: flat_load_ubyte v11, v[8:9] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[8:9] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[8:9] offset:3 -; CHECK-NEXT: flat_load_ubyte v14, v[8:9] offset:2 -; CHECK-NEXT: flat_load_ubyte v15, v[8:9] offset:1 -; CHECK-NEXT: flat_load_ubyte v16, v[8:9] -; CHECK-NEXT: flat_load_ubyte v17, v[8:9] offset:4 -; CHECK-NEXT: flat_load_ubyte v18, v[8:9] offset:13 -; CHECK-NEXT: 
flat_load_ubyte v19, v[8:9] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[8:9] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[8:9] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[8:9] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[8:9] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[8:9] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[8:9] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, s7 +; CHECK-NEXT: v_add_co_u32_e32 v12, vcc, s6, v6 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[4:5], 2 -; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v7, vcc ; CHECK-NEXT: s_add_u32 s6, s6, 16 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[8:9], v13 offset:3 -; CHECK-NEXT: flat_store_byte v[8:9], v14 offset:2 -; CHECK-NEXT: flat_store_byte v[8:9], v15 offset:1 -; CHECK-NEXT: flat_store_byte v[8:9], v16 -; CHECK-NEXT: flat_store_byte v[8:9], v12 offset:7 -; CHECK-NEXT: flat_store_byte v[8:9], v11 offset:6 -; CHECK-NEXT: flat_store_byte v[8:9], v10 offset:5 -; CHECK-NEXT: flat_store_byte v[8:9], v17 offset:4 -; CHECK-NEXT: flat_store_byte v[8:9], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[8:9], v22 offset:10 -; CHECK-NEXT: flat_store_byte v[8:9], v23 offset:9 -; CHECK-NEXT: flat_store_byte v[8:9], v24 offset:8 -; CHECK-NEXT: flat_store_byte v[8:9], v20 offset:15 -; CHECK-NEXT: flat_store_byte v[8:9], v19 offset:14 -; CHECK-NEXT: flat_store_byte v[8:9], v18 offset:13 -; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; 
CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 ; CHECK-NEXT: v_mov_b32_e32 v11, s11 -; CHECK-NEXT: flat_load_ubyte v12, v[10:11] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[10:11] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[10:11] offset:7 -; CHECK-NEXT: flat_load_ubyte v15, v[10:11] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[10:11] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[10:11] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[10:11] -; CHECK-NEXT: flat_load_ubyte v19, v[10:11] offset:4 -; CHECK-NEXT: flat_load_ubyte v20, v[10:11] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[10:11] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[10:11] offset:15 -; CHECK-NEXT: flat_load_ubyte v23, v[10:11] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[10:11] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[10:11] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[10:11] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[10:11] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; CHECK-NEXT: v_mov_b32_e32 v15, s11 ; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s10, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_add_co_u32_e32 v14, vcc, s10, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 -; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 -; CHECK-NEXT: flat_store_byte v[10:11], v17 offset:1 -; CHECK-NEXT: flat_store_byte v[10:11], v18 -; CHECK-NEXT: flat_store_byte v[10:11], v14 offset:7 -; CHECK-NEXT: flat_store_byte v[10:11], v13 offset:6 -; CHECK-NEXT: flat_store_byte v[10:11], v12 offset:5 -; CHECK-NEXT: flat_store_byte v[10:11], v19 offset:4 
-; CHECK-NEXT: flat_store_byte v[10:11], v23 offset:11 -; CHECK-NEXT: flat_store_byte v[10:11], v24 offset:10 -; CHECK-NEXT: flat_store_byte v[10:11], v25 offset:9 -; CHECK-NEXT: flat_store_byte v[10:11], v26 offset:8 -; CHECK-NEXT: flat_store_byte v[10:11], v22 offset:15 -; CHECK-NEXT: flat_store_byte v[10:11], v21 offset:14 -; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 -; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_14 ; CHECK-NEXT: .LBB0_15: ; %Flow20 @@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB1_9 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll 
b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c385..8c28fac0d839c2 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: 
flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: 
flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 
offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; 
CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; 
CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) 
-; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, 
s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: 
buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, 
s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: 
global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: 
buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte 
v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; 
CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) 
%0, i64 128, i1 false) @@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, 
v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, 
s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 
offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 
offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; 
CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: 
buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: 
ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 
v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], 
v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 
offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: 
ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: 
flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) @@ -1256,108 +295,21 @@ define amdgpu_kernel void 
@memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 
-; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; 
CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: 
flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -1431,375 +383,59 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], 
v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; 
CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, 
s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: 
s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; 
CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; 
CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: 
s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: 
buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte 
v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: 
buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 
offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: 
v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: 
v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen 
offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 
0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; 
CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: 
flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; 
CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen 
offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 
v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, 
v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 
offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 
offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: 
ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte 
v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index 7575782c1b2acd..cadc3dadb0a1e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -13,55 +13,9 @@ define void 
@memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) 
lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; 
CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: 
flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: 
s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: 
flat_load_ubyte v2, v[2:3] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; 
CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] 
offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; 
CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 
v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -458,58 +139,13 @@ define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; 
CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte 
v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; 
CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: 
s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off 
offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, 
v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; 
CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], 
off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 
-; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: 
global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, 
v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; 
CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: 
global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], 
v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; 
CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v17, v2 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; 
CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 
v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v18, v2 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:22 -; 
CHECK-NEXT: ds_read_u8 v13, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr 
; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 
-; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) 
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; 
CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi 
v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b128 v[3:6], v2 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 
-; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: 
global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr 
addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:3 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:5 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:7 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:9 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:11 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 
offset:12 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: 
s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; 
CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) 
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ushort v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short 
v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:10 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:14 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off 
offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short 
v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi 
v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 
-; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, 
v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: 
@@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: 
s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt 
vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; 
CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; 
CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; 
CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; 
CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: 
buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: 
buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen 
offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] 
offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:26 
-; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v18 -; CHECK-NEXT: 
s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v23, 8, v24 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v30 -; CHECK-NEXT: v_lshl_or_b32 v4, v14, 16, v12 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v33, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v2, 16, v18 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] 
offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt 
vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] 
offset:27 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v20, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v21, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v22, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v23, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v3, v21, 16, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v13, 16, v12 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v23, 16, v2 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 
%dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; 
CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] 
offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: 
buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: 
v_lshl_or_b32 v17, v30, 8, v29 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, 
s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: 
buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, 
s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v21, v2, s[0:3], 0 
offen offset:16 -; CHECK-NEXT: buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v15 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v20, 16, v19 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v22, 16, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) 
noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: 
buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] 
entry: @@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: 
flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] 
offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v8 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v22, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v24, 8, v25 -; 
CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v29 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v32, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23 -; CHECK-NEXT: ds_write_b64 v0, v[3:4] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte 
v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) 
lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: 
flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v19, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v20, v[1:2] offset:4 -; 
CHECK-NEXT: flat_load_ushort v21, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v22, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v11, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v18 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v22 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: 
flat_load_dwordx4 v[3:6], v[1:2] +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b128 v0, v[1:4] -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: 
buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 
offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; 
CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; 
CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: 
s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen 
offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v10, v15, 8, v14 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen 
offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; 
CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: 
buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v1, v13, 16, v12 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte 
v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v9, v1, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr 
addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 
-; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:6 
-; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: 
s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; 
CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, 
v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte 
v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, 
s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 
; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: 
buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr 
addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; 
CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, 
ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 
0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 
offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen 
offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 
-; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen 
offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; 
CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) ret void -} - -define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { -; CHECK-LABEL: 
memcpy_p5_p1_sz32_align_1_1: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: 
global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt 
vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +} + +define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 
offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: 
buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen 
offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr 
addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7411,30 +3732,10 @@ define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false) @@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:5 -; 
CHECK-NEXT: ds_read_u8 v8, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen 
offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: 
ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v10, v1 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 
offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte 
v20, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ 
-7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; 
CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 
v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; 
CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 
offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: 
v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; 
CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: 
global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, 
v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 
-; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte 
v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen 
offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, 
v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen 
offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: 
buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; 
CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off 
offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: 
s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: 
global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: 
s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; 
CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false) @@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, 
v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; 
CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword 
v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3 -; 
CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 
-; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen 
offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: 
buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v13, 
v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: 
buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, 
s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword 
v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; 
CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; 
CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: 
buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt 
vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: 
buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false) @@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) 
-; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index cc5256620bfe08..4e5688adcd6bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, 
v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte 
v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: 
flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -178,103 +51,13 @@ define void @memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; 
CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(31) -; 
CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] 
offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: 
s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: 
s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -559,55 +242,9 
@@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off 
offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt 
vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], 
off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; 
CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: 
global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 
-; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; 
CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: 
flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 
offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v2 -; CHECK-NEXT: ds_read_u8 v27, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21 -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v26 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v27, v2 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v33, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:17 -; CHECK-NEXT: 
flat_store_byte v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v27 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) 
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short 
v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 
offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: 
global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte 
v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: 
global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; 
CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; 
CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1856,31 +747,9 @@ define void 
@memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; 
CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: 
global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: 
flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen 
offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17 -; 
CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v12 
offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: 
flat_store_byte v[0:1], v28 offset:5 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v17, v2, 
s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: 
flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen 
offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; 
CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; 
CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] 
offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; 
CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: 
flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v12, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(19) 
lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v16, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v15, v18, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v14, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v18, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v17, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v20, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v19, v28, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v4, v18, 16, v17 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v21, v30, 8, v31 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v22, v32, 8, v33 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v12, v12, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v20, 16, v19 -; CHECK-NEXT: v_lshl_or_b32 v2, v22, 16, v21 -; CHECK-NEXT: global_store_byte v[0:1], v13, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v12, off offset:28 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: 
global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, 
v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; 
CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -2911,23 +1253,8 @@ define void @memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v10, 
v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v7, v12, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v2, v18, 16, v19 -; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v20, off offset:30 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 
v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt 
vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3 -; 
CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v4, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v8, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 8, v12 
-; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v13, v19, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v14, v24, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v27 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v18, v4, v32 -; CHECK-NEXT: v_lshl_or_b32 v4, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
global_store_byte v[0:1], v33, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 
offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: 
s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 
0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v8, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v7 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: 
global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v19, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 
-; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] 
offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v11, 8, v11 -; CHECK-NEXT: v_lshl_or_b32 
v4, v3, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v2, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v10, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v6, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v16, v25, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v3, v14, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v11, v11, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v18, 16, v17 -; CHECK-NEXT: ds_write_b8 v0, v12 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v13 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v11 offset:28 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; 
CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] 
offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] 
offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] 
offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v1, v12, 16, v13 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v16, 16, v17 -; CHECK-NEXT: ds_write_b16 v0, v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; CHECK-NEXT: ds_write_b8 v0, v18 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; 
CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4553,40 +2429,13 @@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) 
lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, 
s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v8, v1, 
s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v3, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; 
CHECK-NEXT: v_lshl_or_b32 v7, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v4, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v16, v2, 16, v1 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v22, 8, v21 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v12, v23, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v17, v3, v31 -; CHECK-NEXT: v_lshl_or_b32 v3, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, 
v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v32 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v16 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v17 offset:28 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 
offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) 
-; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: 
buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: 
buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v18, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: ds_write_b16 v0, v16 offset:28 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
ds_write_b8 v0, v17 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v18 offset:24 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) 
-; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; 
CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, 
v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, 
s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte 
v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte 
v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: 
buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; 
CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: 
flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; 
CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: 
flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] 
offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, 
s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off 
offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; 
CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt 
vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen 
offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off 
offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte 
v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: 
s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt 
vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; 
CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 
offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort 
v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen 
offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; 
CHECK-LABEL: memmove_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v25, v1 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, 
v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:11 
-; CHECK-NEXT: ds_read_u8 v22, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: 
buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; 
CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: 
ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short 
v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; 
CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword 
v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7564,100 +4363,24 
@@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte 
v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, 
s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, 
s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: 
global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: 
s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, 
s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen 
offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 
offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; 
CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8121,100 +4638,34 @@ define void 
@memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: 
buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
buffer_store_byte v18, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:9 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v6, v0, 
s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen 
offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; 
CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: 
buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry 
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: 
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen 
offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: 
s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: 
buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll index 94bc6d46b2395b..8ad6a4e534d232 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll @@ -19,8 +19,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs ; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_add_f16_e32 v4, v6, v7 ; CHECK-NEXT: v_add_f16_e32 v2, v3, v2 -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2 +; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2 +; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index 4e734d6e0884bc..fc33a274d7b11a 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 { ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte ; 
MUBUF: buffer_load_ubyte -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte define double @private_load_align1_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 1 ret double %x @@ -622,7 +629,14 @@ define double @private_load_align1_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 1 ret void @@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 { ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort define double @private_load_align2_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 2 ret double %x @@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 2 ret void From 65780f4d8e34461e6bd3baf2ff77496f97874b94 Mon Sep 
17 00:00:00 2001 From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com> Date: Fri, 11 Oct 2024 08:23:35 +0100 Subject: [PATCH 134/177] [C++20][Modules] Allow import for a header unit after #pragma (#111662) Summary: `#pragma` and headers that finish with them shouldn't prevent `import "header_unit.h"` syntax. Test Plan: check-clang --- clang/lib/Lex/Preprocessor.cpp | 4 ++++ .../import_header_unit_after_pragma.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 clang/test/Headers/import_header_unit_after_pragma.cpp diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index f0b4593e0cc22e..ecc5166d7b814c 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -902,6 +902,10 @@ void Preprocessor::Lex(Token &Result) { case tok::r_brace: StdCXXImportSeqState.handleCloseBrace(); break; +#define PRAGMA_ANNOTATION(X) case tok::annot_##X: +// For `#pragma ...` mimic ';'. +#include "clang/Basic/TokenKinds.def" +#undef PRAGMA_ANNOTATION // This token is injected to represent the translation of '#include "a.h"' // into "import a.h;". Mimic the notional ';'. 
case tok::annot_module_include: diff --git a/clang/test/Headers/import_header_unit_after_pragma.cpp b/clang/test/Headers/import_header_unit_after_pragma.cpp new file mode 100644 index 00000000000000..b1ad3b07fea29c --- /dev/null +++ b/clang/test/Headers/import_header_unit_after_pragma.cpp @@ -0,0 +1,18 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -verify -std=c++20 -emit-header-unit -xc++-user-header bz0.h +// RUN: %clang_cc1 -verify -std=c++20 -emit-header-unit -xc++-user-header -fmodule-file=bz0.pcm bz.cpp + +//--- compare +#pragma GCC visibility push(default) +#pragma GCC visibility pop + +//--- bz0.h +#include "compare" +// expected-no-diagnostics + +//--- bz.cpp +#include "compare" + +import "bz0.h"; // expected-warning {{the implementation of header units is in an experimental phase}} From ff04bb8f4064274aedcb6e916079132ab6042a10 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 09:31:49 +0200 Subject: [PATCH 135/177] [clang][bytecode] Use PredefinedExpr as base for its variable (#111956) This fixes the error message generated. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 5 +++++ clang/lib/AST/ByteCode/Program.cpp | 15 +++++++++------ clang/lib/AST/ByteCode/Program.h | 3 ++- clang/test/AST/ByteCode/cxx1z.cpp | 4 ++++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 0a3b38b0dc6e57..b2663714340b93 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2869,6 +2869,11 @@ bool Compiler::VisitPredefinedExpr(const PredefinedExpr *E) { if (DiscardResult) return true; + if (!Initializing) { + unsigned StringIndex = P.createGlobalString(E->getFunctionName(), E); + return this->emitGetPtrGlobal(StringIndex, E); + } + return this->delegate(E->getFunctionName()); } diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index 23245a66b578ae..cd2665f755d7cb 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -33,7 +33,7 @@ const void *Program::getNativePointer(unsigned Idx) { return NativePointers[Idx]; } -unsigned Program::createGlobalString(const StringLiteral *S) { +unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { const size_t CharWidth = S->getCharByteWidth(); const size_t BitWidth = CharWidth * Ctx.getCharBit(); @@ -52,12 +52,15 @@ unsigned Program::createGlobalString(const StringLiteral *S) { llvm_unreachable("unsupported character width"); } + if (!Base) + Base = S; + // Create a descriptor for the string. - Descriptor *Desc = - allocateDescriptor(S, CharType, Descriptor::GlobalMD, S->getLength() + 1, - /*isConst=*/true, - /*isTemporary=*/false, - /*isMutable=*/false); + Descriptor *Desc = allocateDescriptor(Base, CharType, Descriptor::GlobalMD, + S->getLength() + 1, + /*isConst=*/true, + /*isTemporary=*/false, + /*isMutable=*/false); // Allocate storage for the string. // The byte length does not include the null terminator. 
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index be84c40714a60b..f676672fb7ced5 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -64,7 +64,8 @@ class Program final { const void *getNativePointer(unsigned Idx); /// Emits a string literal among global data. - unsigned createGlobalString(const StringLiteral *S); + unsigned createGlobalString(const StringLiteral *S, + const Expr *Base = nullptr); /// Returns a pointer to a global. Pointer getPtrGlobal(unsigned Idx) const; diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp index 1a06597fa348fe..57f99235a2b201 100644 --- a/clang/test/AST/ByteCode/cxx1z.cpp +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -13,3 +13,7 @@ namespace Temp { char arr[3]; A d; // both-error {{refers to subobject '&arr[1]'}} + +void Func() { + A a; // both-error {{pointer to subobject of predefined '__func__' variable}} +} From bff2b8c06f362b6b4c761fc1d3951da2bddf17de Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Fri, 11 Oct 2024 15:56:39 +0800 Subject: [PATCH 136/177] [mlir][sparse][test] Adjust tests for `LowerSparseOpsToForeach` (#110976) This PR relocates the tests added in #109435 to a new file named `no_lowering.mlir` and adds some new tests. 
--- mlir/test/Dialect/SparseTensor/codegen.mlir | 16 ------ .../Dialect/SparseTensor/no_lowering.mlir | 54 +++++++++++++++++++ 2 files changed, 54 insertions(+), 16 deletions(-) create mode 100644 mlir/test/Dialect/SparseTensor/no_lowering.mlir diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index df03d871ba3a3e..af78458f109329 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -826,19 +826,3 @@ func.func @sparse_new_coo_permute_no(%arg0: !llvm.ptr) -> tensor return %0 : tensor } - -// CHECK-LABEL: func.func @test_tensor_dim_unranked -// CHECK: tensor.dim -func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { - %c = arith.constant 0 : index - %0 = tensor.dim %arg0, %c : tensor<*xf32> - return %0 : index -} - -// CHECK-LABEL: func.func @test_tensor_reshape_unranked -// CHECK: tensor.reshape -func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { - %dst = tensor.reshape %src(%shape) - : (tensor<*xf32>, tensor<1xi32>) -> tensor - return %dst : tensor -} diff --git a/mlir/test/Dialect/SparseTensor/no_lowering.mlir b/mlir/test/Dialect/SparseTensor/no_lowering.mlir new file mode 100644 index 00000000000000..4f21055a13d58a --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/no_lowering.mlir @@ -0,0 +1,54 @@ +// RUN: mlir-opt %s --lower-sparse-ops-to-foreach --split-input-file | FileCheck %s + +// Ensure that we exit gracefully rather than crashing. 
+ +// CHECK-LABEL: func.func @test_tensor_dim_unranked +// CHECK: tensor.dim +func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor<*xf32> + return %0 : index +} + +// ----- + +#SparseVector = #sparse_tensor.encoding<{ + map = (d0) -> (d0 : compressed) +}> + +// CHECK-LABEL: func.func @test_no_constant_dim +// CHECK: tensor.dim +func.func @test_no_constant_dim(%arg0: tensor, %arg1: index) -> index { + %0 = tensor.dim %arg0, %arg1 : tensor + return %0 : index +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_dim_no_encoding +// CHECK: tensor.dim +func.func @test_tensor_dim_no_encoding(%arg0: tensor) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor + return %0 : index +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_reshape_unranked +// CHECK: tensor.reshape +func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor<*xf32>, tensor<1xi32>) -> tensor + return %dst : tensor +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_reshape_no_encoding +// CHECK: tensor.reshape +func.func @test_tensor_reshape_no_encoding(%src: tensor, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor, tensor<1xi32>) -> tensor + return %dst : tensor +} From 8bb12ca28f7f195aa483fdb5921681ec373564ab Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 11 Oct 2024 11:17:09 +0300 Subject: [PATCH 137/177] [clang][NFC] Update `cxx_dr_status.html` --- clang/www/cxx_dr_status.html | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 1a67b6103cf43e..6f3cc8247d2e2d 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -17128,11 +17128,11 @@

C++ defect report implementation status

Undesired outcomes with const_cast Not resolved - + 2880 - open + accepted Accessibility check for destructor of incomplete class type - Not resolved + Unknown 2881 @@ -17260,7 +17260,7 @@

C++ defect report implementation status

2901 - review + tentatively ready Unclear semantics for near-match aliased access Not resolved @@ -17408,31 +17408,31 @@

C++ defect report implementation status

2923 - open + tentatively ready Note about infinite loops and execution steps Not resolved 2924 - open + review Undefined behavior during constant evaluation Not resolved - + 2925 - open + NAD Deleting a pointer to an incomplete enumeration type - Not resolved + Unknown 2926 - open + tentatively ready Lookup context for dependent qualified names Not resolved 2927 - open + review Unclear status of translation unit with module keyword Not resolved From bb4696ce3051be820de91c8c98b2649af1680236 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Fri, 11 Oct 2024 09:39:19 +0100 Subject: [PATCH 138/177] [mlir][linalg] Fix for bias handling for Winograd (#110331) PR makes winograd.output_transform op a destination style op and fixes handing of a pre-existing data in its output argument (i.e. possibly pre-initialized with bias, which was discarded before). --------- Signed-off-by: Dmitriy Smirnov --- .../mlir/Dialect/Linalg/IR/LinalgOps.td | 3 +- .../Linalg/Transforms/WinogradConv2D.cpp | 114 +++++++++--------- .../transform-tile-and-winograd-rewrite.mlir | 51 ++++---- .../Linalg/transform-tile-winograd.mlir | 26 ++-- .../Linalg/winograd-conv2d-rewrite.mlir | 17 +-- 5 files changed, 106 insertions(+), 105 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 5b6a90f806bedd..e42fd5d2ce13c1 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -313,7 +313,7 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform", } def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform", - [AllElementTypesMatch<["value", "output"]>, + [AllElementTypesMatch<["value", "output"]>, DestinationStyleOpInterface, DeclareOpInterfaceMethods scf::ValueVector { + auto context = builder.getContext(); Value tileHIter = ivs[0]; Value tileWIter = ivs[1]; Value NIter = ivs[2]; @@ -740,29 +741,41 @@ Value 
outputTransform(RewriterBase &rewriter, Location loc, Value value, FIter, 2, 3, /*loopNorFIdx=*/4, /*loopCorFIdx=*/5, /*heightIdx=*/0, /*widthIdx=*/1); - TransformMapKeyTy key = {m, r}; - int64_t retRows = 1; - int64_t retCols = 1; - int64_t leftScalarFactor = 1; - int64_t rightScalarFactor = 1; + const TransformMapKeyTy key = {m, r}; + const TransformMatrix &AMatrix = AMatrices.at(key); + const TransformMatrix &ATMatrix = ATMatrices.at(key); + int64_t scalarFactor = (rightTransform ? AMatrix.scalarFactor : 1) * + (leftTransform ? ATMatrix.scalarFactor : 1); + int64_t retCols = rightTransform ? AMatrix.cols : 1; + int64_t retRows = leftTransform ? ATMatrix.rows : 1; + Value matmulRetValue = extractValue; Value zero = builder.create( loc, rewriter.getZeroAttr(elementType)); - if (leftTransform) { - // Get constant transform matrix AT. - auto it = ATMatrices.find(key); - if (it == ATMatrices.end()) - return {}; - const TransformMatrix &ATMatrix = it->second; - leftScalarFactor = ATMatrix.scalarFactor; - retRows = ATMatrix.rows; + auto affineMap = + AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); + Value heightOffset = + builder.create(loc, affineMap, tileHIter); + Value widthOffset = + builder.create(loc, affineMap, tileWIter); + + Value outInitVal = + extract2DDataFrom4D(builder, loc, args[0], NIter, FIter, heightOffset, + widthOffset, retRows, retCols, + /*loopNorFIdx=*/0, + /*loopCorFIdx=*/3, /*heightIdx=*/1, + /*widthIdx=*/2); + if (leftTransform) { auto matmulType = RankedTensorType::get({retRows, valueW}, elementType); - auto empty = - builder - .create(loc, matmulType.getShape(), elementType) - .getResult(); - auto init = builder.create(loc, zero, empty).getResult(0); + Value init = outInitVal; + if (rightTransform || scalarFactor != 1) { + auto empty = builder + .create(loc, matmulType.getShape(), + elementType) + .getResult(); + init = builder.create(loc, zero, empty).getResult(0); + } Value AT = create2DTransformMatrix(builder, loc, 
ATMatrix, elementType); // Multiply AT x m. @@ -772,21 +785,16 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, } if (rightTransform) { - // Get constant transform matrix T. - auto it = AMatrices.find(key); - if (it == AMatrices.end()) - return {}; - const TransformMatrix &AMatrix = it->second; - - rightScalarFactor = AMatrix.scalarFactor; auto matmulType = RankedTensorType::get({retRows, AMatrix.cols}, elementType); - retCols = AMatrix.cols; - auto empty = - builder - .create(loc, matmulType.getShape(), elementType) - .getResult(); - auto init = builder.create(loc, zero, empty).getResult(0); + Value init = outInitVal; + if (scalarFactor != 1) { + auto empty = builder + .create(loc, matmulType.getShape(), + elementType) + .getResult(); + init = builder.create(loc, zero, empty).getResult(0); + } Value A = create2DTransformMatrix(builder, loc, AMatrix, elementType); // Multiply y = (AT x m) x A. @@ -795,48 +803,36 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, matmulRetValue = matmulOp.getResult(0); } - if (leftScalarFactor * rightScalarFactor != 1) { - // Multiply scalar factor. - Value scalarFactor = builder.create( - loc, - FloatAttr::get(elementType, leftScalarFactor * rightScalarFactor)); + if (scalarFactor != 1) { + // Multiply by scalar factor and add outInitVal. 
+ Value scalarFactorValue = builder.create( + loc, FloatAttr::get(elementType, scalarFactor)); auto matmulType = RankedTensorType::get({retRows, retCols}, elementType); - auto init = builder.create(loc, matmulType.getShape(), - elementType); - auto identityAffineMap = rewriter.getMultiDimIdentityMap(2); SmallVector affineMaps = { - AffineMap::get(2, 0, init.getContext()), identityAffineMap}; - auto broadcastedScalar = + AffineMap::get(2, 0, context), identityAffineMap, identityAffineMap}; + + matmulRetValue = rewriter .create( - loc, matmulType, ValueRange{scalarFactor}, ValueRange{init}, - affineMaps, + loc, matmulType, + ValueRange{scalarFactorValue, matmulRetValue}, + ValueRange{outInitVal}, affineMaps, llvm::ArrayRef{ utils::IteratorType::parallel, utils::IteratorType::parallel}, [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - nestedBuilder.create(nestedLoc, args[0]); + auto mulf = nestedBuilder.create( + nestedLoc, args[0], args[1]); + auto addf = nestedBuilder.create( + nestedLoc, mulf.getResult(), args[2]); + nestedBuilder.create(nestedLoc, + addf.getResult()); }) .getResult(0); - - matmulRetValue = builder - .create( - loc, matmulType, - ValueRange{broadcastedScalar, matmulRetValue}, - ValueRange{init}) - .getResult(0); } - auto context = builder.getContext(); - auto affineMap = - AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); - Value heightOffset = - builder.create(loc, affineMap, tileHIter); - Value widthOffset = - builder.create(loc, affineMap, tileWIter); - // Insert (H, W) to (N, H, W, F). 
Value combinedVal = insert2DDataTo4D(builder, loc, matmulRetValue, args[0], NIter, FIter, diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index c5760acf94a88a..776dc5b748c846 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -85,31 +85,32 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S7:.*]] = tensor.empty() // CHECK: %[[S6:.*]] = linalg.batch_matmul // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x8x8x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] // CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) // CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG6]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to 
%[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S25:.*]] = tensor.extract_slice %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: %[[S16:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S18:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_8]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S19:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK: %[[S20:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S19]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: %[[S21:.*]] = linalg.matmul ins(%[[S18]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S21]] : f32, tensor<4x4xf32>) outs(%[[S25]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_90:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_91:.*]] = arith.addf %[[VAL_90]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_91]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = 
tensor.insert_slice %[[S24]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S23]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S15]] // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) @@ -218,32 +219,33 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S7:.*]] = tensor.empty() // CHECK: %[[S6:.*]] = linalg.batch_matmul // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] // CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) +// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[PADDED_8]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) // CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] // CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG7]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: 
%[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S26:.*]] = tensor.extract_slice %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: %[[S17:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S19:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_11]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK: %[[S21:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: %[[S22:.*]] = linalg.matmul ins(%[[S19]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S21]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[S23:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S23]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S22]] : f32, tensor<4x4xf32>) outs(%[[S26]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_104:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_105:.*]] = arith.addf %[[VAL_104]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_105]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S25:.*]] = linalg.mul ins(%[[S24]], %[[S22]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S23]] : tensor<4x4xf32>) -> 
tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S25]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S24]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] // CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -330,16 +332,17 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S15:.*]] = tensor.extract_slice %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: %[[S9:.*]] = tensor.empty() : tensor<4x1xf32> // CHECK: %[[S10:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S9]] : tensor<4x1xf32>) -> tensor<4x1xf32> // CHECK: %[[S11:.*]] = linalg.matmul ins(%[[CST_0]], %[[EXTRACTED_SLICE]] : tensor<4x6xf32>, tensor<6x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[S12:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S12]] : tensor<4x1xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S11]] : f32, tensor<4x1xf32>) outs(%[[S15]] : tensor<4x1xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_57:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: 
%[[VAL_58:.*]] = arith.addf %[[VAL_57]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_58]] : f32 // CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S14:.*]] = linalg.mul ins(%[[S13]], %[[S11]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S12]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: return %[[S6]] diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir index 21522a2083b463..9598c434aadb8f 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -279,14 +279,14 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[ARG1]]) -> (tensor<2x8x8x2xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] iter_args(%[[ARG6:.*]] = %[[ARG5]]) -> (tensor<2x8x8x2xf32>) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> // CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S5:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[S6:.*]] = 
affine.apply #[[$MAP1]]() -// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> +// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG6]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> // ----- @@ -321,10 +321,10 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] iter_args(%[[ARG9:.*]] = %[[ARG1]]) -> (tensor<3x8x8x5xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] iter_args(%[[ARG10:.*]] = %[[ARG9]]) -> (tensor<3x8x8x5xf32>) +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] iter_args(%[[ARG11:.*]] = %[[ARG10]]) +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] iter_args(%[[ARG12:.*]] = %[[ARG11]]) // CHECK: %[[C3_8:.*]] = arith.constant 3 : index // CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]]) // CHECK: %[[C5_9:.*]] = arith.constant 5 : index @@ -334,7 +334,7 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[ARG4]]) // CHECK: %[[S9:.*]] = affine.apply #[[$MAP3]]() // CHECK: %[[S10:.*]] = affine.apply #[[$MAP3]]() -// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], 
%[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor +// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor // ----- @@ -367,14 +367,14 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[ARG1]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] iter_args(%[[ARG10:.*]] = %[[ARG9]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG11:.*]] = %[[ARG10]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG12:.*]] = %[[ARG11]]) -> (tensor<3x8x1x5xf32>) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> // CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : 
tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> // CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir index 4369f5f1eab4ca..16d06a74732729 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -100,21 +100,22 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6xf32> +// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[S22:.*]] = tensor.extract_slice %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<4x4xf32> // CHECK-NEXT: %[[S11:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK-NEXT: %[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S14:.*]] = 
tensor.empty() : tensor<4x4xf32> // CHECK-NEXT: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK-NEXT: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S15]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S17:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S17]] : tensor<4x4xf32>) { -// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK-NEXT: linalg.yield %[[IN]] : f32 +// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S16]] : f32, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) { +// CHECK-NEXT: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK-NEXT: %[[VAL_98:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK-NEXT: %[[VAL_99:.*]] = arith.addf %[[VAL_98]], %[[OUT]] : f32 +// CHECK-NEXT: linalg.yield %[[VAL_99]] : f32 // CHECK-NEXT: } -> tensor<4x4xf32> -// CHECK-NEXT: %[[S19:.*]] = linalg.mul ins(%[[S18]], %[[S16]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S17]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S18]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : 
tensor<2x12x12x2xf32> From ebeb56af5f8f1ff9da8f5a7e98348f460d223de1 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 11 Oct 2024 10:40:28 +0200 Subject: [PATCH 139/177] [lldb] Only send "posix" error codes through the gdb-remote protocol (#108170) The other side has no way of telling which namespace do these codes belong to, so mashing them all together is not very helpful. I'm mainly doing this to simplify some code in a pending patch , and I've picked the posix error category semi-randomly. If we wanted to be serious about assigning meaning to these error codes, we should create a special error category for "gdb errors". From b222f319306a9cad9ac11183b7036ff45097c26f Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Fri, 11 Oct 2024 12:56:42 +0400 Subject: [PATCH 140/177] [lldb][test] Fixed the test `no_unique_address-with-bitfields` (#111902) Fixed the error `unable to create target: 'No available targets are compatible with triple "x86_64-apple-macosx10.4.0"'` running `clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s`. 
--- .../DWARF/{ => x86}/no_unique_address-with-bitfields.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-with-bitfields.cpp (100%) diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp similarity index 100% rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp From 72f339de45bb590f25571c4c447a725e6f1dd8d7 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 11 Oct 2024 10:10:15 +0100 Subject: [PATCH 141/177] [LoopVectorize] Use predicated version of getSmallConstantMaxTripCount (#109928) There are a number of places where we call getSmallConstantMaxTripCount without passing a vector of predicates: getSmallBestKnownTC isIndvarOverflowCheckKnownFalse computeMaxVF isMoreProfitable I've changed all of these to now pass in a predicate vector so that we get the benefit of making better vectorisation choices when we know the max trip count for loops that require SCEV predicate checks. I've tried to add tests that cover all the cases affected by these changes. 
--- llvm/include/llvm/Analysis/ScalarEvolution.h | 7 + llvm/lib/Analysis/ScalarEvolution.cpp | 10 + .../Transforms/Vectorize/LoopVectorize.cpp | 48 ++- .../AArch64/low_trip_count_predicates.ll | 397 ++++++++++++++++++ .../RISCV/riscv-vector-reverse.ll | 2 + 5 files changed, 442 insertions(+), 22 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 179a2c38d9d3c2..328926f0b7aa65 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2376,6 +2376,10 @@ class PredicatedScalarEvolution { /// Get the (predicated) symbolic max backedge count for the analyzed loop. const SCEV *getSymbolicMaxBackedgeTakenCount(); + /// Returns the upper bound of the loop trip count as a normal unsigned + /// value, or 0 if the trip count is unknown. + unsigned getSmallConstantMaxTripCount(); + /// Adds a new predicate. void addPredicate(const SCEVPredicate &Pred); @@ -2447,6 +2451,9 @@ class PredicatedScalarEvolution { /// The symbolic backedge taken count. const SCEV *SymbolicMaxBackedgeCount = nullptr; + + /// The constant max trip count for the loop. 
+ std::optional SmallConstantMaxTripCount; }; template <> struct DenseMapInfo { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 3d890f05c8ca21..cea3a5bc865fee 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15050,6 +15050,16 @@ const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() { return SymbolicMaxBackedgeCount; } +unsigned PredicatedScalarEvolution::getSmallConstantMaxTripCount() { + if (!SmallConstantMaxTripCount) { + SmallVector Preds; + SmallConstantMaxTripCount = SE.getSmallConstantMaxTripCount(&L, &Preds); + for (const auto *P : Preds) + addPredicate(*P); + } + return *SmallConstantMaxTripCount; +} + void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2bee2c67a2353..05dc58a42249ca 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,10 +411,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. /// 4) Returns std::nullopt if all of the above failed. static std::optional -getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, +getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax = true) { // Check if exact trip count is known. - if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L)) return ExpectedTC; // Check if there is an expected trip count available from profile data. @@ -426,7 +426,7 @@ getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, return std::nullopt; // Check if upper bound estimate is known. 
- if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount()) return ExpectedTC; return std::nullopt; @@ -1789,12 +1789,15 @@ class GeneratedRTChecks { Loop *OuterLoop = nullptr; + PredicatedScalarEvolution &PSE; + public: - GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL, - bool AddBranchWeights) - : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} + GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, + LoopInfo *LI, TargetTransformInfo *TTI, + const DataLayout &DL, bool AddBranchWeights) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), + MemCheckExp(*PSE.getSE(), DL, "scev.check"), + AddBranchWeights(AddBranchWeights), PSE(PSE) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -1941,7 +1944,7 @@ class GeneratedRTChecks { // Get the best known TC estimate. if (auto EstimatedTC = getSmallBestKnownTC( - *SE, OuterLoop, /* CanUseConstantMax = */ false)) + PSE, OuterLoop, /* CanUseConstantMax = */ false)) BestTripCount = *EstimatedTC; BestTripCount = std::max(BestTripCount, 1U); @@ -2272,8 +2275,7 @@ static bool isIndvarOverflowCheckKnownFalse( // We know the runtime overflow check is known false iff the (max) trip-count // is known and (max) trip-count + (VF * UF) does not overflow in the type of // the vector loop induction variable. 
- if (unsigned TC = - Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { + if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { uint64_t MaxVF = VF.getKnownMinValue(); if (VF.isScalable()) { std::optional MaxVScale = @@ -3962,8 +3964,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + if (TC != MaxTC) + LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", @@ -4257,7 +4261,7 @@ bool LoopVectorizationPlanner::isMoreProfitable( InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); // Improve estimate for the vector width if it is scalable. 
unsigned EstimatedWidthA = A.Width.getKnownMinValue(); @@ -4852,7 +4856,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!Legal->isSafeForAnyVectorWidth()) return 1; - auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); // If we did not calculate the cost for VF (because the user selected the VF) @@ -9618,8 +9622,8 @@ static bool processLoopInVPlanNativePath( { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getDataLayout(), AddBranchWeights); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), + AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9683,7 +9687,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional VScale, Loop *L, - ScalarEvolution &SE, + PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) @@ -9768,7 +9772,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // Skip vectorization if the expected trip count is less than the minimum // required trip count. - if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), VF.MinProfitableTripCount)) { LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " @@ -9875,7 +9879,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. 
- auto ExpectedTC = getSmallBestKnownTC(*SE, L); + auto ExpectedTC = getSmallBestKnownTC(PSE, L); if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " @@ -9973,8 +9977,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getDataLayout(), AddBranchWeights); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), + AddBranchWeights); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, VF.Cost); @@ -9990,7 +9994,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - *PSE.getSE(), SEL)) { + PSE, SEL)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll new file mode 100644 index 00000000000000..1ec384b05779a8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -0,0 +1,397 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: asserts +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG + +target triple = "aarch64-unknown-linux-gnu" + +; DEBUG-LABEL: LV: Checking a loop in 'low_vf_ic_is_better' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 19 +; DEBUG: LV: IC is 1 +; DEBUG: LV: VF is vscale x 8 +; DEBUG: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue 
Loop VF:vscale x 4, Epilogue Loop UF:1 + +; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small' +; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred. +; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value.. + +; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 16 +; DEBUG: LV: Clamping the MaxVF to maximum power of two not exceeding the constant trip count: 16 +; DEBUG: LV: IC is 1 +; DEBUG: LV: VF is 16 +; DEBUG: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 32) +; DEBUG: LV: Too many memory checks needed. + +; DEBUG-LABEL: LV: Checking a loop in 'overflow_indvar_known_false' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 1027 +; DEBUG: LV: can fold tail by masking. +; DEBUG: Executing best plan with VF=vscale x 16, UF=1 + +define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef %val) { +; CHECK-LABEL: define void @low_vf_ic_is_better( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 19 +; CHECK-NEXT: br i1 [[CMP7]], label %[[ITER_CHECK:.*]], label %[[WHILE_END:.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 19, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[TMP8]], 4294967295 +; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr 
inbounds i8, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP23]], ptr [[TMP22]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP33]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], [[TMP35]] +; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC4]] +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: 
[[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = add i64 [[TMP0]], [[INDEX6]] +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX7]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = add [[WIDE_LOAD7]], [[BROADCAST_SPLAT9]] +; CHECK-NEXT: store [[TMP41]], ptr [[TMP40]], align 1 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP37]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[CMP_N12]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 19 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 19 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 19 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @trip_count_too_small(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) { +; CHECK-LABEL: define void @trip_count_too_small( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 3 +; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], 
i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 3 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 3 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @too_many_runtime_checks(ptr nocapture noundef %p, ptr nocapture noundef %p1, ptr nocapture noundef readonly %p2, ptr nocapture noundef readonly %p3, i32 noundef %tc, i16 noundef %val) { +; CHECK-LABEL: define void @too_many_runtime_checks( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], ptr nocapture noundef [[P1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], ptr nocapture noundef readonly [[P3:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[TC]], 16 +; CHECK-NEXT: br i1 [[CMP20]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV8:%.*]] = trunc i16 [[VAL]] to i8 +; 
CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP1]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P3]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP61]], [[TMP60]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP62]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[ADD12:%.*]] = add i8 [[TMP63]], [[CONV8]] +; CHECK-NEXT: store i8 [[ADD12]], ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP64:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP64]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp20 = icmp ult i32 %tc, 16 + br i1 %cmp20, label %while.preheader, label %while.end + +while.preheader: + %0 = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %1 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %1, 
%while.preheader ], [ %iv.next, %while.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %p2, i64 %iv + %2 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p3, i64 %iv + %3 = load i8, ptr %arrayidx2, align 1 + %mul = mul i8 %3, %2 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %p1, i64 %iv + %4 = load i8, ptr %arrayidx5, align 1 + %add = add i8 %mul, %4 + store i8 %add, ptr %arrayidx5, align 1 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %5 = load i8, ptr %arrayidx10, align 1 + %add12 = add i8 %5, %0 + store i8 %add12, ptr %arrayidx10, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %6 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %6, 16 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) vscale_range(1,16) { +; CHECK-LABEL: define void @overflow_indvar_known_false( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 1027 +; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1028, [[TMP20]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 1027, [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; 
CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ugt i64 [[TMP23]], 4294967295 +; CHECK-NEXT: [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; 
CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP15]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP18]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP29]], 1027 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 1027 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr 
inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 1027 + br i1 %exitcond.not, label %while.end, label %while.body, !llvm.loop !0 + +while.end: + ret void +} + + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 1d5e6c117a2eac..9a716f7756072e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -20,6 +20,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Found trip count: 0 +; CHECK-NEXT: LV: Found maximum trip count: 4294967295 ; CHECK-NEXT: LV: Scalable vectorization is available ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
@@ -224,6 +225,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Found trip count: 0 +; CHECK-NEXT: LV: Found maximum trip count: 4294967295 ; CHECK-NEXT: LV: Scalable vectorization is available ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. From 1276ce9e9713b2a0802004676fad7e40980396d5 Mon Sep 17 00:00:00 2001 From: Emilio Cota Date: Fri, 11 Oct 2024 05:08:23 -0400 Subject: [PATCH 142/177] Revert "[mlir][linalg] Introduce transpose semantic to 'linalg.matmul' ops. (#104783)" This reverts commit 03483737a7a2d72a257a5ab6ff01748ad9cf0f75 and 99c8557, which is a fix-up on top of the former. I'm reverting because this commit broke two tests: mlir/test/python/integration/dialects/linalg/opsrun.py mlir/test/python/integration/dialects/transform.py See https://lab.llvm.org/buildbot/#/builders/138/builds/4872 I'm not familiar with the tests, so I'm leaving it to the original author to either remove or adapt the broken tests, as discussed here: https://github.com/llvm/llvm-project/pull/104783#issuecomment-2406390905 --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 10 - .../Linalg/IR/LinalgNamedStructuredOps.yaml | 72 +++++ .../Dialect/Linalg/IR/LinalgStructuredOps.td | 134 --------- .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 17 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 263 +----------------- .../Linalg/Transforms/TransposeMatmul.cpp | 7 - .../Linalg/Transforms/Vectorization.cpp | 5 - .../NVGPU/TransformOps/NVGPUTransformOps.cpp | 6 - .../linalg/opdsl/ops/core_named_ops.py | 17 ++ .../Dialect/Linalg/generalize-named-ops.mlir | 111 -------- mlir/test/Dialect/Linalg/invalid.mlir | 159 ----------- mlir/test/Dialect/Linalg/named-ops.mlir | 243 ---------------- mlir/test/python/dialects/linalg/ops.py | 75 +++++ 
.../mlir-linalg-ods-yaml-gen.cpp | 6 +- 14 files changed, 182 insertions(+), 943 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index e80dbb2afb9ef7..fbf3f19cde0e9b 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -684,16 +684,6 @@ def LinalgStructuredInterface return; }] >, - InterfaceMethod< - /*desc=*/[{ - Return true if the user has supplied an explicit indexing maps for this op. - }], - /*retTy=*/"bool", - /*methodName=*/"hasUserDefinedMaps", - /*args=*/(ins), - /*methodBody=*/"", - /*defaultImplementation=*/[{ return false; }] - >, //===------------------------------------------------------------------===// // Linalg generalization hooks. //===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 97b90333e2b200..8cb698096ef5b7 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -1065,6 +1065,78 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: rhs --- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: matmul + cpp_class_name: MatmulOp + doc: |- + Performs a matrix multiplication of two 2D inputs. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. 
+ implements: + - LinalgContractionOpInterface +structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: A + kind: input_tensor + type_var: T1 + shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)> + - !LinalgOperandDefConfig + name: B + kind: input_tensor + type_var: T2 + shape_map: affine_map<()[s0, s1, s2] -> (s1, s2)> + - !LinalgOperandDefConfig + name: C + kind: output_tensor + type_var: U + shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)> + - !LinalgOperandDefConfig + name: cast + kind: type_fn_attr + default_fn: cast_signed + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)> + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)> + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)> + iterator_types: + - parallel + - parallel + - reduction + assignments: + - !ScalarAssign + arg: C + value: !ScalarExpression + scalar_fn: + kind: binary + fn_name: add + operands: + - !ScalarExpression + scalar_arg: C + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: mul + operands: + - !ScalarExpression + scalar_fn: + kind: type + attr_name: cast + type_var: U + operands: + - !ScalarExpression + scalar_arg: A + - !ScalarExpression + scalar_fn: + kind: type + attr_name: cast + type_var: U + operands: + - !ScalarExpression + scalar_arg: B +--- !LinalgOpConfig metadata: !LinalgOpMetadata name: quantized_matmul cpp_class_name: QuantizedMatmulOp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 61d4fc9734c6de..31f29139247267 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -535,140 +535,6 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [ let hasCanonicalizer = 1; } -//===----------------------------------------------------------------------===// -// Op definition for MatmulOp 
-//===----------------------------------------------------------------------===// - -def MatmulOp : LinalgStructuredBase_Op<"matmul", [ - AttrSizedOperandSegments, - LinalgContractionOpInterface]> { - - let summary = [{ - Performs a matrix multiplication of two 2D inputs without broadcast or transpose. - }]; - let description = [{ - Numeric casting is performed on the operands to the inner multiply, - promoting them to the same data type as the accumulator/output. - - Broadcast and Transpose semantics can be appiled by specifying the explicit attribute - 'indexing_maps' as shown below.This is a list attribute, so the list must include all - the maps if specified. - - Example Transpose: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>,memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - ``` - - Example Broadcast: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, // broadcast - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - ``` - - Example Broadcast and transpose: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose - affine_map<(d0, d1, d2) -> (d2)>, // broadcast - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - }]; - - let arguments = (ins - Variadic:$inputs, - Variadic:$outputs, - DefaultValuedOptionalAttr:$indexing_maps, - DefaultValuedOptionalAttr:$cast - ); - let results = (outs Variadic:$result_tensors); - let regions = (region AnyRegion:$region); - - let skipDefaultBuilders = 1; - let builders = [ - OpBuilder< - (ins "ValueRange":$inputs, "ValueRange":$outputs, - CArg<"ArrayRef", "{}">:$attributes), - [{ - buildStructuredOp($_builder, $_state, 
std::nullopt, inputs, outputs, - attributes, MatmulOp::getRegionBuilder()); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs, - CArg<"ArrayRef", "{}">:$attributes), - [{ - buildStructuredOp($_builder, $_state, resultTensorTypes, - inputs, outputs, attributes, MatmulOp::getRegionBuilder()); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, - CArg<"ArrayRef", "{}">:$attributes), - [{ - $_state.addOperands(operands); - $_state.addAttributes(attributes); - $_state.addTypes(resultTensorTypes); - (void)$_state.addRegion(); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs, - "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), - [{ - $_state.addAttribute("cast", cast); - buildStructuredOp($_builder, $_state, resultTensorTypes, inputs, outputs, - attributes, MatmulOp::getRegionBuilder()); - }]> - - ]; - let hasCustomAssemblyFormat = 1; - let hasFolder = 1; - let hasVerifier = 1; - - let extraClassDeclaration = structuredOpsBaseDecls # [{ - SmallVector getIteratorTypesArray(); - - /// Implements the block region builder. - static void regionBuilder(ImplicitLocOpBuilder &b, - Block &block, ArrayRef attrs); - - /// Returns a list of AffineMap with the typical matmul indexing charactristic. - SmallVector getDefaultIndexingMaps(); - - /// Returns true if the given broadcast map \p bcastMap is valid for this op. - bool isValidLhsRhsBroadcastMap(AffineMap bcastMap); - - static std::function)> - getRegionBuilder() { - return regionBuilder; - } - - ::mlir::MutableOperandRange getDpsInitsMutable() { - return getOutputsMutable(); - } - - // Generic methods. - static unsigned getNumRegionArgs(); - std::string getLibraryCallName(); - bool hasDynamicIndexingMaps(); - /// Check if the op has broadcast and/or transpose semantic. Returns true if the - /// user defined indexing maps are not equal to default map. 
- bool hasUserDefinedMaps(); - }]; -} - //===----------------------------------------------------------------------===// // Named Linalg ops, implemented as a declarative configurations of generic ops. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 3b9194098fa783..40795879c3026d 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -15,20 +15,13 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/IR/TypeUtilities.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/raw_ostream.h" #include -#include using namespace mlir; using namespace mlir::linalg; @@ -1149,6 +1142,7 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) { LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { LinalgOp linalgOp = cast(op); + // Mixed tensor/buffer operands are not allowed. if (!linalgOp.hasPureTensorSemantics() && !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0) @@ -1168,8 +1162,6 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << ") to be equal to the number of input/output operands (" << linalgOp->getNumOperands() << ")"; - // Set this flag if this op has user defined maps. This is required to guard - // the below error condition which assume default indexing maps. 
for (OpOperand &opOperand : linalgOp->getOpOperands()) { AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand); @@ -1186,13 +1178,13 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << " dim(s) to match the number of loops"; int64_t rank = linalgOp.getRank(&opOperand); - if (indexingMap.getNumResults() != rank) return op->emitOpError("expected operand rank (") << rank << ") to match the result rank of indexing_map #" << opOperand.getOperandNumber() << " (" << indexingMap.getNumResults() << ")"; } + SmallVector redDims; linalgOp.getReductionDims(redDims); @@ -1202,8 +1194,9 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { // Check if given shapes match to inferred shapes. SmallVector endLoopRangeValues = linalgOp.getStaticLoopRanges(); SmallVector startLoopRangeValues(endLoopRangeValues.size(), 0); - // Verify only static cases since we can't get exact dimension sizes and - // loop ranges for dynamic cases in this stage. + + // Verify only static cases since we can't get exact dimension sizes and loop + // ranges for dynamic cases in this stage. 
if (llvm::none_of(endLoopRangeValues, ShapedType::isDynamic)) { for (int64_t &range : endLoopRangeValues) range -= 1; diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index c909d13e4314b4..730c478c2883ef 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -27,7 +27,6 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" @@ -38,17 +37,12 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/LogicalResult.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include #include using namespace mlir; @@ -155,36 +149,15 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, // iterator_types is an auto-generated method. } -/// Helper to create a typical indexing map for MatmulOp. Returns a list of -/// AffineMap. -static SmallVector -getDefaultIndexingMapsForMatmul(MLIRContext *context) { - AffineExpr d0, d1, d2; - SmallVector indexingMaps; - bindDims(context, d0, d1, d2); - indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context)); - indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context)); - indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context)); - return indexingMaps; -} - -/// Wrapper to return the typical indexing map array attribute for MatmulOp. 
-static SmallVector getDefaultIndexingMapAttr(MLIRContext *context) { - return llvm::map_to_vector( - getDefaultIndexingMapsForMatmul(context), - [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); -} - /// Creates a structured operation given `inputs`, `outputs`, and `attributes`. /// The result types are derived automatically if `resultTensorTypes` is none. /// The body of the operation is filled using `regionBuilder`. All ods-gen /// created structured operations use the method to implement their builders. -static void buildStructuredOp( - OpBuilder &b, OperationState &state, - std::optional resultTensorTypes, ValueRange inputs, - ValueRange outputs, ArrayRef attributes, - RegionBuilderFn regionBuilder, - std::optional> indexingMaps = std::nullopt) { +static void buildStructuredOp(OpBuilder &b, OperationState &state, + std::optional resultTensorTypes, + ValueRange inputs, ValueRange outputs, + ArrayRef attributes, + RegionBuilderFn regionBuilder) { // Derive the result types if needed. SmallVector derivedResultTypes = resultTensorTypes.value_or(TypeRange()); @@ -195,20 +168,6 @@ static void buildStructuredOp( state.addOperands(inputs); state.addOperands(outputs); state.addTypes(derivedResultTypes); - - // Initialize indexingMaps, for MatmulOp. 
- SmallVector indexingMapsAttrVal; - if (indexingMaps.has_value()) { - for (mlir::AffineMap map : *indexingMaps) { - // Convert each AffineMap to an AffineMapAttr - indexingMapsAttrVal.push_back(AffineMapAttr::get(map)); - } - state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); - } else { - indexingMapsAttrVal = getDefaultIndexingMapAttr(b.getContext()); - state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); - } - state.addAttributes(attributes); state.addAttribute( "operandSegmentSizes", @@ -340,48 +299,11 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser, OperationState &result, unsigned numRegionArgs, RegionBuilderFn regionBuilder) { - - SmallVector indexingMapsAttr; - Attribute mapAttr; - if (succeeded(parser.parseOptionalKeyword("indexing_maps"))) { - if (parser.parseEqual()) - return failure(); - - if (parser.parseLSquare()) - return failure(); - - do { - if (parser.parseAttribute(mapAttr)) - return failure(); - if (!isa(mapAttr)) { - return parser.emitError(parser.getCurrentLocation(), - "expected affine map attribute"); - } - indexingMapsAttr.push_back(mapAttr); - - if (parser.parseOptionalComma()) - break; - } while (true); - - if (parser.parseRSquare()) - return failure(); - } - // Initialize indexingMaps, if not supplied explicitly. - if (indexingMapsAttr.empty()) { - indexingMapsAttr = getDefaultIndexingMapAttr(result.getContext()); - } - result.addAttribute("indexing_maps", - parser.getBuilder().getArrayAttr(indexingMapsAttr)); - // TODO: Enable when ods-gen supports captures. SmallVector inputTypes, outputTypes; if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes)) return failure(); - // Parse optional attributes. - if (parser.parseOptionalAttrDict(result.attributes)) - return failure(); - // TODO: consider merging results parsing into region parsing. // Need to wait for declarative assembly resolution to decide. 
SmallVector outputTensorsTypes; @@ -407,9 +329,13 @@ static void printNamedStructuredOpResults(OpAsmPrinter &p, } static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op, - ValueRange inputs, ValueRange outputs, - ArrayRef elidedAttrs = {}) { - p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); + ValueRange inputs, ValueRange outputs) { + p.printOptionalAttrDict( + op->getAttrs(), + /*elidedAttrs=*/{"operandSegmentSizes", + // See generated code in + // LinalgNamedStructuredOps.yamlgen.cpp.inc + "linalg.memoized_indexing_maps"}); // Printing is shared with generic ops, except for the region and // attributes. @@ -3456,168 +3382,3 @@ Operation *LinalgDialect::materializeConstant(OpBuilder &builder, Location loc) { return arith::ConstantOp::materialize(builder, value, type, loc); } - -/// Returns true if the result AffineExpr of the \p explicitMap is same as \p -/// defaultMap. -static bool isValidResultDimExprs(AffineMap explictMap, AffineMap defaultMap) { - auto explicitRange = explictMap.getResults(); - auto defaultRange = defaultMap.getResults(); - DenseSet explicitSet(explicitRange.begin(), explicitRange.end()); - DenseSet defaultSet(defaultRange.begin(), defaultRange.end()); - llvm::set_union(explicitSet, defaultSet); - return explicitSet == defaultSet; -} - -/// Returns true if the \p explictMap is broadcasted with respect to the -/// \p defaultMap. -static bool isBroadcasted(AffineMap explictMap, AffineMap defaultMap) { - return explictMap.getNumResults() < defaultMap.getNumResults(); -} - -/// Verifies the broadcast and transpose semantic sepecified by the explicit -/// indexing map for the MatmulOp \p op for each operand specified by \p -/// opIndex. 
-static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp, - unsigned opIndex) { - SmallVector opIndexingMaps = matmulOp.getIndexingMapsArray(); - SmallVector defaultIndexingMaps = - matmulOp.getDefaultIndexingMaps(); - - auto opIndexingMap = opIndexingMaps[opIndex]; - auto defaultIndexingMap = defaultIndexingMaps[opIndex]; - // Check general validity of indexing map results. - if (!isValidResultDimExprs(opIndexingMap, defaultIndexingMap)) - return matmulOp->emitOpError() - << "Unexpected dim expression in map result."; - - // Check if the requested broadcast is valid. - if (isBroadcasted(opIndexingMap, defaultIndexingMap)) { - if (!matmulOp.isValidLhsRhsBroadcastMap(opIndexingMap)) { - return matmulOp->emitOpError() - << "Invalid broadcast requested, should be (d2)."; - } - return success(); - } - return success(); -} - -namespace mlir { -namespace linalg { -//===----------------------------------------------------------------------===// -// MatMulOp -//===----------------------------------------------------------------------===// -SmallVector MatmulOp::getIteratorTypesArray() { - return SmallVector{utils::IteratorType::parallel, - utils::IteratorType::parallel, - utils::IteratorType::reduction}; -} - -unsigned MatmulOp::getNumRegionArgs() { return 3; } - -std::string MatmulOp::getLibraryCallName() { - return generateLibraryCallName(getOperation()); -} - -bool MatmulOp::hasDynamicIndexingMaps() { return true; } - -/// Check if the op has broadcast and/or transpose semantic. Returns true if the -/// user defined indexing maps are not equal to default map. -bool MatmulOp::hasUserDefinedMaps() { - SmallVector defaultMaps = getDefaultIndexingMaps(); - SmallVector explicitMaps = getIndexingMapsArray(); - return defaultMaps != explicitMaps; -} - -/// Implements the block region builder for the MatmulOp. This is called by -/// 'fillStructuredOpRegion'. 
-void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, - ArrayRef attrs) { - assert(3 > 0 && block.getNumArguments() == 3 && - "MatmulOp regionBuilder expects 3 (>=0) args"); - RegionBuilderHelper helper(b, block); - SmallVector yields; - - TypeFn castVal = TypeFn::cast_signed; - auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { - return attr.getName() == "cast"; - }); - if (castIter != attrs.end()) { - if (auto attr = llvm::dyn_cast(castIter->getValue())) - castVal = attr.getValue(); - } - - Value value1 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), - block.getArgument(0)); - Value value2 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), - block.getArgument(1)); - Value value3 = helper.buildBinaryFn(BinaryFn::mul, value1, value2); - Value value4 = - helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), value3); - yields.push_back(value4); - helper.yieldOutputs(yields); -} - -/// Returns a list of AffineMap with the typical matmul indexing charactristic. -SmallVector MatmulOp::getDefaultIndexingMaps() { - MLIRContext *context = this->getContext(); - return getDefaultIndexingMapsForMatmul(context); -} - -/// Returns true if the given broadcast map \p bcastMap is valid for this op. -bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) { - assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr."); - AffineExpr exp = bcastMap.getResult(0); - // Invalid map if the common dimension of matmul not found. 
- return exp.isFunctionOfDim(bcastMap.getNumDims() - 1); -} - -ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) { - return parseNamedStructuredOp(parser, result, MatmulOp::getNumRegionArgs(), - MatmulOp::getRegionBuilder()); -} -void MatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); - - SmallVector indexingMaps = - getDefaultIndexingMapAttr(getContext()); - if (!llvm::equal(getIndexingMaps(), indexingMaps)) { - p << " indexing_maps = ["; - llvm::interleaveComma(getIndexingMaps(), p, - [&](Attribute attr) { p.printAttribute(attr); }); - p << "]"; - } -} - -/// Verify the user defined indexing maps. -LogicalResult MatmulOp::verify() { - // Verification of pure matmul is handled by verifyStructuredOpInterface(). - if (!hasUserDefinedMaps()) - return success(); - - for (unsigned opIndex = 0; opIndex < 2; opIndex++) { - if (failed(verifyExtendedMatmulSemantic(*this, opIndex))) - return failure(); - } - return success(); -} - -LogicalResult MatmulOp::fold(FoldAdaptor, SmallVectorImpl &) { - return memref::foldMemRefCast(*this); -} -void MatmulOp::getEffects( - SmallVectorImpl> - &effects) { - if (hasPureTensorSemantics()) - return; - getGenericEffectsImpl(effects, cast(getOperation())); -} - -Speculation::Speculatability MatmulOp::getSpeculatability() { - return getGenericSpeculatabilityImpl(cast(getOperation())); -} - -} // namespace linalg -} // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp index 6b934f7e8157d4..aa0052ce47fa7b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp @@ -31,13 +31,6 @@ using namespace mlir::linalg; FailureOr mlir::linalg::transposeMatmul(RewriterBase &rewriter, 
linalg::MatmulOp matmulOp, bool transposeLHS) { - // Check to not let go the matmul with extended semantic, through this - // transform. - if (matmulOp.hasUserDefinedMaps()) { - return rewriter.notifyMatchFailure( - matmulOp, "only matmul ops with non-extended semantics are supported"); - } - if (!bufferization::hasTensorSemantics(matmulOp)) return rewriter.notifyMatchFailure( matmulOp, "only matmul ops with tensors are supported"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index e3f010d9cfb20b..09c6b2683b4388 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2071,11 +2071,6 @@ vectorizeScalableVectorPrecondition(Operation *op, return failure(); } - // Check to not let go the matmul with extended semantic, through this - // transform. - if (linalgOp.hasUserDefinedMaps()) - return failure(); - // Cond 4: Only the following ops are supported in the // presence of scalable vectors return success(isElementwise(linalgOp) || isa(op) || diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 3c508ed6e324b2..0c2275bbc4b224 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -821,12 +821,6 @@ DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne( bool fail = true; // TODO: more robust detection of matmulOp, with transposes etc. if (isa_and_nonnull(linalgOp.getOperation())) { - // Check to not let go the matmul with extended semantic, through this - // transform. - if (linalgOp.hasUserDefinedMaps()) { - return emitSilenceableError() - << "only matmul ops with non-extended semantics are supported"; - } Location loc = linalgOp.getLoc(); // TODO: more robust computation of laneId, for now assume a single warp. 
Value laneId = rewriter.create( diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index d5e79b4d3cb6dd..e4a6ec7487bb2f 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -383,6 +383,23 @@ def select( O[None] = TernaryFn.select(cond[None], lhs[None], rhs[None]) +@linalg_structured_op +def matmul( + A=TensorDef(T1, S.M, S.K), + B=TensorDef(T2, S.K, S.N), + C=TensorDef(U, S.M, S.N, output=True), + cast=TypeFnAttrDef(default=TypeFn.cast_signed), +): + """Performs a matrix multiplication of two 2D inputs. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. + """ + domain(D.m, D.n, D.k) + implements(ContractionOpInterface) + C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n]) + + @linalg_structured_op def quantized_matmul( A=TensorDef(T1, S.M, S.K), diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index aba26c35931fd3..1e8f1435ca0fa5 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -29,34 +29,6 @@ func.func @generalize_matmul_buffer(%A : memref<16x8xf32>, %B: memref<8x32xf32>, // ----- -func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func.func @matmul_bcast_a( 
-// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) { -// CHECK: ^bb0(%[[VAL_3:.*]]: f32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: f32): -// CHECK: %[[VAL_6:.*]] = arith.mulf %[[VAL_3]], %[[VAL_4]] : f32 -// CHECK: %[[VAL_7:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32 -// CHECK: linalg.yield %[[VAL_7]] : f32 -// CHECK: } -// CHECK: return -// CHECK: } - -// ----- - func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>) outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> @@ -919,86 +891,3 @@ func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor, tensor, tensor> } - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) - outs(%arg2: 
memref<3x7xf32>) - - return -} - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func.func @matmul_transpose_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// 
----- - diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index b2869893b8042d..c481a723c5623c 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -361,165 +361,6 @@ func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, // ----- -func.func @invalid_indexing_maps_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) { - // expected-error @+1 {{expected attribute value}} - linalg.matmul indexing_maps = [ - , - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>) - outs(%arg2 :memref<2x4xf32>) - return -} - -// ----- - -func.func @invalid_matmul_dim_a(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { - // expected-error @+1 {{Unexpected dim expression in map result}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) - return -} - -// ----- - -func.func @invalid_matmul_dim_b(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { - // expected-error @+1 {{Unexpected dim expression in map result}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) - return -} - -// ----- - -func.func @invalid_transpose_a_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 1}} - %0 = linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - 
affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) - outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> - return %0: tensor<4x64xf32> -} - -// ----- - -func.func @invalid_transpose_b_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #1 to be 1, but found 64}} - %0 = linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) - outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> - return %0: tensor<4x64xf32> -} - -// ----- - -func.func @invalid_bcast_a(%arg0: memref<3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_a_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #0 (1)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, 
d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_b_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #1 (1)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 5, but found 7}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_matmul_bcast_b_transpose_a_wrong_dim(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Unexpected dim expression in map result.}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_indexing_maps_placement_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) { - // expected-error @+2 {{custom op 'indexing_maps' is unknown (tried 'func.indexing_maps' as well)}} - linalg.matmul ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) outs(%init : tensor<4x64xf32>) - indexing_maps = [ - affine_map<(d0, d1, d2) -> 
(d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - return -} - -// ----- - func.func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2x1xf32>, %output: memref<1x2x3x1xf32>) { // expected-error @+1 {{inferred input/output operand #0 has shape's dimension #1 to be greater than or equal to 4, but found 3}} linalg.conv_2d_nhwc_hwcf diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 65c18de8424771..02ecbed232c8b5 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1201,249 +1201,6 @@ func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %a // ----- -// CHECK-LABEL: func @matmul_transpose_a_explicit -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) -func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// ----- - -func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: 
memref<3x5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_a -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : 
memref<5xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_a_dim1 -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_b -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: 
#[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_a_b( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_b_dim1 -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @dynamic_matmul_bcast_a(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @dynamic_matmul_bcast_a( -// CHECK-SAME: %[[VAL_0:.*]]: memref, -// CHECK-SAME: %[[VAL_1:.*]]: memref, -// CHECK-SAME: %[[VAL_2:.*]]: memref) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) outs(%[[VAL_2]] : memref) indexing_maps = [#[[$ATTR_0]], 
#[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_a_transpose_b( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_b_transpose_a( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// 
CHECK: return -// CHECK: } - -// ----- - // CHECK-LABEL: func @matmul_transpose_b // CHECK: linalg.matmul_transpose_b // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>) diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 72045a07b2da80..3bfbcf7d7f7c81 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -84,6 +84,81 @@ def named_form(lhs, rhs): print(module) + +# CHECK-LABEL: TEST: testNamedStructuredOpGenericForm +@run +def testNamedStructuredOpGenericForm(): + with Context() as ctx, Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def named_form(lhs, rhs): + init_result = tensor.empty([4, 8], f32) + # CHECK: "linalg.matmul"(%{{.*}}) + # CHECK-SAME: cast = #linalg.type_fn + # CHECK-SAME: operandSegmentSizes = array + # CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32): + # CHECK-NEXT: arith.mulf{{.*}} (f32, f32) -> f32 + # CHECK-NEXT: arith.addf{{.*}} (f32, f32) -> f32 + # CHECK-NEXT: linalg.yield{{.*}} (f32) -> () + # CHECK-NEXT: (tensor<4x16xf32>, tensor<16x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> + return linalg.matmul(lhs, rhs, outs=[init_result]) + + module.operation.print(print_generic_op_form=True) + + +# CHECK-LABEL: TEST: testNamedStructuredAsGenericOp +@run +def testNamedStructuredAsGenericOp(): + with Context() as ctx, Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def generic_form(lhs, rhs): + init_result = tensor.EmptyOp([4, 8], f32) + # CHECK: linalg.generic + return linalg.matmul( + lhs, rhs, outs=[init_result.result], emit_generic=True + ) + + print(module) + + +# CHECK-LABEL: TEST: 
testOpResultFromOtherOp +@run +def testOpResultFromOtherOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def pass_an_op_directly(arg0, arg1): + one = arith.ConstantOp(F32Type.get(), 1.0) + # CHECK: %[[LHS:.*]] = linalg.fill + lhs = linalg.fill(one, outs=[arg0]) + # CHECK: %[[RHS:.*]] = linalg.fill + rhs = linalg.fill(one, outs=[arg1]) + # CHECK: %[[INIT:.*]] = tensor.empty + init = tensor.EmptyOp([4, 8], f32) + # CHECK: linalg.matmul + # CHECK: ins(%[[LHS]], %[[RHS]] + # CHECK: outs(%[[INIT]] + return linalg.matmul(lhs, rhs, outs=init) + + print(module) + + # CHECK-LABEL: TEST: testIdentityRegionOps @run def testIdentityRegionOps(): diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index f820cb7ee8c3c4..aa5a52a21f1251 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -681,11 +681,7 @@ ParseResult {0}::parse(OpAsmParser &parser, OperationState &result) {{ {0}::getNumRegionArgs(), {0}::getRegionBuilder()); } void {0}::print(OpAsmPrinter &p) {{ - SmallVector elidedAttrs = {{"operandSegmentSizes", - "linalg.memoized_indexing_maps", - "indexing_maps"}; - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs()); } )FMT"; From a4402039bffd788b9af82435fd5a2fb311fdc6e8 Mon Sep 17 00:00:00 2001 From: Sebastian Kreutzer Date: Fri, 11 Oct 2024 05:23:34 -0400 Subject: [PATCH 143/177] [XRay] Add support for instrumentation of DSOs on x86_64 (#90959) This PR introduces shared library (DSO) support for XRay based on a revised version of the implementation outlined in [this 
RFC](https://discourse.llvm.org/t/rfc-upstreaming-dso-instrumentation-support-for-xray/73000). The feature enables the patching and handling of events from DSOs, supporting both libraries linked at startup or explicitly loaded, e.g. via `dlopen`. This patch adds the following: - The `-fxray-shared` flag to enable the feature (turned off by default) - A small runtime library that is linked into every instrumented DSO, providing position-independent trampolines and code to register with the main XRay runtime - Changes to the XRay runtime to support management and patching of multiple objects These changes are fully backward compatible, i.e. running without instrumented DSOs will produce identical traces (in terms of recorded function IDs) to the previous implementation. Due to my limited ability to test on other architectures, this feature is only implemented and tested with x86_64. Extending support to other architectures is fairly straightforward, requiring only a position-independent implementation of the architecture-specific trampoline implementation (see `compiler-rt/lib/xray/xray_trampoline_x86_64.S` for reference). This patch does not include any functionality to resolve function IDs from DSOs for the provided logging/tracing modes. These modes still work and will record calls from DSOs, but symbol resolution for these functions in not available. Getting this to work properly requires recording information about the loaded DSOs and should IMO be discussed in a separate RFC, as there are mulitple feasible approaches. 
@petrhosek @jplehr --- clang/include/clang/Basic/CodeGenOptions.def | 2 + clang/include/clang/Driver/Options.td | 5 + clang/include/clang/Driver/XRayArgs.h | 4 + clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 +- clang/lib/Driver/XRayArgs.cpp | 21 ++ clang/test/Driver/XRay/xray-shared.cpp | 17 + .../cmake/Modules/AllSupportedArchDefs.cmake | 1 + compiler-rt/cmake/config-ix.cmake | 4 + compiler-rt/include/xray/xray_interface.h | 55 +++- compiler-rt/lib/xray/CMakeLists.txt | 86 +++++- compiler-rt/lib/xray/xray_dso_init.cpp | 62 ++++ compiler-rt/lib/xray/xray_init.cpp | 183 +++++++++-- compiler-rt/lib/xray/xray_interface.cpp | 292 ++++++++++++++---- .../lib/xray/xray_interface_internal.h | 83 ++++- compiler-rt/lib/xray/xray_trampoline_x86_64.S | 24 +- compiler-rt/lib/xray/xray_x86_64.cpp | 23 +- .../xray/TestCases/Posix/basic-mode-dso.cpp | 47 +++ .../TestCases/Posix/clang-xray-shared.cpp | 14 + .../test/xray/TestCases/Posix/dlopen.cpp | 107 +++++++ .../xray/TestCases/Posix/dso-dep-chains.cpp | 197 ++++++++++++ .../TestCases/Posix/patch-premain-dso.cpp | 45 +++ .../Posix/patching-unpatching-dso.cpp | 75 +++++ 22 files changed, 1215 insertions(+), 144 deletions(-) create mode 100644 clang/test/Driver/XRay/xray-shared.cpp create mode 100644 compiler-rt/lib/xray/xray_dso_init.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/dlopen.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index eac831278ee20d..e45370bde74a5d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ 
b/clang/include/clang/Basic/CodeGenOptions.def @@ -136,6 +136,8 @@ CODEGENOPT(XRayIgnoreLoops , 1, 0) ///< Emit the XRay function index section. CODEGENOPT(XRayFunctionIndex , 1, 1) +///< Set when -fxray-shared is enabled +CODEGENOPT(XRayShared , 1, 0) ///< Set the minimum number of instructions in a function to determine selective ///< XRay instrumentation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d306c751505e98..4ee16e213d0e13 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2946,6 +2946,11 @@ def fxray_selected_function_group : HelpText<"When using -fxray-function-groups, select which group of functions to instrument. Valid range is 0 to fxray-function-groups - 1">, MarshallingInfoInt, "0">; +defm xray_shared : BoolFOption<"xray-shared", + CodeGenOpts<"XRayShared">, DefaultFalse, + PosFlag, + NegFlag>; defm fine_grained_bitfield_accesses : BoolOption<"f", "fine-grained-bitfield-accesses", CodeGenOpts<"FineGrainedBitfieldAccesses">, DefaultFalse, diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index bdd3d979547eed..8fbcf469e5bad1 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -27,6 +27,7 @@ class XRayArgs { XRayInstrSet InstrumentationBundle; llvm::opt::Arg *XRayInstrument = nullptr; bool XRayRT = true; + bool XRayShared = false; public: /// Parses the XRay arguments from an argument list. 
@@ -35,6 +36,9 @@ class XRayArgs { llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } + bool needsXRayDSORt() const { + return XRayInstrument && XRayRT && XRayShared; + } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0c6a585c3acffd..0a1b7c209563e8 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1613,10 +1613,14 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_shared)) - return false; - - if (TC.getXRayArgs().needsXRayRt()) { + if (Args.hasArg(options::OPT_shared)) { + if (TC.getXRayArgs().needsXRayDSORt()) { + CmdArgs.push_back("--whole-archive"); + CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); + CmdArgs.push_back("--no-whole-archive"); + return true; + } + } else if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 8c5134e2501358..411054e067cb42 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,6 +63,23 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } + if (Args.hasFlag(options::OPT_fxray_shared, + options::OPT_fno_xray_shared, false)) { + XRayShared = true; + + // DSO instrumentation is currently limited to x86_64 + if (Triple.getArch() != llvm::Triple::x86_64) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << "-fxray-shared" << Triple.str(); + } + 
+ unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); + if (!PICLvl) { + D.Diag(diag::err_opt_not_valid_without_opt) + << "-fxray-shared" << "-fPIC"; + } + } + // Both XRay and -fpatchable-function-entry use // TargetOpcode::PATCHABLE_FUNCTION_ENTER. if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) @@ -177,6 +194,10 @@ void XRayArgs::addArgs(const ToolChain &TC, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fxray_function_index, options::OPT_fno_xray_function_index); + if (XRayShared) + Args.addOptInFlag(CmdArgs, options::OPT_fxray_shared, + options::OPT_fno_xray_shared); + if (const Arg *A = Args.getLastArg(options::OPT_fxray_instruction_threshold_EQ)) { int Value; diff --git a/clang/test/Driver/XRay/xray-shared.cpp b/clang/test/Driver/XRay/xray-shared.cpp new file mode 100644 index 00000000000000..215854e1fc7cef --- /dev/null +++ b/clang/test/Driver/XRay/xray-shared.cpp @@ -0,0 +1,17 @@ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fpic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-PIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-pic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC + +// On 64 bit darwin, PIC is always enabled +// RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s + +// Check unsupported targets +// RUN: not %clang -### --target=aarch64-pc-freebsd -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s 
--check-prefix=ERR-TARGET +// RUN: not %clang -### --target=arm64-apple-macos -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET + +// CHECK: "-cc1" {{.*}}"-fxray-instrument" {{.*}}"-fxray-shared" +// ERR-TARGET: error: unsupported option '-fxray-shared' for target +// ERR-PIC: error: option '-fxray-shared' cannot be specified without '-fPIC' + diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index 809e9277156912..50a4256b82fe4e 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -104,6 +104,7 @@ else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le ${HEXAGON} ${LOONGARCH64}) endif() +set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index a93a88a9205001..6134c9876b38e9 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -668,6 +668,9 @@ if(APPLE) list_intersect(XRAY_SUPPORTED_ARCH ALL_XRAY_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) + list_intersect(XRAY_DSO_SUPPORTED_ARCH + ALL_XRAY_DSO_SUPPORTED_ARCH + SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH ALL_SHADOWCALLSTACK_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -702,6 +705,7 @@ else() filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH}) filter_available_targets(SCUDO_STANDALONE_SUPPORTED_ARCH ${ALL_SCUDO_STANDALONE_SUPPORTED_ARCH}) filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH}) + filter_available_targets(XRAY_DSO_SUPPORTED_ARCH ${ALL_XRAY_DSO_SUPPORTED_ARCH}) filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH 
${ALL_GWP_ASAN_SUPPORTED_ARCH}) diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 727431c04e4f73..717cfe292ce416 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,31 +93,74 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points. See XRayPatchingStatus +/// This tells XRay to patch the instrumentation points in all currently loaded objects. See XRayPatchingStatus /// for possible result values. extern XRayPatchingStatus __xray_patch(); +/// This tells XRay to patch the instrumentation points in the given object. +/// See XRayPatchingStatus for possible result values. +extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); + /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// This patches a specific function id. See XRayPatchingStatus for possible +/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for possible +/// result values. +extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); + +/// This unpacks the given (packed) function id and patches +/// the corresponding function. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This unpatches a specific function id. See XRayPatchingStatus for possible +/// This patches a specific function in the given object. See XRayPatchingStatus for possible +/// result values. +extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId); + +/// This unpacks the given (packed) function id and unpatches +/// the corresponding function. See XRayPatchingStatus for possible /// result values. 
extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); -/// This function returns the address of the function provided a valid function -/// id. We return 0 if we encounter any error, even if 0 may be a valid function +/// This unpatches a specific function in the given object. +/// See XRayPatchingStatus for possible result values. +extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId); + +/// This function unpacks the given (packed) function id and returns the address of the corresponding function. We return 0 if we encounter any error, even if 0 may be a valid function /// address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the maximum valid function id. Returns 0 if we +/// This function returns the address of the function in the given object provided valid function and object +/// ids. We return 0 if we encounter any error, even if 0 may be a valid function +/// address. +extern uintptr_t __xray_function_address_in_object(int32_t FuncId, + int32_t ObjId); + +/// This function returns the maximum valid function id for the main executable (object id = 0). Returns 0 if we /// encounter errors (when there are no instrumented functions, etc.). extern size_t __xray_max_function_id(); +/// This function returns the maximum valid function id for the given object. Returns 0 if we +/// encounter errors (when there are no instrumented functions, etc.). +extern size_t __xray_max_function_id_in_object(int32_t ObjId); + +/// This function returns the number of previously registered objects (executable + loaded DSOs). +/// Returns 0 if XRay has not been initialized. +extern size_t __xray_num_objects(); + +/// Unpacks the function id from the given packed id. +extern int32_t __xray_unpack_function_id(int32_t PackedId); + +/// Unpacks the object id from the given packed id. 
+extern int32_t __xray_unpack_object_id(int32_t PackedId); + +/// Creates and returns a packed id from the given function and object ids. +/// If the ids do not fit within the reserved number of bits for each part, the high bits are truncated. +extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); + /// Initialize the required XRay data structures. This is useful in cases where /// users want to control precisely when the XRay instrumentation data /// structures are initialized, for example when the XRay library is built with diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index cf7b5062aae32d..f38c07420c9abf 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -10,6 +10,10 @@ set(XRAY_SOURCES xray_utils.cpp ) +set(XRAY_DSO_SOURCES + xray_dso_init.cpp + ) + # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cpp @@ -33,6 +37,11 @@ set(x86_64_SOURCES xray_trampoline_x86_64.S ) +set(x86_64_DSO_SOURCES + xray_trampoline_x86_64.S + ) + + set(arm_SOURCES xray_arm.cpp xray_trampoline_arm.S @@ -128,10 +137,12 @@ set(XRAY_IMPL_HEADERS # consumption by tests. set(XRAY_ALL_SOURCE_FILES ${XRAY_SOURCES} + ${XRAY_DSO_SOURCES} ${XRAY_FDR_MODE_SOURCES} ${XRAY_BASIC_MODE_SOURCES} ${XRAY_PROFILING_MODE_SOURCES} ${x86_64_SOURCES} + ${x86_64_DSO_SOURCES} ${arm_SOURCES} ${armhf_SOURCES} ${hexagon_SOURCES} @@ -162,6 +173,9 @@ set(XRAY_CFLAGS ${COMPILER_RT_CXX_CFLAGS}) set(XRAY_COMMON_DEFINITIONS SANITIZER_COMMON_NO_REDEFINE_BUILTINS XRAY_HAS_EXCEPTIONS=1) +# DSO trampolines need to be compiled with GOT addressing +set(XRAY_COMMON_DEFINITIONS_DSO ${XRAY_COMMON_DEFINITIONS} XRAY_PIC) + # Too many existing bugs, needs cleanup. 
append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format XRAY_CFLAGS) @@ -201,7 +215,16 @@ if (APPLE) CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayDSO + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + SOURCES ${XRAY_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) set(XRAY_RTXRAY_ARCH_LIBS "") + set(XRAY_DSO_RTXRAY_ARCH_LIBS "") foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT ${arch} IN_LIST XRAY_SOURCE_ARCHS) continue() @@ -215,6 +238,17 @@ if (APPLE) DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) list(APPEND XRAY_RTXRAY_ARCH_LIBS RTXray_${arch}) + if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) + add_compiler_rt_object_libraries(RTXrayDSO_${arch} + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + SOURCES ${${arch}_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) + list(APPEND XRAY_DSO_RTXRAY_ARCH_LIBS RTXrayDSO_${arch}) + endif() endforeach() add_compiler_rt_object_libraries(RTXrayFDR OS ${XRAY_SUPPORTED_OS} @@ -252,6 +286,17 @@ if (APPLE) LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-dso + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayDSO ${XRAY_DSO_RTXRAY_ARCH_LIBS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-fdr STATIC OS ${XRAY_SUPPORTED_OS} @@ -346,16 +391,37 @@ else() # not Apple DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) - # Profiler Mode runtime - add_compiler_rt_runtime(clang_rt.xray-profiling - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS 
${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayPROFILING - PARENT_TARGET xray) + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) + + if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) + # TODO: Only implemented for X86 at the moment + add_compiler_rt_object_libraries(RTXrayDSO + ARCHS ${arch} + SOURCES ${XRAY_DSO_SOURCES} ${${arch}_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) + # DSO runtime archive + add_compiler_rt_runtime(clang_rt.xray-dso + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayDSO + PARENT_TARGET xray) + endif() endforeach() endif() # not Apple diff --git a/compiler-rt/lib/xray/xray_dso_init.cpp b/compiler-rt/lib/xray/xray_dso_init.cpp new file mode 100644 index 00000000000000..eb754db54c64fa --- /dev/null +++ b/compiler-rt/lib/xray/xray_dso_init.cpp @@ -0,0 +1,62 @@ +//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay initialisation logic for DSOs. 
+//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_atomic.h" +#include "xray_defs.h" +#include "xray_flags.h" +#include "xray_interface_internal.h" + +using namespace __sanitizer; + +extern "C" { +extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)) +__attribute__((visibility("hidden"))); + +#if SANITIZER_APPLE +// HACK: This is a temporary workaround to make XRay build on +// Darwin, but it will probably not work at runtime. +extern const XRaySledEntry __start_xray_instr_map[] = {}; +extern const XRaySledEntry __stop_xray_instr_map[] = {}; +extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; +#endif +} + +// Handler functions to call in the patched entry/exit sled. +extern atomic_uintptr_t XRayPatchedFunction; +extern atomic_uintptr_t XRayArgLogger; +extern atomic_uintptr_t XRayPatchedCustomEvent; +extern atomic_uintptr_t XRayPatchedTypedEvent; + +static int __xray_object_id{-1}; + +// Note: .preinit_array initialization does not work for DSOs +__attribute__((constructor(0))) static void +__xray_init_dso() XRAY_NEVER_INSTRUMENT { + // Register sleds in main XRay runtime. + __xray_object_id = + __xray_register_dso(__start_xray_instr_map, __stop_xray_instr_map, + __start_xray_fn_idx, __stop_xray_fn_idx, {}); +} + +__attribute__((destructor(0))) static void +__xray_finalize_dso() XRAY_NEVER_INSTRUMENT { + // Inform the main runtime that this DSO is no longer used. 
+ __xray_deregister_dso(__xray_object_id); +} diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index f22a31b95686d0..53c93be89cd148 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -16,6 +16,8 @@ #include #include "sanitizer_common/sanitizer_common.h" +#include "xray/xray_interface.h" +#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" @@ -28,7 +30,7 @@ extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); #if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on +// HACK: This is a temporary workaround to make XRay build on // Darwin, but it will probably not work at runtime. const XRaySledEntry __start_xray_instr_map[] = {}; extern const XRaySledEntry __stop_xray_instr_map[] = {}; @@ -43,14 +45,16 @@ using namespace __xray; // the weak symbols defined above (__start_xray_inst_map and // __stop_xray_instr_map) to initialise the instrumentation map that XRay uses // for runtime patching/unpatching of instrumentation points. -// -// FIXME: Support DSO instrumentation maps too. The current solution only works -// for statically linked executables. atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. SpinMutex XRayInstrMapMutex; -XRaySledMap XRayInstrMap; + +// Contains maps for the main executable as well as DSOs. +XRaySledMap *XRayInstrMaps; + +// Number of binary objects registered. +atomic_uint32_t XRayNumObjects{0}; // Global flag to determine whether the flags have been initialized. atomic_uint8_t XRayFlagsInitialized{0}; @@ -58,6 +62,63 @@ atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. 
SpinMutex XRayInitMutex; +// Registers XRay sleds and trampolines coming from the main executable or one +// of the linked DSOs. +// Returns the object ID if registration is successful, -1 otherwise. +int32_t +__xray_register_sleds(const XRaySledEntry *SledsBegin, + const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, bool FromDSO, + XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { + if (!SledsBegin || !SledsEnd) { + Report("Invalid XRay sleds.\n"); + return -1; + } + XRaySledMap SledMap; + SledMap.FromDSO = FromDSO; + SledMap.Loaded = true; + SledMap.Trampolines = Trampolines; + SledMap.Sleds = SledsBegin; + SledMap.Entries = SledsEnd - SledsBegin; + if (FnIndexBegin != nullptr) { + SledMap.SledsIndex = FnIndexBegin; + SledMap.Functions = FnIndexEnd - FnIndexBegin; + } else { + size_t CountFunctions = 0; + uint64_t LastFnAddr = 0; + + for (std::size_t I = 0; I < SledMap.Entries; I++) { + const auto &Sled = SledMap.Sleds[I]; + const auto Function = Sled.function(); + if (Function != LastFnAddr) { + CountFunctions++; + LastFnAddr = Function; + } + } + SledMap.SledsIndex = nullptr; + SledMap.Functions = CountFunctions; + } + if (SledMap.Functions >= XRayMaxFunctions) { + Report("Too many functions! Maximum is %ld\n", XRayMaxFunctions); + return -1; + } + + if (Verbosity()) + Report("Registering %d new functions!\n", SledMap.Functions); + + { + SpinMutexLock Guard(&XRayInstrMapMutex); + auto Idx = atomic_fetch_add(&XRayNumObjects, 1, memory_order_acq_rel); + if (Idx >= XRayMaxObjects) { + Report("Too many objects registered! Maximum is %ld\n", XRayMaxObjects); + return -1; + } + XRayInstrMaps[Idx] = std::move(SledMap); + return Idx; + } +} + // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. 
void __xray_init() XRAY_NEVER_INSTRUMENT { @@ -80,29 +141,21 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { return; } - { - SpinMutexLock Guard(&XRayInstrMapMutex); - XRayInstrMap.Sleds = __start_xray_instr_map; - XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; - if (__start_xray_fn_idx != nullptr) { - XRayInstrMap.SledsIndex = __start_xray_fn_idx; - XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; - } else { - size_t CountFunctions = 0; - uint64_t LastFnAddr = 0; - - for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { - const auto &Sled = XRayInstrMap.Sleds[I]; - const auto Function = Sled.function(); - if (Function != LastFnAddr) { - CountFunctions++; - LastFnAddr = Function; - } - } + atomic_store(&XRayNumObjects, 0, memory_order_release); - XRayInstrMap.Functions = CountFunctions; - } + // Pre-allocation takes up approx. 5kB for XRayMaxObjects=64. + XRayInstrMaps = allocateBuffer(XRayMaxObjects); + + int MainBinaryId = + __xray_register_sleds(__start_xray_instr_map, __stop_xray_instr_map, + __start_xray_fn_idx, __stop_xray_fn_idx, false, {}); + + // The executable should always get ID 0. + if (MainBinaryId != 0) { + Report("Registering XRay sleds failed.\n"); + return; } + atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT @@ -111,6 +164,84 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } +// Registers XRay sleds and trampolines of an instrumented DSO. +// Returns the object ID if registration is successful, -1 otherwise. +// +// Default visibility is hidden, so we have to explicitly make it visible to +// DSO. +SANITIZER_INTERFACE_ATTRIBUTE int32_t __xray_register_dso( + const XRaySledEntry *SledsBegin, const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, + XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { + // Make sure XRay has been initialized in the main executable. 
+ __xray_init(); + + if (__xray_num_objects() == 0) { + if (Verbosity()) + Report("No XRay instrumentation map in main executable. Not initializing " + "XRay for DSO.\n"); + return -1; + } + + // Register sleds in global map. + int ObjId = __xray_register_sleds(SledsBegin, SledsEnd, FnIndexBegin, + FnIndexEnd, true, Trampolines); + +#ifndef XRAY_NO_PREINIT + if (ObjId >= 0 && flags()->patch_premain) + __xray_patch_object(ObjId); +#endif + + return ObjId; +} + +// Deregisters a DSO from the main XRay runtime. +// Called from the DSO-local runtime when the library is unloaded (e.g. if +// dlclose is called). +// Returns true if the object ID is valid and the DSO was successfully +// deregistered. +SANITIZER_INTERFACE_ATTRIBUTE bool +__xray_deregister_dso(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + + if (!atomic_load(&XRayInitialized, memory_order_acquire)) { + if (Verbosity()) + Report("XRay has not been initialized. Cannot deregister DSO.\n"); + return false; + } + + if (ObjId <= 0 || ObjId >= __xray_num_objects()) { + if (Verbosity()) + Report("Can't deregister object with ID %d: ID is invalid.\n", ObjId); + return false; + } + + { + SpinMutexLock Guard(&XRayInstrMapMutex); + auto &Entry = XRayInstrMaps[ObjId]; + if (!Entry.FromDSO) { + if (Verbosity()) + Report("Can't deregister object with ID %d: object does not correspond " + "to a shared library.\n", + ObjId); + return false; + } + if (!Entry.Loaded) { + if (Verbosity()) + Report("Can't deregister object with ID %d: object is not loaded.\n", + ObjId); + return true; + } + // Mark DSO as unloaded. No need to unpatch. + Entry.Loaded = false; + } + + if (Verbosity()) + Report("Deregistered object with ID %d.\n", ObjId); + + return true; +} + // FIXME: Make check-xray tests work on FreeBSD without // SANITIZER_CAN_USE_PREINIT_ARRAY. // See sanitizer_internal_defs.h where the macro is defined. 
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 5839043fcb93a8..16e60bfc22cd10 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "xray_interface_internal.h" +#include "llvm/Support/ErrorHandling.h" #include #include @@ -36,7 +37,8 @@ extern __sanitizer::SpinMutex XRayInstrMapMutex; extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap XRayInstrMap; +extern __xray::XRaySledMap *XRayInstrMaps; +extern __sanitizer::atomic_uint32_t XRayNumObjects; namespace __xray { @@ -61,16 +63,16 @@ static const int16_t cSledLength = 20; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -atomic_uintptr_t XRayPatchedFunction{0}; +atomic_uintptr_t XRayPatchedFunction SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -atomic_uintptr_t XRayArgLogger{0}; +atomic_uintptr_t XRayArgLogger SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call when we encounter a custom event log call. -atomic_uintptr_t XRayPatchedCustomEvent{0}; +atomic_uintptr_t XRayPatchedCustomEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call when we encounter a typed event log call. -atomic_uintptr_t XRayPatchedTypedEvent{0}; +atomic_uintptr_t XRayPatchedTypedEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the global status to determine whether we are currently // patching/unpatching. 
@@ -150,27 +152,42 @@ class MProtectHelper { namespace { -bool patchSled(const XRaySledEntry &Sled, bool Enable, - int32_t FuncId) XRAY_NEVER_INSTRUMENT { +bool isObjectLoaded(int32_t ObjId) { + SpinMutexLock Guard(&XRayInstrMapMutex); + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + return false; + } + return XRayInstrMaps[ObjId].Loaded; +} + +bool patchSled(const XRaySledEntry &Sled, bool Enable, int32_t FuncId, + const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); + Success = + patchFunctionEntry(Enable, FuncId, Sled, Trampolines.EntryTrampoline); break; case XRayEntryType::EXIT: - Success = patchFunctionExit(Enable, FuncId, Sled); + Success = + patchFunctionExit(Enable, FuncId, Sled, Trampolines.ExitTrampoline); break; case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled); + Success = patchFunctionTailExit(Enable, FuncId, Sled, + Trampolines.TailExitTrampoline); break; case XRayEntryType::LOG_ARGS_ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); + Success = + patchFunctionEntry(Enable, FuncId, Sled, Trampolines.LogArgsTrampoline); break; case XRayEntryType::CUSTOM_EVENT: - Success = patchCustomEvent(Enable, FuncId, Sled); + Success = patchCustomEvent(Enable, FuncId, Sled, + Trampolines.CustomEventTrampoline); break; case XRayEntryType::TYPED_EVENT: - Success = patchTypedEvent(Enable, FuncId, Sled); + Success = + patchTypedEvent(Enable, FuncId, Sled, Trampolines.TypedEventTrampoline); break; default: Report("Unsupported sled kind '%" PRIu64 "' @%04x\n", Sled.Address, @@ -205,10 +222,9 @@ findFunctionSleds(int32_t FuncId, return Index; } -XRayPatchingStatus patchFunction(int32_t FuncId, +XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, bool Enable) XRAY_NEVER_INSTRUMENT { - if 
(!atomic_load(&XRayInitialized, - memory_order_acquire)) + if (!atomic_load(&XRayInitialized, memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; @@ -220,13 +236,24 @@ XRayPatchingStatus patchFunction(int32_t FuncId, XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch function: invalid sled map index: %d", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; } // If we don't have an index, we can't patch individual functions. if (InstrMap.Functions == 0) return XRayPatchingStatus::NOT_INITIALIZED; + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Invalid function id provided: %d\n", FuncId); + return XRayPatchingStatus::NOT_INITIALIZED; + } + // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) { @@ -234,6 +261,8 @@ XRayPatchingStatus patchFunction(int32_t FuncId, return XRayPatchingStatus::FAILED; } + auto PackedId = __xray::MakePackedId(FuncId, ObjId); + // Now we patch ths sleds for this specific function. 
XRayFunctionSledIndex SledRange; if (InstrMap.SledsIndex) { @@ -242,13 +271,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, } else { SledRange = findFunctionSleds(FuncId, InstrMap); } + auto *f = SledRange.Begin; bool SucceedOnce = false; for (size_t i = 0; i != SledRange.Size; ++i) - SucceedOnce |= patchSled(f[i], Enable, FuncId); + SucceedOnce |= patchSled(f[i], Enable, PackedId, InstrMap.Trampolines); - atomic_store(&XRayPatching, false, - memory_order_release); + atomic_store(&XRayPatching, false, memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -261,32 +290,31 @@ XRayPatchingStatus patchFunction(int32_t FuncId, // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, - memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong( - &XRayPatching, &NotPatching, true, memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - uint8_t PatchingSuccess = false; - auto XRayPatchingStatusResetter = - at_scope_exit([&PatchingSuccess] { - if (!PatchingSuccess) - atomic_store(&XRayPatching, false, - memory_order_release); - }); - +// This function should only be called after ensuring that XRay is initialized +// and no other thread is currently patching. 
+XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch functions: invalid sled map index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; + if (Verbosity()) + Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); + + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Object is not loaded at index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + uint32_t FuncId = 1; uint64_t CurFun = 0; @@ -336,20 +364,96 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { ++FuncId; CurFun = F; } - patchSled(Sled, Enable, FuncId); + auto PackedId = __xray::MakePackedId(FuncId, ObjId); + patchSled(Sled, Enable, PackedId, InstrMap.Trampolines); } - atomic_store(&XRayPatching, false, - memory_order_release); - PatchingSuccess = true; + atomic_store(&XRayPatching, false, memory_order_release); return XRayPatchingStatus::SUCCESS; } -XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, +// Controls patching for all registered objects. +// Returns: SUCCESS, if patching succeeds for all objects. +// NOT_INITIALIZED, if one or more objects returned NOT_INITIALIZED +// but none failed. +// FAILED, if patching of one or more objects failed. +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&XRayInitialized, memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, + memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + auto XRayPatchingStatusResetter = at_scope_exit( + [] { atomic_store(&XRayPatching, false, memory_order_release); }); + + unsigned NumObjects = __xray_num_objects(); + + XRayPatchingStatus CombinedStatus{NOT_INITIALIZED}; + for (unsigned I = 0; I < NumObjects; ++I) { + if (!isObjectLoaded(I)) + continue; + auto LastStatus = controlPatchingObjectUnchecked(Enable, I); + switch (LastStatus) { + case SUCCESS: + if (CombinedStatus == NOT_INITIALIZED) + CombinedStatus = SUCCESS; + break; + case FAILED: + // Report failure, but try to patch the remaining objects + CombinedStatus = FAILED; + break; + case NOT_INITIALIZED: + // XRay has been initialized but there are no sleds available for this + // object. Try to patch remaining objects. + if (CombinedStatus != FAILED) + CombinedStatus = NOT_INITIALIZED; + break; + case ONGOING: + llvm_unreachable("Status ONGOING should not appear at this point"); + default: + llvm_unreachable("Unhandled patching status"); + } + } + return CombinedStatus; +} + +// Controls patching for one object. +XRayPatchingStatus controlPatching(bool Enable, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + + if (!atomic_load(&XRayInitialized, memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, + memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + auto XRayPatchingStatusResetter = at_scope_exit( + [] { atomic_store(&XRayPatching, false, memory_order_release); }); + + return controlPatchingObjectUnchecked(Enable, ObjId); +} + +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch function: invalid sled map index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; + } + + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Object is not loaded at index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; } // FuncId must be a positive number, less than the number of functions @@ -398,7 +502,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; } - return patchFunction(FuncId, Enable); + return patchFunction(FuncId, ObjId, Enable); } } // namespace @@ -412,12 +516,10 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 1; } return 0; @@ -425,11 +527,9 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 
1; } return 0; @@ -437,11 +537,9 @@ int __xray_set_customevent_handler(void (*entry)(void *, size_t)) int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedTypedEvent, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 1; } return 0; @@ -474,39 +572,78 @@ XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } +XRayPatchingStatus __xray_patch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return controlPatching(true, ObjId); +} + XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { return controlPatching(false); } +XRayPatchingStatus __xray_unpatch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return controlPatching(false, ObjId); +} + XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, true); + auto Ids = __xray::UnpackId(FuncId); + auto ObjId = Ids.first; + auto FnId = Ids.second; + return mprotectAndPatchFunction(FnId, ObjId, true); +} + +XRayPatchingStatus +__xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return mprotectAndPatchFunction(FuncId, ObjId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, false); + auto Ids = __xray::UnpackId(FuncId); + auto ObjId = Ids.first; + auto FnId = Ids.second; + return mprotectAndPatchFunction(FnId, ObjId, false); +} + +XRayPatchingStatus +__xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return mprotectAndPatchFunction(FuncId, ObjId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { - if (!atomic_load(&XRayInitialized, - memory_order_acquire)) + 
if (!atomic_load(&XRayInitialized, memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. atomic_store(&XRayArgLogger, reinterpret_cast(entry), - memory_order_release); + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } -uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { +uintptr_t +__xray_function_address(int32_t CombinedFuncId) XRAY_NEVER_INSTRUMENT { + auto Ids = __xray::UnpackId(CombinedFuncId); + return __xray_function_address_in_object(Ids.second, Ids.first); +} + +uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) + XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + auto count = atomic_load(&XRayNumObjects, memory_order_acquire); + if (ObjId < 0 || ObjId >= count) { + Report("Unable to determine function address: invalid sled map index %d " + "(size is %d)\n", + ObjId, (int)count); + return 0; + } + InstrMap = XRayInstrMaps[ObjId]; } if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) @@ -525,6 +662,29 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { + return __xray_max_function_id_in_object(0); +} + +size_t __xray_max_function_id_in_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + SpinMutexLock Guard(&XRayInstrMapMutex); + if (ObjId < 0 || ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) + return 0; + return XRayInstrMaps[ObjId].Functions; +} + +size_t __xray_num_objects() XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); - return XRayInstrMap.Functions; + return atomic_load(&XRayNumObjects, memory_order_acquire); +} + +int32_t __xray_unpack_function_id(int32_t 
PackedId) { + return __xray::UnpackId(PackedId).second; +} + +int32_t __xray_unpack_object_id(int32_t PackedId) { + return __xray::UnpackId(PackedId).first; +} + +int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId) { + return __xray::MakePackedId(FuncId, ObjId); } diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index 80c07c167f6461..5fbaa9c3f315b1 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -18,6 +18,18 @@ #include "xray/xray_interface.h" #include #include +#include + +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} extern "C" { @@ -67,36 +79,77 @@ struct XRayFunctionSledIndex { uintptr_t(Begin)); } }; + +struct XRayTrampolines { + void (*EntryTrampoline)(); + void (*ExitTrampoline)(); + void (*TailExitTrampoline)(); + void (*LogArgsTrampoline)(); + void (*CustomEventTrampoline)(); + void (*TypedEventTrampoline)(); + + XRayTrampolines() { + // These resolve to the definitions in the respective executable or DSO. 
+ EntryTrampoline = __xray_FunctionEntry; + ExitTrampoline = __xray_FunctionExit; + TailExitTrampoline = __xray_FunctionTailExit; + LogArgsTrampoline = __xray_ArgLoggerEntry; + CustomEventTrampoline = __xray_CustomEvent; + TypedEventTrampoline = __xray_TypedEvent; + } +}; + +extern int32_t __xray_register_dso(const XRaySledEntry *SledsBegin, + const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, + XRayTrampolines Trampolines); + +extern bool __xray_deregister_dso(int32_t ObjId); } namespace __xray { +constexpr uint32_t XRayNFnBits = 24; +constexpr uint32_t XRayNObjBits = 8; + +constexpr uint32_t XRayFnBitMask = 0x00FFFFFF; +constexpr uint32_t XRayObjBitMask = 0xFF000000; + +constexpr size_t XRayMaxFunctions = 1 << XRayNFnBits; +constexpr size_t XRayMaxObjects = 1 << XRayNObjBits; + +inline int32_t MakePackedId(int32_t FnId, int32_t ObjId) { + return ((ObjId << XRayNFnBits) & XRayObjBitMask) | (FnId & XRayFnBitMask); +} + +inline std::pair UnpackId(int32_t PackedId) { + uint32_t ObjId = (PackedId & XRayObjBitMask) >> XRayNFnBits; + uint32_t FnId = PackedId & XRayFnBitMask; + return {ObjId, FnId}; +} + struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; const XRayFunctionSledIndex *SledsIndex; size_t Functions; + XRayTrampolines Trampolines; + bool FromDSO; + bool Loaded; }; bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled); -bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); -bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); + const XRaySledEntry &Sled, void (*Trampoline)()); +bool patchCustomEvent(bool 
Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); } // namespace __xray -extern "C" { -// The following functions have to be defined in assembler, on a per-platform -// basis. See xray_trampoline_*.S files for implementations. -extern void __xray_FunctionEntry(); -extern void __xray_FunctionExit(); -extern void __xray_FunctionTailExit(); -extern void __xray_ArgLoggerEntry(); -extern void __xray_CustomEvent(); -extern void __xray_TypedEvent(); -} - #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 01098f60eeab8b..0f480547b52cc6 100644 --- a/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -107,6 +107,16 @@ .section __TEXT,__text #endif +.macro LOAD_HANDLER_ADDR handler +#if !defined(XRAY_PIC) + movq ASM_SYMBOL(\handler)(%rip), %rax +#else + movq ASM_SYMBOL(\handler)@GOTPCREL(%rip), %rax + movq (%rax), %rax +#endif +.endm + + //===----------------------------------------------------------------------===// .globl ASM_SYMBOL(__xray_FunctionEntry) @@ -121,7 +131,7 @@ ASM_SYMBOL(__xray_FunctionEntry): // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. 
- movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax, %rax je LOCAL_LABEL(tmp0) @@ -159,7 +169,7 @@ ASM_SYMBOL(__xray_FunctionExit): movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax,%rax je LOCAL_LABEL(tmp2) @@ -195,7 +205,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): SAVE_REGISTERS ALIGN_STACK_16B - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax,%rax je LOCAL_LABEL(tmp4) @@ -224,12 +234,12 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): ALIGN_STACK_16B // Again, these function pointer loads must be atomic; MOV is fine. - movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray13XRayArgLoggerE testq %rax, %rax jne LOCAL_LABEL(arg1entryLog) // If [arg1 logging handler] not set, defer to no-arg logging. - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax, %rax je LOCAL_LABEL(arg1entryFail) @@ -268,7 +278,7 @@ ASM_SYMBOL(__xray_CustomEvent): // We take two arguments to this trampoline, which should be in rdi and rsi // already. - movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray22XRayPatchedCustomEventE testq %rax,%rax je LOCAL_LABEL(customEventCleanup) @@ -293,7 +303,7 @@ ASM_SYMBOL(__xray_TypedEvent): // We pass three arguments to this trampoline, which should be in rdi, rsi // and rdx without our intervention. 
- movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray21XRayPatchedTypedEventE testq %rax,%rax je LOCAL_LABEL(typedEventCleanup) diff --git a/compiler-rt/lib/xray/xray_x86_64.cpp b/compiler-rt/lib/xray/xray_x86_64.cpp index b9666a40861d48..663a51b2686614 100644 --- a/compiler-rt/lib/xray/xray_x86_64.cpp +++ b/compiler-rt/lib/xray/xray_x86_64.cpp @@ -170,7 +170,8 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -192,11 +193,11 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, // Prerequisite is to compute the relative offset fo the // __xray_FunctionExit function's address. const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(__xray_FunctionExit) - + int64_t TrampolineOffset = reinterpret_cast(Trampoline) - (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(__xray_FunctionExit), + reinterpret_cast(Trampoline), reinterpret_cast(Address)); return false; } @@ -217,16 +218,16 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, } bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the tail call sled with a similar // sequence as the entry sled, but calls the tail exit sled instead. 
const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = - reinterpret_cast(__xray_FunctionTailExit) - - (static_cast(Address) + 11); + int64_t TrampolineOffset = reinterpret_cast(Trampoline) - + (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(__xray_FunctionTailExit), + reinterpret_cast(Trampoline), reinterpret_cast(Address)); return false; } @@ -247,7 +248,8 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, } bool patchCustomEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -275,7 +277,8 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, } bool patchTypedEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp new file mode 100644 index 00000000000000..31c615bd1f81bf --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp @@ -0,0 +1,47 @@ +// Testing shared library support in basic logging mode. 
+ +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s +// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s +// RUN: rm basic-mode-dso-* + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); + sleep(1); +} + +extern void instrumented_in_dso(); + +int main() { + // Explicit patching to ensure the DSO has been loaded + __xray_patch(); + instrumented_in_executable(); + // CHECK: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called +} + +//--- testlib.cpp + +#include +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} + +// ACCOUNT: funcid,count,min,median,90%ile,99%ile,max,sum,debug,function +// ACCOUNT-NEXT: 1,1,{{.*}} +// ACCOUNT-NEXT: 16777217,1,{{.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp new file mode 100644 index 00000000000000..92f3c29e970d42 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp @@ -0,0 +1,14 @@ +// Test that the DSO-local runtime library has been linked if -fxray-shared is passed. 
+// +// RUN: %clangxx -fxray-instrument -fxray-shared %s -shared -o %t.so +// RUN: llvm-nm %t.so | FileCheck %s --check-prefix ENABLED + +// RUN: %clangxx -fxray-instrument %s -shared -o %t.so +// RUN: llvm-nm %t.so | FileCheck %s --check-prefix DISABLED +// +// REQUIRES: target=x86_64{{.*}} + +[[clang::xray_always_instrument]] int always_instrumented() { return 42; } + +// ENABLED: __start_xray_instr_map +// DISABLED-NOT: __start_xray_instr_map diff --git a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp new file mode 100644 index 00000000000000..9db411d5ff1c6e --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp @@ -0,0 +1,107 @@ +// Check that we can patch and un-patch DSOs loaded with dlopen. +// + +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp -o %t/main.o +// +// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlib.so 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +typedef void (*dso_func_type)(); + +int main(int argc, char **argv) { + if (argc < 2) { + printf("Shared library argument missing\n"); + // CHECK-NOT: Shared library argument missing + return 1; + } + + const char *dso_path = argv[1]; + + void *dso_handle = dlopen(dso_path, RTLD_LAZY); + if (!dso_handle) { + printf("Failed to load shared library\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + return 1; + } + return 1; + } + + dso_func_type instrumented_in_dso = + 
(dso_func_type)dlsym(dso_handle, "_Z19instrumented_in_dsov"); + if (!instrumented_in_dso) { + printf("Failed to find symbol\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + return 1; + } + return 1; + } + + __xray_set_handler(test_handler); + + instrumented_in_executable(); + // CHECK: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + + auto status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 + + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + + status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + + instrumented_in_executable(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + + dlclose(dso_handle); + + status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} diff --git a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp new file mode 100644 index 00000000000000..89da2764c35cee --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp @@ -0,0 +1,197 @@ +// Check that loading libraries with different modes (RTLD_LOCAL/RTLD_GLOBAL) +// and dependencies on other DSOs 
work correctly. +// + +// RUN: split-file %s %t +// +// Build shared libs with dependencies b->c and e->f +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testliba.cpp -o %t/testliba.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibc.cpp -o %t/testlibc.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibb.cpp %t/testlibc.so -o %t/testlibb.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibd.cpp -o %t/testlibd.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibf.cpp -o %t/testlibf.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibe.cpp %t/testlibf.so -o %t/testlibe.so +// +// Executable links with a and b explicitly and loads d and e at runtime. +// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testliba.so %t/testlibb.so -o %t/main.o +// +// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlibd.so %t/testlibe.so 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +[[clang::xray_never_instrument]] void test_handler(int32_t fid, + XRayEntryType type) { + printf("called: %d, object=%d, fn=%d, type=%d\n", fid, (fid >> 24) & 0xFF, + fid & 0x00FFFFFF, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +typedef void (*dso_func_type)(); + +[[clang::xray_never_instrument]] void *load_dso(const char *path, int mode) { + void *dso_handle = dlopen(path, mode); + if (!dso_handle) { + printf("failed to load shared library\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + } + return nullptr; + } + return dso_handle; +} + 
+[[clang::xray_never_instrument]] void find_and_call(void *dso_handle, + const char *fn) { + dso_func_type dso_fn = (dso_func_type)dlsym(dso_handle, fn); + if (!dso_fn) { + printf("failed to find symbol\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + } + return; + } + dso_fn(); +} + +extern void a(); +extern void b(); + +int main(int argc, char **argv) { + + if (argc < 3) { + printf("Shared library arguments missing\n"); + // CHECK-NOT: Shared library arguments missing + return 1; + } + + const char *dso_path_d = argv[1]; + const char *dso_path_e = argv[2]; + + __xray_set_handler(test_handler); + + instrumented_in_executable(); + // CHECK: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=1 + + a(); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: a called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1]], fn=1, type=1 + + // Make sure this object ID does not appear again + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ1]] + + b(); // b calls c + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: b called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: c called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ3]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ2]] + + // Now check explicit loading with RTLD_LOCAL + + void *dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_LOCAL); + void *dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_LOCAL); + // CHECK-NOT: failed to load shared library + + find_and_call(dso_handle_d, "_Z1dv"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ4:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: d called + // CHECK-NEXT: called: 
{{[0-9]+}}, object=[[OBJ4]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ4]] + + find_and_call(dso_handle_e, "_Z1ev"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: e called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: f called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ6]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ5]] + + // Unload DSOs + dlclose(dso_handle_d); + dlclose(dso_handle_e); + + // Repeat test with RTLD_GLOBAL + dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_GLOBAL); + dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_GLOBAL); + // CHECK-NOT: failed to load shared library + + find_and_call(dso_handle_d, "_Z1dv"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: d called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ7]] + + find_and_call(dso_handle_e, "_Z1ev"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: e called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: f called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ9]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ8]] + + auto status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 + + dlclose(dso_handle_d); + dlclose(dso_handle_e); +} + +//--- libgenmacro.inc +#include +// Helper macros to quickly generate libraries containing a single function. 
+#define GENERATE_LIB(NAME) \ + [[clang::xray_always_instrument]] void NAME() { printf(#NAME " called\n"); } + +#define GENERATE_LIB_WITH_CALL(NAME, FN) \ + extern void FN(); \ + [[clang::xray_always_instrument]] void NAME() { \ + printf(#NAME " called\n"); \ + FN(); \ + } + +//--- testliba.cpp +#include "libgenmacro.inc" +GENERATE_LIB(a) + +//--- testlibb.cpp +#include "libgenmacro.inc" +GENERATE_LIB_WITH_CALL(b, c) + +//--- testlibc.cpp +#include "libgenmacro.inc" +GENERATE_LIB(c) + +//--- testlibd.cpp +#include "libgenmacro.inc" +GENERATE_LIB(d) + +//--- testlibe.cpp +#include "libgenmacro.inc" +GENERATE_LIB_WITH_CALL(e, f) + +//--- testlibf.cpp +#include "libgenmacro.inc" +GENERATE_LIB(f) diff --git a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp new file mode 100644 index 00000000000000..0708d0383439d0 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp @@ -0,0 +1,45 @@ +// Checking that DSOs are automatically patched upon load, if patch_premain is passed. 
+ +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=true,verbosity=1" %run %t/main.o 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +extern void instrumented_in_dso(); + +int main() { + __xray_set_handler(test_handler); + instrumented_in_executable(); + // CHECK: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp new file mode 100644 index 00000000000000..d3e992dd497725 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp @@ -0,0 +1,75 @@ +// Check that we can patch and un-patch on demand, and that logging gets invoked +// appropriately. 
+// + +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=false" %run %t/main.o 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include + +bool called = false; + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); + called = true; +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +extern void instrumented_in_dso(); + +int main() { + __xray_set_handler(test_handler); + instrumented_in_executable(); + // CHECK: instrumented_in_executable called + instrumented_in_dso(); + // CHECK: instrumented_in_dso called + auto status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + instrumented_in_executable(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + status = __xray_unpatch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + __xray_remove_handler(); + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + status = 
__xray_unpatch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} From 42ec740d0347a89b656c9be5ac4a7e4d8bcd30d5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 11:36:55 +0200 Subject: [PATCH 144/177] [clang][ExprConstant] Remove an outdated TODO comment (#111959) Seems like passing the quantities directly seems to work fine. --- clang/lib/AST/ExprConstant.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4d5af96093cfeb..06e653f96d6de1 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9768,11 +9768,8 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, if (BaseAlignment < Align) { Result.Designator.setInvalid(); - // FIXME: Add support to Diagnostic for long / long long. - CCEDiag(E->getArg(0), - diag::note_constexpr_baa_insufficient_alignment) << 0 - << (unsigned)BaseAlignment.getQuantity() - << (unsigned)Align.getQuantity(); + CCEDiag(E->getArg(0), diag::note_constexpr_baa_insufficient_alignment) + << 0 << BaseAlignment.getQuantity() << Align.getQuantity(); return false; } } @@ -9783,11 +9780,11 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, (OffsetResult.Base ? 
CCEDiag(E->getArg(0), - diag::note_constexpr_baa_insufficient_alignment) << 1 + diag::note_constexpr_baa_insufficient_alignment) + << 1 : CCEDiag(E->getArg(0), diag::note_constexpr_baa_value_insufficient_alignment)) - << (int)OffsetResult.Offset.getQuantity() - << (unsigned)Align.getQuantity(); + << OffsetResult.Offset.getQuantity() << Align.getQuantity(); return false; } From 7b0d56be1d002e9cf0d8dda8ecaee99c5dbc88cf Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 11 Oct 2024 11:40:27 +0200 Subject: [PATCH 145/177] AMDGPU/GlobalISel: Fix inst-selection of ballot (#109986) Both input and output of ballot are lane-masks: result is lane-mask with 'S32/S64 LLT and SGPR bank' input is lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in result for inactive lanes for non-constant input. --- llvm/docs/AMDGPUUsage.rst | 6 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 101 +++++++++++++----- .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 90 +++++++++++++++- .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 58 +++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 77 ++++++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 47 ++++++++ .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 20 ++-- 8 files changed, 360 insertions(+), 41 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6ff3272422fe95..aba39762861dd8 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics. sign-extended from the width of the underlying PC hardware register even on processors where the s_getpc_b64 instruction returns a zero-extended value. + llvm.amdgcn.ballot Returns a bitfield(i32 or i64) containing the result of its i1 argument + in all active lanes, and zero in all inactive lanes. 
+ Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield + used by hardware to control active lanes when used in EXEC register. + For example, ballot(i1 true) return EXEC mask. + ============================================== ========================================================== .. TODO:: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2738eb77b675ab..715f2cc917e21c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp : [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// Returns a bitfield(i32 or i64) containing the result of its i1 argument +// in all active lanes, and zero in all inactive lanes. def int_amdgcn_ballot : Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5be0a049cc5827..53628981e12409 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1413,50 +1413,101 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { return true; } +// Ballot has to zero bits in input lane-mask that are zero in current exec, +// Done as AND with exec. For inputs that are results of instruction that +// implicitly use same exec, for example compares in same basic block or SCC to +// VCC copy, use copy. +static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, + MachineBasicBlock *MBB) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getParent() != MBB) + return false; + + // Lane mask generated by SCC to VCC copy. 
+ if (MI->getOpcode() == AMDGPU::COPY) { + auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg()); + auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg()); + if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID && + SrcRB->getID() == AMDGPU::SGPRRegBankID) + return true; + } + + // Lane mask generated using compare with same exec. + if (isa(MI)) + return true; + + Register LHS, RHS; + // Look through AND. + if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS)))) + return isLaneMaskFromSameBlock(LHS, MRI, MBB) || + isLaneMaskFromSameBlock(RHS, MRI, MBB); + + return false; +} + bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); Register DstReg = I.getOperand(0).getReg(); - const unsigned Size = MRI->getType(DstReg).getSizeInBits(); - const bool Is64 = Size == 64; - const bool IsWave32 = (STI.getWavefrontSize() == 32); + Register SrcReg = I.getOperand(2).getReg(); + const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits(); + const unsigned WaveSize = STI.getWavefrontSize(); // In the common case, the return type matches the wave size. // However we also support emitting i64 ballots in wave32 mode. - if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) + if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32)) return false; std::optional Arg = - getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); + getIConstantVRegValWithLookThrough(SrcReg, *MRI); + + Register Dst = DstReg; + // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot. 
+ if (BallotSize != WaveSize) { + Dst = MRI->createVirtualRegister(TRI.getBoolRC()); + } - const auto BuildCopy = [&](Register SrcReg) { - if (Size == STI.getWavefrontSize()) { - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(SrcReg); - return; + if (Arg) { + const int64_t Value = Arg->Value.getZExtValue(); + if (Value == 0) { + // Dst = S_MOV 0 + unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0); + } else { + // Dst = COPY EXEC + assert(Value == 1); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec()); } + if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) + return false; + } else { + if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) { + // Dst = COPY SrcReg + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg); + if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) + return false; + } else { + // Dst = S_AND SrcReg, EXEC + unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; + auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst) + .addReg(SrcReg) + .addReg(TRI.getExec()) + .setOperandDead(3); // Dead scc + if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) + return false; + } + } - // If emitting a i64 ballot in wave32, fill the upper bits with zeroes. + // i64 ballot on Wave32: zero-extend i32 ballot to i64. + if (BallotSize != WaveSize) { Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(SrcReg) + .addReg(Dst) .addImm(AMDGPU::sub0) .addReg(HiReg) .addImm(AMDGPU::sub1); - }; - - if (Arg) { - const int64_t Value = Arg->Value.getSExtValue(); - if (Value == 0) { - unsigned Opcode = Is64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; - BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); - } else if (Value == -1) // all ones - BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); - else - return false; - } else - BuildCopy(I.getOperand(2).getReg()); + } I.eraseFromParent(); return true; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 96cab200b61cdb..2edcf23df411df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) @@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_eq_u32 
s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -137,7 +139,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -419,3 +422,80 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: non_cst_non_compare_input: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: non_cst_non_compare_input: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: 
v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11-NEXT: ; %bb.1: ; %B +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX11-NEXT: ; %bb.3: ; %A +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.4: ; %exit +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) + store i32 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index a18f843440445c..0bbb40b8db43ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) @@ -92,7 +93,8 @@ define amdgpu_cs i32 
@branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -140,7 +142,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -422,3 +425,52 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; CHECK-LABEL: non_cst_non_compare_input: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: ; %bb.1: ; %B +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; %bb.2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; CHECK-NEXT: ; %bb.3: ; %A +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; %bb.4: ; %exit +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: v_mov_b32_e32 
v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi) + store i64 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index 047b35b8c0f9d8..026a8d7da7080b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -522,3 +522,76 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. 
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: non_cst_non_compare_input: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: ; implicit-def: $sgpr0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: non_cst_non_compare_input: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: ; implicit-def: $sgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11-NEXT: ; %bb.1: ; %B +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX11-NEXT: ; %bb.3: ; %A +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.4: ; %exit +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) + store i32 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 61f0f20f057043..c7597e98a6d583 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -511,3 +511,50 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; CHECK-LABEL: non_cst_non_compare_input: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: ; %bb.1: ; %B +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; %bb.2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; CHECK-NEXT: ; %bb.3: ; %A +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; %bb.4: ; %exit +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, 
label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi) + store i64 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 5dbfdf24ef36f7..fe69dc49062435 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -40,12 +40,20 @@ define amdgpu_cs i64 @constant_true() { ; Test ballot of a non-comparison operation define amdgpu_cs i64 @non_compare(i32 %x) { -; CHECK-LABEL: non_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; CHECK-NEXT: ; return to shader part epilog +; DAGISEL-LABEL: non_compare: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; DAGISEL-NEXT: s_mov_b32 s1, 0 +; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; DAGISEL-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: non_compare: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_mov_b32 s1, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GISEL-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GISEL-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) ret i64 %ballot From 777142937a599d8a9cea5964b415d9cd13016d79 Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Fri, 11 Oct 2024 11:45:25 +0200 Subject: [PATCH 146/177] [mlir][EmitC] Fail on memrefs with 0 dims in type conversion (#111965) This let's the type conversion fail instead of generating invalid array types. 
--- .../Conversion/MemRefToEmitC/MemRefToEmitC.cpp | 4 +++- .../MemRefToEmitC/memref-to-emitc-failed.mlir | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp index 2b7ac4b529cf0d..39532d34f616eb 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp @@ -167,7 +167,9 @@ void mlir::populateMemRefToEmitCTypeConversion(TypeConverter &typeConverter) { typeConverter.addConversion( [&](MemRefType memRefType) -> std::optional { if (!memRefType.hasStaticShape() || - !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0) { + !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0 || + llvm::any_of(memRefType.getShape(), + [](int64_t dim) { return dim == 0; })) { return {}; } Type convertedElementType = diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir index dee9cc97a14493..fda01974d3fc85 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir @@ -41,6 +41,22 @@ func.func @zero_rank() { // ----- +func.func @zero_dim_rank_1() { + // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} + %0 = memref.alloca() : memref<0xf32> + return +} + +// ----- + +func.func @zero_dim_rank_3() { + // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} + %0 = memref.alloca() : memref<2x0x4xf32> + return +} + +// ----- + // expected-error@+1 {{failed to legalize operation 'memref.global'}} memref.global "nested" constant @nested_global : memref<3x7xf32> From 80c15c48d1fbb53478c9400e598abcbdcae0d962 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 11:46:33 +0200 Subject: [PATCH 147/177] [clang][bytecode] Implement __builtin_assume_aligned (#111968) 
--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 71 ++++++++++++++++++- clang/lib/AST/ExprConstShared.h | 8 +++ clang/lib/AST/ExprConstant.cpp | 35 +++++---- clang/test/Sema/builtin-assume-aligned.c | 2 + clang/test/SemaCXX/builtin-assume-aligned.cpp | 1 + 5 files changed, 98 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 74e9e1cf629372..ec27aebf84bd80 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -38,7 +38,6 @@ static T getParam(const InterpFrame *Frame, unsigned Index) { return Frame->getParam(Offset); } -// static APSInt getAPSIntParam(InterpStack &Stk, size_t Offset = 0) { static APSInt getAPSIntParam(const InterpFrame *Frame, unsigned Index) { APSInt R; unsigned Offset = Frame->getFunction()->getParamOffset(Index); @@ -1162,6 +1161,71 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, return false; } +/// __builtin_assume_aligned(Ptr, Alignment[, ExtraOffset]) +static bool interp__builtin_assume_aligned(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + assert(Call->getNumArgs() == 2 || Call->getNumArgs() == 3); + + // Might be called with function pointers in C. 
+ std::optional PtrT = S.Ctx.classify(Call->getArg(0)); + if (PtrT != PT_Ptr) + return false; + + unsigned ArgSize = callArgSize(S, Call); + const Pointer &Ptr = S.Stk.peek(ArgSize); + std::optional ExtraOffset; + APSInt Alignment; + if (Call->getNumArgs() == 2) { + Alignment = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(1))); + } else { + PrimType AlignmentT = *S.Ctx.classify(Call->getArg(1)); + PrimType ExtraOffsetT = *S.Ctx.classify(Call->getArg(2)); + Alignment = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(1)), + align(primSize(AlignmentT)) + + align(primSize(ExtraOffsetT))); + ExtraOffset = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(2))); + } + + CharUnits Align = CharUnits::fromQuantity(Alignment.getZExtValue()); + + // If there is a base object, then it must have the correct alignment. + if (Ptr.isBlockPointer()) { + CharUnits BaseAlignment; + if (const auto *VD = Ptr.getDeclDesc()->asValueDecl()) + BaseAlignment = S.getASTContext().getDeclAlign(VD); + else if (const auto *E = Ptr.getDeclDesc()->asExpr()) + BaseAlignment = GetAlignOfExpr(S.getASTContext(), E, UETT_AlignOf); + + if (BaseAlignment < Align) { + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_insufficient_alignment) + << 0 << BaseAlignment.getQuantity() << Align.getQuantity(); + return false; + } + } + + APValue AV = Ptr.toAPValue(S.getASTContext()); + CharUnits AVOffset = AV.getLValueOffset(); + if (ExtraOffset) + AVOffset -= CharUnits::fromQuantity(ExtraOffset->getZExtValue()); + if (AVOffset.alignTo(Align) != AVOffset) { + if (Ptr.isBlockPointer()) + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_insufficient_alignment) + << 1 << AVOffset.getQuantity() << Align.getQuantity(); + else + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_value_insufficient_alignment) + << AVOffset.getQuantity() << Align.getQuantity(); + return false; + } + + S.Stk.push(Ptr); + return true; +} + static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, const InterpFrame 
*Frame, const Function *Func, @@ -1905,6 +1969,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_assume_aligned: + if (!interp__builtin_assume_aligned(S, OpPC, Frame, F, Call)) + return false; + break; + case clang::X86::BI__builtin_ia32_bextr_u32: case clang::X86::BI__builtin_ia32_bextr_u64: case clang::X86::BI__builtin_ia32_bextri_u32: diff --git a/clang/lib/AST/ExprConstShared.h b/clang/lib/AST/ExprConstShared.h index efe8ee986d29b3..401ae629c86bfd 100644 --- a/clang/lib/AST/ExprConstShared.h +++ b/clang/lib/AST/ExprConstShared.h @@ -14,12 +14,17 @@ #ifndef LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H #define LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H +#include "clang/Basic/TypeTraits.h" + namespace llvm { class APFloat; } namespace clang { class QualType; class LangOptions; +class ASTContext; +class CharUnits; +class Expr; } // namespace clang using namespace clang; /// Values returned by __builtin_classify_type, chosen to match the values @@ -66,4 +71,7 @@ void HandleComplexComplexDiv(llvm::APFloat A, llvm::APFloat B, llvm::APFloat C, llvm::APFloat D, llvm::APFloat &ResR, llvm::APFloat &ResI); +CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, + UnaryExprOrTypeTrait ExprKind); + #endif diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 06e653f96d6de1..70b223596d8b9b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9620,7 +9620,7 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) { return ExprEvaluatorBaseTy::VisitCastExpr(E); } -static CharUnits GetAlignOfType(EvalInfo &Info, QualType T, +static CharUnits GetAlignOfType(const ASTContext &Ctx, QualType T, UnaryExprOrTypeTrait ExprKind) { // C++ [expr.alignof]p3: // When alignof is applied to a reference type, the result is the @@ -9631,23 +9631,22 @@ static CharUnits GetAlignOfType(EvalInfo &Info, QualType T, return CharUnits::One(); const bool 
AlignOfReturnsPreferred = - Info.Ctx.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver7; + Ctx.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver7; // __alignof is defined to return the preferred alignment. // Before 8, clang returned the preferred alignment for alignof and _Alignof // as well. if (ExprKind == UETT_PreferredAlignOf || AlignOfReturnsPreferred) - return Info.Ctx.toCharUnitsFromBits( - Info.Ctx.getPreferredTypeAlign(T.getTypePtr())); + return Ctx.toCharUnitsFromBits(Ctx.getPreferredTypeAlign(T.getTypePtr())); // alignof and _Alignof are defined to return the ABI alignment. else if (ExprKind == UETT_AlignOf) - return Info.Ctx.getTypeAlignInChars(T.getTypePtr()); + return Ctx.getTypeAlignInChars(T.getTypePtr()); else llvm_unreachable("GetAlignOfType on a non-alignment ExprKind"); } -static CharUnits GetAlignOfExpr(EvalInfo &Info, const Expr *E, - UnaryExprOrTypeTrait ExprKind) { +CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, + UnaryExprOrTypeTrait ExprKind) { E = E->IgnoreParens(); // The kinds of expressions that we have special-case logic here for @@ -9657,22 +9656,22 @@ static CharUnits GetAlignOfExpr(EvalInfo &Info, const Expr *E, // alignof decl is always accepted, even if it doesn't make sense: we default // to 1 in those cases. 
if (const DeclRefExpr *DRE = dyn_cast(E)) - return Info.Ctx.getDeclAlign(DRE->getDecl(), - /*RefAsPointee*/true); + return Ctx.getDeclAlign(DRE->getDecl(), + /*RefAsPointee*/ true); if (const MemberExpr *ME = dyn_cast(E)) - return Info.Ctx.getDeclAlign(ME->getMemberDecl(), - /*RefAsPointee*/true); + return Ctx.getDeclAlign(ME->getMemberDecl(), + /*RefAsPointee*/ true); - return GetAlignOfType(Info, E->getType(), ExprKind); + return GetAlignOfType(Ctx, E->getType(), ExprKind); } static CharUnits getBaseAlignment(EvalInfo &Info, const LValue &Value) { if (const auto *VD = Value.Base.dyn_cast()) return Info.Ctx.getDeclAlign(VD); if (const auto *E = Value.Base.dyn_cast()) - return GetAlignOfExpr(Info, E, UETT_AlignOf); - return GetAlignOfType(Info, Value.Base.getTypeInfoType(), UETT_AlignOf); + return GetAlignOfExpr(Info.Ctx, E, UETT_AlignOf); + return GetAlignOfType(Info.Ctx, Value.Base.getTypeInfoType(), UETT_AlignOf); } /// Evaluate the value of the alignment argument to __builtin_align_{up,down}, @@ -14475,11 +14474,11 @@ bool IntExprEvaluator::VisitUnaryExprOrTypeTraitExpr( case UETT_PreferredAlignOf: case UETT_AlignOf: { if (E->isArgumentType()) - return Success(GetAlignOfType(Info, E->getArgumentType(), E->getKind()), - E); + return Success( + GetAlignOfType(Info.Ctx, E->getArgumentType(), E->getKind()), E); else - return Success(GetAlignOfExpr(Info, E->getArgumentExpr(), E->getKind()), - E); + return Success( + GetAlignOfExpr(Info.Ctx, E->getArgumentExpr(), E->getKind()), E); } case UETT_PtrAuthTypeDiscriminator: { diff --git a/clang/test/Sema/builtin-assume-aligned.c b/clang/test/Sema/builtin-assume-aligned.c index c2e4f9d659dd4d..33e85578451529 100644 --- a/clang/test/Sema/builtin-assume-aligned.c +++ b/clang/test/Sema/builtin-assume-aligned.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -DSIZE_T_64 -fsyntax-only -Wno-strict-prototypes -triple x86_64-linux -verify %s // RUN: %clang_cc1 -fsyntax-only -Wno-strict-prototypes -triple i386-freebsd -verify %s +// RUN: 
%clang_cc1 -DSIZE_T_64 -fsyntax-only -Wno-strict-prototypes -triple x86_64-linux -verify %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -Wno-strict-prototypes -triple i386-freebsd -verify %s -fexperimental-new-constant-interpreter // __builtin_assume_aligned's second parameter is size_t, which may be 32 bits, // so test differently when size_t is 32 bits and when it is 64 bits. diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index 48bd8414fc50a1..85a7faee916181 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s -fexperimental-new-constant-interpreter int n; constexpr int *p = 0; From 73ad416ebf9d11b876f22ede0ee90f660192869f Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Fri, 11 Oct 2024 11:53:28 +0200 Subject: [PATCH 148/177] [OpenMP][Flang] Enable alias analysis inside omp target region (#111670) At present, alias analysis does not work for operations inside OMP target regions because the FIR declare operations within OMP target do not offer sufficient information for alias analysis. Consequently, it is necessary to examine the FIR code outside the OMP target region. 
--- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 29 ++++++ flang/lib/Optimizer/Analysis/CMakeLists.txt | 2 + .../alias-analysis-omp-target-1.fir | 66 +++++++++++++ .../alias-analysis-omp-target-2.fir | 96 +++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index e88da5a8ebae19..6ee4f0ff71057a 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -13,6 +13,8 @@ #include "flang/Optimizer/Dialect/FortranVariableInterface.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Analysis/AliasAnalysis.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Value.h" #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -296,6 +298,17 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); return; } + // If load is inside target and it points to mapped item, + // continue tracking. + Operation *loadMemrefOp = op.getMemref().getDefiningOp(); + bool isDeclareOp = llvm::isa(loadMemrefOp) || + llvm::isa(loadMemrefOp); + if (isDeclareOp && + llvm::isa(loadMemrefOp->getParentOp())) { + v = op.getMemref(); + defOp = v.getDefiningOp(); + return; + } // No further tracking for addresses loaded from memory for now. 
type = SourceKind::Indirect; breakFromLoop = true; @@ -319,6 +332,22 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case([&](auto op) { + // If declare operation is inside omp target region, + // continue alias analysis outside the target region + if (auto targetOp = + llvm::dyn_cast(op->getParentOp())) { + auto argIface = cast(*targetOp); + for (auto [opArg, blockArg] : llvm::zip_equal( + targetOp.getMapVars(), argIface.getMapBlockArgs())) { + if (blockArg == op.getMemref()) { + omp::MapInfoOp mapInfo = + llvm::cast(opArg.getDefiningOp()); + v = mapInfo.getVarPtr(); + defOp = v.getDefiningOp(); + return; + } + } + } auto varIf = llvm::cast(defOp); // While going through a declare operation collect // the variable attributes from it. Right now, some diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt index 436d4d3f18969c..c000a9da99f871 100644 --- a/flang/lib/Optimizer/Analysis/CMakeLists.txt +++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt @@ -6,6 +6,7 @@ add_flang_library(FIRAnalysis FIRDialect HLFIRDialect MLIRIR + MLIROpenMPDialect LINK_LIBS FIRBuilder @@ -14,5 +15,6 @@ add_flang_library(FIRAnalysis MLIRFuncDialect MLIRLLVMDialect MLIRMathTransforms + MLIROpenMPDialect FIRSupport ) diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir new file mode 100644 index 00000000000000..88f411847172a0 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir @@ -0,0 +1,66 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. 
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran source code: +// +// program TestAllocatableArray +// real(kind=8), allocatable :: A(:) +// real(kind=8), allocatable :: B(:) +// !$omp target +// A(0) = B(0) +// !$omp end target +// end TestAllocatableArray + +// CHECK-LABEL: Testing : "_QPTestAllocatableArray" +// CHECK-DAG: targetArrayB#0 <-> targetArrayA#0: NoAlias +func.func @_QPTestAllocatableArray() { + %0 = fir.address_of(@_QFEa) : !fir.ref>>> + %1:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "ArrayA" } : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.address_of(@_QFEb) : !fir.ref>>> + %3:2 = hlfir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "ArrayB" } : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %4 = fir.load %1#0 : !fir.ref>>> + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %5 = fir.load %1#1 : !fir.ref>>> + %c0_0 = arith.constant 0 : index + %6:3 = fir.box_dims %5, %c0_0 : (!fir.box>>, index) -> (index, index, index) + %7:3 = fir.box_dims %4, %c0 : (!fir.box>>, index) -> (index, index, index) + %c0_1 = arith.constant 0 : index + %8 = arith.subi %7#1, %c1 : index + %9 = omp.map.bounds lower_bound(%c0_1 : index) upper_bound(%8 : index) extent(%7#1 : index) stride(%7#2 : index) start_idx(%6#0 : index) {stride_in_bytes = true} + %10 = fir.box_offset %1#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + %11 = omp.map.info var_ptr(%1#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%10 : !fir.llvm_ptr>>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%9) -> !fir.llvm_ptr>> {name = ""} + %12 = omp.map.info var_ptr(%1#1 : !fir.ref>>>, !fir.box>>) map_clauses(implicit, tofrom) capture(ByRef) members(%11 : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "a"} + %13 = fir.load %3#0 : !fir.ref>>> + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %14 = fir.load %3#1 : 
!fir.ref>>> + %c0_4 = arith.constant 0 : index + %15:3 = fir.box_dims %14, %c0_4 : (!fir.box>>, index) -> (index, index, index) + %16:3 = fir.box_dims %13, %c0_3 : (!fir.box>>, index) -> (index, index, index) + %c0_5 = arith.constant 0 : index + %17 = arith.subi %16#1, %c1_2 : index + %18 = omp.map.bounds lower_bound(%c0_5 : index) upper_bound(%17 : index) extent(%16#1 : index) stride(%16#2 : index) start_idx(%15#0 : index) {stride_in_bytes = true} + %19 = fir.box_offset %3#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + %20 = omp.map.info var_ptr(%3#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%19 : !fir.llvm_ptr>>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%18) -> !fir.llvm_ptr>> {name = ""} + %21 = omp.map.info var_ptr(%3#1 : !fir.ref>>>, !fir.box>>) map_clauses(implicit, tofrom) capture(ByRef) members(%20 : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "b"} + omp.target map_entries(%11 -> %arg0, %12 -> %arg1, %20 -> %arg2, %21 -> %arg3 : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) { + %22:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %23:2 = hlfir.declare %arg3 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %24 = fir.load %23#0 : !fir.ref>>> + %c0_6 = arith.constant 0 : index + %25 = hlfir.designate %24 (%c0_6) {test.ptr = "targetArrayB"} : (!fir.box>>, index) -> !fir.ref + %26 = fir.load %25 : !fir.ref + %27 = fir.load %22#0 : !fir.ref>>> + %c0_7 = arith.constant 0 : index + %28 = hlfir.designate %27 (%c0_7) {test.ptr = "targetArrayA"} : (!fir.box>>, index) -> !fir.ref + hlfir.assign %26 to %28 : f64, !fir.ref + omp.terminator + } + return +} +fir.global internal @_QFEa : !fir.box>> { +} +fir.global internal @_QFEb : !fir.box>> { +} diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir new file mode 100644 index 
00000000000000..c6b2e29a7188a9 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir @@ -0,0 +1,96 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. +// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran source code: +// +// subroutine TestTargetData(p, a, b) +// real :: p(10), a(10), b(10) +// !$omp target data map(from: p) +// !$omp target map(to: a ) +// p(1) = a(1) +// !$omp end target +// !$omp target map(to: b ) +// p(1) = b(1) +// !$omp end target +// !$omp end target data +// end subroutine + +// CHECK-LABEL: Testing : "_QPTestTargetData" + +// CHECK-DAG: targetArrayA#0 <-> targetArrayP#0: NoAlias +// CHECK-DAG: targetArrayA#0 <-> targetArrayB#0: NoAlias +// CHECK-DAG: targetArrayP#0 <-> targetArrayB#0: NoAlias + +func.func @_QPTestTargetData(%arg0: !fir.ref> {fir.bindc_name = "p"}, %arg1: !fir.ref> {fir.bindc_name = "a"}, %arg2: !fir.ref> {fir.bindc_name = "b"}) { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.shape %c10 : (index) -> !fir.shape<1> + %2:2 = hlfir.declare %arg1(%1) dummy_scope %0 {uniq_name = "_QFtest_target_dataEa"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c10_0 = arith.constant 10 : index + %3 = fir.shape %c10_0 : (index) -> !fir.shape<1> + %4:2 = hlfir.declare %arg2(%3) dummy_scope %0 {uniq_name = "_QFtest_target_dataEb"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c10_1 = arith.constant 10 : index + %5 = fir.shape %c10_1 : (index) -> !fir.shape<1> + %6:2 = hlfir.declare %arg0(%5) dummy_scope %0 {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %7 = arith.subi %c10_1, %c1 : index + %8 = omp.map.bounds lower_bound(%c0 : index) 
upper_bound(%7 : index) extent(%c10_1 : index) stride(%c1 : index) start_idx(%c1 : index) + %9 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(from) capture(ByRef) bounds(%8) -> !fir.ref> {name = "p"} + omp.target_data map_entries(%9 : !fir.ref>) { + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %10 = arith.subi %c10, %c1_2 : index + %11 = omp.map.bounds lower_bound(%c0_3 : index) upper_bound(%10 : index) extent(%c10 : index) stride(%c1_2 : index) start_idx(%c1_2 : index) + %12 = omp.map.info var_ptr(%2#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(to) capture(ByRef) bounds(%11) -> !fir.ref> {name = "a"} + %c1_4 = arith.constant 1 : index + %c0_5 = arith.constant 0 : index + %13 = arith.subi %c10_1, %c1_4 : index + %14 = omp.map.bounds lower_bound(%c0_5 : index) upper_bound(%13 : index) extent(%c10_1 : index) stride(%c1_4 : index) start_idx(%c1_4 : index) + %15 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%14) -> !fir.ref> {name = "p"} + omp.target map_entries(%12 -> %arg3, %15 -> %arg4 : !fir.ref>, !fir.ref>) { + %c10_10 = arith.constant 10 : index + %22 = fir.shape %c10_10 : (index) -> !fir.shape<1> + %23:2 = hlfir.declare %arg3(%22) {uniq_name = "_QFtest_target_dataEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c10_11 = arith.constant 10 : index + %24 = fir.shape %c10_11 : (index) -> !fir.shape<1> + %25:2 = hlfir.declare %arg4(%24) {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_12 = arith.constant 1 : index + %26 = hlfir.designate %23#0 (%c1_12) {test.ptr = "targetArrayA"} : (!fir.ref>, index) -> !fir.ref + %27 = fir.load %26 : !fir.ref + %c1_13 = arith.constant 1 : index + %28 = hlfir.designate %25#0 (%c1_13) {test.ptr = "targetArrayP"} : (!fir.ref>, index) -> !fir.ref + hlfir.assign %27 to %28 : f32, !fir.ref + omp.terminator + } + %c1_6 = arith.constant 1 : index + %c0_7 = 
arith.constant 0 : index + %16 = arith.subi %c10_0, %c1_6 : index + %17 = omp.map.bounds lower_bound(%c0_7 : index) upper_bound(%16 : index) extent(%c10_0 : index) stride(%c1_6 : index) start_idx(%c1_6 : index) + %18 = omp.map.info var_ptr(%4#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(to) capture(ByRef) bounds(%17) -> !fir.ref> {name = "b"} + %c1_8 = arith.constant 1 : index + %c0_9 = arith.constant 0 : index + %19 = arith.subi %c10_1, %c1_8 : index + %20 = omp.map.bounds lower_bound(%c0_9 : index) upper_bound(%19 : index) extent(%c10_1 : index) stride(%c1_8 : index) start_idx(%c1_8 : index) + %21 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%20) -> !fir.ref> {name = "p"} + omp.target map_entries(%18 -> %arg3, %21 -> %arg4 : !fir.ref>, !fir.ref>) { + %c10_10 = arith.constant 10 : index + %22 = fir.shape %c10_10 : (index) -> !fir.shape<1> + %23:2 = hlfir.declare %arg3(%22) {uniq_name = "_QFtest_target_dataEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c10_11 = arith.constant 10 : index + %24 = fir.shape %c10_11 : (index) -> !fir.shape<1> + %25:2 = hlfir.declare %arg4(%24) {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_12 = arith.constant 1 : index + %26 = hlfir.designate %23#0 (%c1_12) {test.ptr = "targetArrayB"} : (!fir.ref>, index) -> !fir.ref + %27 = fir.load %26 : !fir.ref + %c1_13 = arith.constant 1 : index + %28 = hlfir.designate %25#0 (%c1_13) {test.ptr = "targetArrayP"} : (!fir.ref>, index) -> !fir.ref + hlfir.assign %27 to %28 : f32, !fir.ref + omp.terminator + } + omp.terminator + } + return +} + From f74f568b29885c3fa63c44e33f91f3bb7281138e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 11 Oct 2024 11:58:14 +0200 Subject: [PATCH 149/177] [clang][analyzer] PointerSubChecker should not warn on pointers converted to numerical type (#111846) Pointer values casted to integer (non-pointer) 
type should be able to be subtracted as usual. --- .../StaticAnalyzer/Checkers/PointerSubChecker.cpp | 4 ++++ clang/test/Analysis/pointer-sub.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp index f0dc5efd75f7d6..7a85d9e2073068 100644 --- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp @@ -61,6 +61,10 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B, if (LR->getSymbolicBase() || RR->getSymbolicBase()) return; + if (!B->getLHS()->getType()->isPointerType() || + !B->getRHS()->getType()->isPointerType()) + return; + const auto *ElemLR = dyn_cast(LR); const auto *ElemRR = dyn_cast(RR); diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c index 1c9d676ebb8f24..25fb7f043d468c 100644 --- a/clang/test/Analysis/pointer-sub.c +++ b/clang/test/Analysis/pointer-sub.c @@ -1,5 +1,7 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text-minimal -verify %s +typedef int * Ptr; + void f1(void) { int x, y, z[10]; int d = &y - &x; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} @@ -10,6 +12,12 @@ void f1(void) { d = &x - (&x + 1); // no-warning d = (&x + 0) - &x; // no-warning d = (z + 10) - z; // no-warning + d = (long long)&y - (long long)&x; // no-warning + long long l = 1; + d = l - (long long)&y; // no-warning + Ptr p1 = &x; + Ptr p2 = &y; + d = p1 - p2; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} } void f2(void) { @@ -28,6 +36,10 @@ void f2(void) { d = (int *)((char *)(&a[4]) + sizeof(int)) - &a[4]; // no-warning (pointers into the same array data) d = (int *)((char *)(&a[4]) + 1) - &a[4]; // expected-warning{{Subtraction of two pointers that}} + + long long a1 = (long 
long)&a[1]; + long long b1 = (long long)&b[1]; + d = a1 - b1; } void f3(void) { From 6a65e98fa7901dc1de91172d065fafb16ce89d77 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 18:19:21 +0800 Subject: [PATCH 150/177] [InstCombine] Drop range attributes in `foldIsPowerOf2` (#111946) Fixes https://github.com/llvm/llvm-project/issues/111934. --- .../InstCombine/InstCombineAndOrXor.cpp | 18 ++++++++--- llvm/test/Transforms/InstCombine/ispow2.ll | 32 +++++++++++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 688601a8ffa543..964616a4eb35e2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -955,9 +955,11 @@ static Value *foldIsPowerOf2OrZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd, } /// Reduce a pair of compares that check if a value has exactly 1 bit set. -/// Also used for logical and/or, must be poison safe. +/// Also used for logical and/or, must be poison safe if range attributes are +/// dropped. static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + InstCombinerImpl &IC) { // Handle 'and' / 'or' commutation: make the equality check the first operand. if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE) std::swap(Cmp0, Cmp1); @@ -971,7 +973,10 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_ULT, m_Intrinsic(m_Specific(X)), m_SpecificInt(2)))) { - Value *CtPop = Cmp1->getOperand(0); + auto *CtPop = cast(Cmp1->getOperand(0)); + // Drop range attributes and re-infer them in the next iteration. 
+ CtPop->dropPoisonGeneratingAnnotations(); + IC.addToWorklist(CtPop); return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1)); } // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1 @@ -980,7 +985,10 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_UGT, m_Intrinsic(m_Specific(X)), m_SpecificInt(1)))) { - Value *CtPop = Cmp1->getOperand(0); + auto *CtPop = cast(Cmp1->getOperand(0)); + // Drop range attributes and re-infer them in the next iteration. + CtPop->dropPoisonGeneratingAnnotations(); + IC.addToWorklist(CtPop); return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1)); } return nullptr; @@ -3375,7 +3383,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldSignedTruncationCheck(LHS, RHS, I, Builder)) return V; - if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder)) + if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder, *this)) return V; if (Value *V = foldPowerOf2AndShiftedMask(LHS, RHS, IsAnd, Builder)) diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll index c21ad95f83a1c4..832c066370b0f8 100644 --- a/llvm/test/Transforms/InstCombine/ispow2.ll +++ b/llvm/test/Transforms/InstCombine/ispow2.ll @@ -1522,3 +1522,35 @@ define <2 x i1> @not_pow2_or_z_known_bits_fail_wrong_cmp(<2 x i32> %xin) { %r = icmp ugt <2 x i32> %cnt, ret <2 x i1> %r } + +; Make sure that range attributes on return values are dropped after merging these two icmps + +define i1 @has_single_bit(i32 %x) { +; CHECK-LABEL: @has_single_bit( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[POPCNT:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = icmp eq i32 [[POPCNT]], 1 +; CHECK-NEXT: ret i1 [[SEL]] +; +entry: + %cmp1 = icmp ne i32 %x, 0 + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp ult i32 %popcnt, 2 + %sel = select i1 %cmp1, i1 
%cmp2, i1 false + ret i1 %sel +} + +define i1 @has_single_bit_inv(i32 %x) { +; CHECK-LABEL: @has_single_bit_inv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[POPCNT:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = icmp ne i32 [[POPCNT]], 1 +; CHECK-NEXT: ret i1 [[SEL]] +; +entry: + %cmp1 = icmp eq i32 %x, 0 + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp ugt i32 %popcnt, 1 + %sel = select i1 %cmp1, i1 true, i1 %cmp2 + ret i1 %sel +} From 65da32c634a8345fcbe021f69fc6a609d074c08c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 11 Oct 2024 11:26:57 +0100 Subject: [PATCH 151/177] [LV] Account for any-of reduction when computing costs of blend phis. Any-of reductions are narrowed to i1. Update the legacy cost model to use the correct type when computing the cost of a phi that gets lowered to selects (BLEND). This fixes a divergence between legacy and VPlan-based cost models after 36fc291b6ec6d. Fixes https://github.com/llvm/llvm-project/issues/111874. --- .../Transforms/Vectorize/LoopVectorize.cpp | 24 ++- .../RISCV/blend-any-of-reduction-cost.ll | 167 ++++++++++++++++++ 2 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 05dc58a42249ca..54f57fb0b6b58e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6480,12 +6480,32 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. 
- if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { + Type *ResultTy = Phi->getType(); + + // All instructions in an Any-of reduction chain are narrowed to bool. + // Check if that is the case for this phi node. + auto *HeaderUser = cast_if_present( + find_singleton(Phi->users(), [this](User *U, bool) -> User * { + auto *Phi = dyn_cast(U); + if (Phi && Phi->getParent() == TheLoop->getHeader()) + return Phi; + return nullptr; + })); + if (HeaderUser) { + auto &ReductionVars = Legal->getReductionVars(); + auto Iter = ReductionVars.find(HeaderUser); + if (Iter != ReductionVars.end() && + RecurrenceDescriptor::isAnyOfRecurrenceKind( + Iter->second.getRecurrenceKind())) + ResultTy = Type::getInt1Ty(Phi->getContext()); + } return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( - Instruction::Select, ToVectorTy(Phi->getType(), VF), + Instruction::Select, ToVectorTy(ResultTy, VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); + } return TTI.getCFInstrCost(Instruction::PHI, CostKind); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll new file mode 100644 index 00000000000000..7db47cb9171d24 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Test case for https://github.com/llvm/llvm-project/issues/111874. 
+define i32 @any_of_reduction_used_in_blend(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C_0]], label %[[LOOP_LATCH]], label %[[ELSE_1:.*]] +; CHECK: [[ELSE_1]]: +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[ELSE_2:.*]] +; CHECK: [[ELSE_2]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_2]], i32 0, i32 [[ANY_OF_RED]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[ANY_OF_RED]], %[[LOOP_HEADER]] ], [ [[ANY_OF_RED]], %[[ELSE_1]] ], [ [[SEL]], %[[ELSE_2]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %any.of.red = phi i32 [ 0, %entry ], [ %any.of.red.next, %loop.latch ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c.0, label %loop.latch, label %else.1 + +else.1: + br i1 %c.1, label %loop.latch, label %else.2 + +else.2: + %l = load ptr, ptr %src, align 8 + %c.2 = icmp eq ptr %l, null + %sel = select i1 %c.2, i32 0, i32 %any.of.red + br label %loop.latch + +loop.latch: + %any.of.red.next = phi i32 [ %any.of.red, %loop.header ], [ %any.of.red, %else.1 ], [ %sel, %else.2 ] + 
%iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i32 [ %any.of.red.next, %loop.latch ] + ret i32 %res +} + +define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend_with_mutliple_phis( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C_0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i1 [[C_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[SRC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[TMP6:%.*]] = xor [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = xor [[BROADCAST_SPLAT2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP6]], [[TMP7]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2p0.nxv2p0( [[BROADCAST_SPLAT4]], i32 8, [[TMP8]], poison) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_MASKED_GATHER]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or [[VEC_PHI]], [[TMP9]] +; CHECK-NEXT: [[PREDPHI]] = select [[TMP8]], [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[PREDPHI]]) +; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 0, i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C_0]], label %[[X_1:.*]], label %[[ELSE_1:.*]] +; CHECK: [[ELSE_1]]: +; CHECK-NEXT: br i1 [[C_1]], label %[[X_1]], label %[[ELSE_2:.*]] +; CHECK: 
[[ELSE_2]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_2]], i32 0, i32 [[ANY_OF_RED]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[X_1]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[ANY_OF_RED]], %[[LOOP_HEADER]] ], [ [[ANY_OF_RED]], %[[ELSE_1]] ] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[P]], %[[X_1]] ], [ [[SEL]], %[[ELSE_2]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %any.of.red = phi i32 [ 0, %entry ], [ %any.of.red.next, %loop.latch ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c.0, label %x.1, label %else.1 + +else.1: + br i1 %c.1, label %x.1, label %else.2 + +else.2: + %l = load ptr, ptr %src, align 8 + %c.2 = icmp eq ptr %l, null + %sel = select i1 %c.2, i32 0, i32 %any.of.red + br label %loop.latch + +x.1: + %p = phi i32 [ %any.of.red, %loop.header ], [ %any.of.red, %else.1 ] + br label %loop.latch + +loop.latch: + %any.of.red.next = phi i32 [ %p, %x.1 ], [ %sel, %else.2 ] + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i32 [ %any.of.red.next, %loop.latch ] + ret i32 %res +} + +attributes #0 = { "target-cpu"="sifive-p670" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
From d941254da94c8a5897689a74012a57de279c2c9e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 11 Oct 2024 11:00:07 +0000 Subject: [PATCH 152/177] [lldb][test] Fix var name typo in TestProcessSaveCoreMinidump --- .../TestProcessSaveCoreMinidump.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 4818dde4f3b838..808de687e6ea2e 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -117,14 +117,14 @@ def test_save_linux_mini_dump(self): expected_number_of_threads = process.GetNumThreads() expected_threads = [] stacks_to_sp_map = {} - stakcs_to_registers_map = {} + stacks_to_registers_map = {} for thread_idx in range(process.GetNumThreads()): thread = process.GetThreadAtIndex(thread_idx) thread_id = thread.GetThreadID() expected_threads.append(thread_id) stacks_to_sp_map[thread_id] = thread.GetFrameAtIndex(0).GetSP() - stakcs_to_registers_map[thread_id] = thread.GetFrameAtIndex( + stacks_to_registers_map[thread_id] = thread.GetFrameAtIndex( 0 ).GetRegisters() @@ -138,7 +138,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.runCmd(base_command + " --style=modified-memory '%s'" % (core_dirty)) @@ -149,7 +149,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.runCmd(base_command + " --style=full '%s'" % (core_full)) @@ -160,7 +160,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) options = 
lldb.SBSaveCoreOptions() @@ -178,7 +178,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) options = lldb.SBSaveCoreOptions() @@ -195,7 +195,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) # Minidump can now save full core files, but they will be huge and @@ -214,7 +214,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.assertSuccess(process.Kill()) From 4451f9f812d458f6b53785b27869674caf01e67b Mon Sep 17 00:00:00 2001 From: Sebastian Kreutzer Date: Fri, 11 Oct 2024 07:11:03 -0400 Subject: [PATCH 153/177] [XRay] Fix LLVM include in xray_interface.cpp (#111978) Removes a dependency on LLVM in `xray_interface.cpp` by replacing `llvm_unreachable` with compiler-rt's `UNREACHABLE`. Applies clang-format to some unformatted changes. 
Original PR: #90959 --- clang/include/clang/Driver/XRayArgs.h | 4 +-- clang/lib/Driver/XRayArgs.cpp | 8 ++--- compiler-rt/include/xray/xray_interface.h | 40 +++++++++++++---------- compiler-rt/lib/xray/xray_interface.cpp | 5 ++- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index 8fbcf469e5bad1..1b5c4a4c42f12a 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -36,9 +36,7 @@ class XRayArgs { llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } - bool needsXRayDSORt() const { - return XRayInstrument && XRayRT && XRayShared; - } + bool needsXRayDSORt() const { return XRayInstrument && XRayRT && XRayShared; } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 411054e067cb42..d0bb5d4887c184 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,8 +63,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } - if (Args.hasFlag(options::OPT_fxray_shared, - options::OPT_fno_xray_shared, false)) { + if (Args.hasFlag(options::OPT_fxray_shared, options::OPT_fno_xray_shared, + false)) { XRayShared = true; // DSO instrumentation is currently limited to x86_64 @@ -75,8 +75,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); if (!PICLvl) { - D.Diag(diag::err_opt_not_valid_without_opt) - << "-fxray-shared" << "-fPIC"; + D.Diag(diag::err_opt_not_valid_without_opt) << "-fxray-shared" + << "-fPIC"; } } diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 717cfe292ce416..675ea0cbc48c83 100644 --- 
a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,8 +93,8 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points in all currently loaded objects. See XRayPatchingStatus -/// for possible result values. +/// This tells XRay to patch the instrumentation points in all currently loaded +/// objects. See XRayPatchingStatus for possible result values. extern XRayPatchingStatus __xray_patch(); /// This tells XRay to patch the instrumentation points in the given object. @@ -105,8 +105,8 @@ extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for possible -/// result values. +/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for +/// possible result values. extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// This unpacks the given (packed) function id and patches @@ -114,8 +114,8 @@ extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This patches a specific function in the given object. See XRayPatchingStatus for possible -/// result values. +/// This patches a specific function in the given object. See XRayPatchingStatus +/// for possible result values. extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, int32_t ObjId); @@ -129,26 +129,29 @@ extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, int32_t ObjId); -/// This function unpacks the given (packed) function id and returns the address of the corresponding function. We return 0 if we encounter any error, even if 0 may be a valid function -/// address. 
+/// This function unpacks the given (packed) function id and returns the address +/// of the corresponding function. We return 0 if we encounter any error, even +/// if 0 may be a valid function address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the address of the function in the given object provided valid function and object -/// ids. We return 0 if we encounter any error, even if 0 may be a valid function -/// address. +/// This function returns the address of the function in the given object +/// provided valid function and object ids. We return 0 if we encounter any +/// error, even if 0 may be a valid function address. extern uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId); -/// This function returns the maximum valid function id for the main executable (object id = 0). Returns 0 if we -/// encounter errors (when there are no instrumented functions, etc.). +/// This function returns the maximum valid function id for the main executable +/// (object id = 0). Returns 0 if we encounter errors (when there are no +/// instrumented functions, etc.). extern size_t __xray_max_function_id(); -/// This function returns the maximum valid function id for the given object. Returns 0 if we -/// encounter errors (when there are no instrumented functions, etc.). +/// This function returns the maximum valid function id for the given object. +/// Returns 0 if we encounter errors (when there are no instrumented functions, +/// etc.). extern size_t __xray_max_function_id_in_object(int32_t ObjId); -/// This function returns the number of previously registered objects (executable + loaded DSOs). -/// Returns 0 if XRay has not been initialized. +/// This function returns the number of previously registered objects +/// (executable + loaded DSOs). Returns 0 if XRay has not been initialized. extern size_t __xray_num_objects(); /// Unpacks the function id from the given packed id. 
@@ -158,7 +161,8 @@ extern int32_t __xray_unpack_function_id(int32_t PackedId); extern int32_t __xray_unpack_object_id(int32_t PackedId); /// Creates and returns a packed id from the given function and object ids. -/// If the ids do not fit within the reserved number of bits for each part, the high bits are truncated. +/// If the ids do not fit within the reserved number of bits for each part, the +/// high bits are truncated. extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); /// Initialize the required XRay data structures. This is useful in cases where diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 16e60bfc22cd10..402fc3d07b4e2a 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "xray_interface_internal.h" -#include "llvm/Support/ErrorHandling.h" #include #include @@ -411,9 +410,9 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { CombinedStatus = NOT_INITIALIZED; break; case ONGOING: - llvm_unreachable("Status ONGOING should not appear at this point"); + UNREACHABLE("Status ONGOING should not appear at this point"); default: - llvm_unreachable("Unhandled patching status"); + UNREACHABLE("Unhandled patching status"); } } return CombinedStatus; From 0163ac1f53abc0a0f6e5b7e56912c1dee67e7f32 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Fri, 11 Oct 2024 12:23:37 +0100 Subject: [PATCH 154/177] [Flang][OpenMP]Add tests for TODOs and small changes to improve messages (#111562) The bulk of this change is new tests to check that we get a "Not yet implemented: *some stuff here*" message when using some not yet supported OpenMP functionality. 
For some of these cases, this also means adding additional clauses to a filter list in OpenMP.cpp - this changes nothing [to the best of my understanding] other than allowing the clause to get to the point where it can be rejected in a TODO with a more clear message. One of the TODO filters was missing Mergeable clause, so this was also added and the existing test updated for the new more specific error message. There is no functional change intended here. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 9 ++++++--- flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 | 14 ++++++++++++++ flang/test/Lower/OpenMP/Todo/reduction-task.f90 | 12 ++++++++++++ .../test/Lower/OpenMP/Todo/target-inreduction.f90 | 15 +++++++++++++++ flang/test/Lower/OpenMP/Todo/task-inreduction.f90 | 15 +++++++++++++++ flang/test/Lower/OpenMP/Todo/task_mergeable.f90 | 2 +- .../OpenMP/Todo/taskgroup-task-reduction.f90 | 10 ++++++++++ flang/test/Lower/OpenMP/Todo/taskloop.f90 | 13 +++++++++++++ flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 | 10 ++++++++++ flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 | 8 ++++++++ 10 files changed, 104 insertions(+), 4 deletions(-) create mode 100644 flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/reduction-task.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/target-inreduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/task-inreduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0735e40ea2ca7e..a89029b720e788 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1310,8 +1310,8 @@ static void genTaskClauses(lower::AbstractConverter &converter, 
cp.processUntied(clauseOps); // TODO Support delayed privatization. - cp.processTODO( - loc, llvm::omp::Directive::OMPD_task); + cp.processTODO(loc, llvm::omp::Directive::OMPD_task); } static void genTaskgroupClauses(lower::AbstractConverter &converter, @@ -2780,7 +2780,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, !std::holds_alternative(clause.u) && !std::holds_alternative(clause.u) && !std::holds_alternative(clause.u) && - !std::holds_alternative(clause.u)) { + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u)) { TODO(clauseLocation, "OpenMP Block construct clause"); } } diff --git a/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 b/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 new file mode 100644 index 00000000000000..c5f196fe09693a --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 @@ -0,0 +1,14 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Reduction modifiers are not supported +subroutine reduction_inscan() + integer :: i,j + i = 0 + + !$omp do reduction(inscan, +:i) + do j=1,10 + i = i + 1 + end do + !$omp end do +end subroutine reduction_inscan diff --git a/flang/test/Lower/OpenMP/Todo/reduction-task.f90 b/flang/test/Lower/OpenMP/Todo/reduction-task.f90 new file mode 100644 index 00000000000000..6707f65e1a4cc3 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/reduction-task.f90 @@ -0,0 +1,12 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Reduction modifiers are not supported +subroutine reduction_task() + integer :: i + i = 0 + + !$omp parallel reduction(task, +:i) + i = i + 1 + !$omp end parallel +end subroutine reduction_task diff --git a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 new file mode 100644 index 00000000000000..e5a9cffac5a117 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TARGET construct +subroutine omp_target_inreduction() + integer i + i = 0 + !$omp target in_reduction(+:i) + i = i + 1 + !$omp end target +end subroutine omp_target_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 new file mode 100644 index 00000000000000..aeed680a6dba7c --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! 
CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASK construct +subroutine omp_task_in_reduction() + integer i + i = 0 + !$omp task in_reduction(+:i) + i = i + 1 + !$omp end task +end subroutine omp_task_in_reduction diff --git a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 index 13145d92ccf902..ddc27487abfe9c 100644 --- a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 +++ b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 @@ -5,7 +5,7 @@ ! `mergeable` clause !=============================================================================== -! CHECK: not yet implemented: OpenMP Block construct clause +! CHECK: not yet implemented: Unhandled clause MERGEABLE in TASK construct subroutine omp_task_mergeable() !$omp task mergeable call foo() diff --git a/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 new file mode 100644 index 00000000000000..1cb471d784d766 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 @@ -0,0 +1,10 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause TASK_REDUCTION in TASKGROUP construct +subroutine omp_taskgroup_task_reduction + integer :: res + !$omp taskgroup task_reduction(+:res) + res = res + 1 + !$omp end taskgroup +end subroutine omp_taskgroup_task_reduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop.f90 b/flang/test/Lower/OpenMP/Todo/taskloop.f90 new file mode 100644 index 00000000000000..aca050584cbbe3 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Taskloop construct +subroutine omp_taskloop + integer :: res, i + !$omp taskloop + do i = 1, 10 + res = res + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop + diff --git a/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 b/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 new file mode 100644 index 00000000000000..d1f953be8802fa --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 @@ -0,0 +1,10 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause DEPEND in TASKWAIT construct +subroutine omp_tw_depend + integer :: res + !$omp taskwait depend(out: res) + res = res + 1 +end subroutine omp_tw_depend + diff --git a/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 b/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 new file mode 100644 index 00000000000000..21e8609b08ba37 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 @@ -0,0 +1,8 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause NOWAIT in TASKWAIT construct +subroutine omp_tw_nowait + !$omp taskwait nowait +end subroutine omp_tw_nowait + From b5ea5be2a714e28bac57d417c221f687efe396bf Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 11 Oct 2024 13:24:54 +0200 Subject: [PATCH 155/177] [RISCV][MC] Fix >32bit .insn Directives (#111878) The original patch had a reasonably significant bug. You could not use `.insn` to assemble encodings that had any bits set above the low 32 bits. This is due to the fact that `getMachineOpValue` was truncating the immediate value, and I did not commit enough tests of useful cases. 
This changes the result of `getMachineOpValue` to be able to return the 48-bit and 64-bit immediates needed for the wider `.insn` directives. I took the opportunity to move some of the test cases around in the file to make looking at the output of `llvm-objdump` a little clearer. --- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 6 ++-- llvm/test/MC/RISCV/insn.s | 35 +++++++++++++++---- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 66970ed37f2724..54f1a3899c4957 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -77,7 +77,7 @@ class RISCVMCCodeEmitter : public MCCodeEmitter { /// Return binary encoding of operand. If the machine operand requires /// relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -375,7 +375,7 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, ++MCNumEmitted; // Keep track of the # of mi's emitted. 
} -unsigned +uint64_t RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { @@ -384,7 +384,7 @@ RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); if (MO.isImm()) - return static_cast(MO.getImm()); + return MO.getImm(); llvm_unreachable("Unhandled expression!"); return 0; diff --git a/llvm/test/MC/RISCV/insn.s b/llvm/test/MC/RISCV/insn.s index e32fec25bb16b4..d24f4fe8b36374 100644 --- a/llvm/test/MC/RISCV/insn.s +++ b/llvm/test/MC/RISCV/insn.s @@ -170,17 +170,40 @@ target: # CHECK-OBJ: .insn 6, 0x1f -# CHECK-ASM: .insn 0x4, 65503 -# CHECK-ASM: encoding: [0xdf,0xff,0x00,0x00] -# CHECK-OBJ: -.insn 0xffdf - # CHECK-ASM: .insn 0x8, 63 # CHECK-ASM: encoding: [0x3f,0x00,0x00,0x00,0x00,0x00,0x00,0x00] # CHECK-OBJ: .insn 8, 0x3f +# CHECK-ASM: .insn 0x6, 281474976710623 +# CHECK-ASM: encoding: [0xdf,0xff,0xff,0xff,0xff,0xff] +# CHECK-OBJ: +.insn 0x6, 0xffffffffffdf + +# CHECK-ASM: .insn 0x8, -65 +# CHECK-ASM: encoding: [0xbf,0xff,0xff,0xff,0xff,0xff,0xff,0xff] +# CHECK-OBJ: +.insn 0x8, 0xffffffffffffffbf + +odd_lengths: +# CHECK-ASM-LABEL: odd_lengths: +# CHECK-OBJ-LABEL: : + +## These deliberately disagree with the lengths objdump expects them to have, so +## keep them at the end so that the disassembled instruction stream is not out +## of sync with the encoded instruction stream. We don't check for `` +## as we could get any number of those, so instead check for the encoding +## halfwords. These might be split into odd 16-bit chunks, so each chunk is on +## one line. 
+ +# CHECK-ASM: .insn 0x4, 65503 +# CHECK-ASM: encoding: [0xdf,0xff,0x00,0x00] +# CHECK-OBJ: ffdf +# CHECK-OBJ: 0000 +.insn 0xffdf + # CHECK-ASM: .insn 0x4, 65471 # CHECK-ASM: encoding: [0xbf,0xff,0x00,0x00] -# CHECK-OBJ: +# CHECK-OBJ: ffbf +# CHECK-OBJ: 0000 .insn 0xffbf From 303c8d20601d810c177f6646f771c1eb3f29ab8c Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Fri, 11 Oct 2024 12:29:44 +0100 Subject: [PATCH 156/177] [AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. (#111538) Introduce a description of late forwarding to the Neoverse-V1 Scheduling model. --- .../Target/AArch64/AArch64SchedNeoverseV1.td | 207 ++- .../llvm-mca/AArch64/Neoverse/V1-forwarding.s | 1421 +++++++++++++++++ .../AArch64/Neoverse/V1-neon-instructions.s | 138 +- 3 files changed, 1645 insertions(+), 121 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index f7e6545f0dd386..fb4d2f3d7bcd3a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -469,6 +469,89 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV]>; +//===----------------------------------------------------------------------===// +// Define forwarded types + +// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for +// consumers of 64 bit multiply high operations? 
+def V1Wr_IM : SchedWriteRes<[V1UnitM]> { let Latency = 2; } +def V1Wr_IMA : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1WriteIM : SchedWriteVariant< + [SchedVar, + SchedVar]>; +def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>; + +def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>; + +def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>; + +def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>; + +def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>; + +def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>; + +def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>; + +def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>; + +def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>; + +def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>; + +def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>; + +def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>; + +def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>; + +def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>; + +def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>; + +def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let 
Latency = 3; } +def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>; + +def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>; + +def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; } +def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>; + +def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; } +def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>; + +let Latency = 5, NumMicroOps = 2 in +def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>; +def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>; + +def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>; + +def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>; + +def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>; +def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>; +def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>; // Miscellaneous Instructions // ----------------------------------------------------------------------------- @@ -553,16 +636,19 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>; def : SchedAlias; def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + // Multiply -// Multiply accumulate -// Multiply accumulate, long -// Multiply long -def V1WriteIM : SchedWriteVariant< - [SchedVar, - SchedVar]>; -def : SchedAlias; -def : SchedAlias; +// Multiply accumulate, W-form +// Multiply accumulate, X-form +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^M(ADD|SUB)[WX]rrr$")>; +// Multiply accumulate long +// Multiply long +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; // Multiply high def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], 
(instrs SMULHrr, UMULHrr)>; @@ -680,10 +766,11 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>; def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>; // FP multiply -def : SchedAlias; +def : WriteRes { let Latency = 3; } // FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; +def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], + (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; // FP round to integral def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$", @@ -824,7 +911,7 @@ def : SchedAlias; // ASIMD absolute diff accum // ASIMD absolute diff accum long // ASIMD pairwise add and accumulate long -def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; +def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; // ASIMD arith, reduce, 4H/4S // ASIMD max/min, reduce, 4H/4S @@ -843,23 +930,26 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$", // ASIMD dot product // ASIMD dot product using signed and unsigned integers -def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; +def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], + (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; -// ASIMD matrix multiply- accumulate -def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>; +// ASIMD matrix multiply-accumulate +def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; // ASIMD multiply +def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>; + // ASIMD multiply accumulate +def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>; + // ASIMD multiply accumulate long +def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + // ASIMD multiply accumulate high +def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + // ASIMD multiply accumulate saturating long -def : InstRW<[V1Write_4c_1V02], - (instregex "^MUL(v[148]i16|v[124]i32)$", - 
"^SQR?DMULH(v[48]i16|v[24]i32)$", - "^ML[AS](v[148]i16|v[124]i32)$", - "^[SU]ML[AS]Lv", - "^SQRDML[AS]H(v[148]i16|v[124]i32)$", - "^SQDML[AS]Lv")>; +def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>; // ASIMD multiply/multiply long (8x8) polynomial def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; @@ -868,11 +958,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>; // ASIMD shift accumulate +def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>; + // ASIMD shift by immed, complex // ASIMD shift by register, complex def : InstRW<[V1Write_4c_1V13], - (instregex "^[SU]R?SRAv", - "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", + (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv", "^[SU]Q?RSHLv", "^[SU]QSHLv")>; @@ -890,16 +981,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv", // ASIMD FP absolute value/difference // ASIMD FP arith, normal // ASIMD FP compare -// ASIMD FP complex add // ASIMD FP max/min, normal // ASIMD FP max/min, pairwise // ASIMD FP negate // Covered by "SchedAlias (WriteV[dq]...)" above +// ASIMD FP complex add +def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>; + // ASIMD FP complex multiply add +def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>; + +// ASIMD FP multiply +def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>; + // ASIMD FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$", - "^FML[AS]v")>; +def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>; // ASIMD FP convert, long (F16 to F32) def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>; @@ 
-953,12 +1053,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>; // ASIMD FP max/min, reduce, Q-form F16 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>; -// ASIMD FP multiply -def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>; - -// ASIMD FP multiply accumulate long -def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>; - // ASIMD FP round, D-form F32 and Q-form F64 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>; @@ -976,13 +1070,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>; def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>; // ASIMD dot product -def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; +def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; // ASIMD matrix multiply accumulate -def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>; +def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>; // ASIMD multiply accumulate long -def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>; +def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>; // Scalar convert, F32 to BF16 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>; @@ -1300,7 +1394,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; // ----------------------------------------------------------------------------- // CRC checksum ops -def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>; +def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>; // SVE Predicate instructions @@ -1440,13 +1534,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>; // Dot product, 8 bit, using signed and unsigned integers -def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; +def 
: InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], + (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; // Dot product, 16 bit -def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>; // Duplicate, immediate and indexed form def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$", @@ -1488,7 +1583,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$", "^MOVPRFX_ZZ$")>; // Matrix multiply-accumulate -def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; // Multiply, B, H, S element size def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", @@ -1497,12 +1592,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", "^[SU]MULH_ZPZZ_[BHS]")>; // Multiply, D element size -// Multiply accumulate, D element size def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", "^MUL_ZPZZ_D", "^[SU]MULH_(ZPmZ|ZZZ)_D", - "^[SU]MULH_ZPZZ_D", - "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>; + "^[SU]MULH_ZPZZ_D")>; + +// Multiply accumulate, D element size +def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD], + (instregex "^ML[AS]_ZPZZZ_D")>; +def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; // Multiply accumulate, B, H, S element size // NOTE: This is not specified in the SOG. 
@@ -1583,8 +1682,8 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$", def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>; // Floating point complex multiply add -def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$", - "^FCMLA_ZZZI_[HS]$")>; +def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; // Floating point convert, long or narrow (F16 to F32 or F32 to F16) // Floating point convert to integer, F32 @@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", "^FMUL_ZPZ[IZ]_[HSD]")>; // Floating point multiply accumulate +def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; + // Floating point reciprocal step -def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$", - "^FN?ML[AS]_ZPZZZ_[HSD]", - "^FML[AS]_ZZZI_[HSD]$", - "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>; +def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; // Floating point reciprocal estimate, F16 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>; @@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$", def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; // Dot product -def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; +def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; // Multiply accumulate long -def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; +def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; // SVE Load 
instructions diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s new file mode 100644 index 00000000000000..4de37f96000520 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s @@ -0,0 +1,1421 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd +mul x0, x0, x0 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smaddl +mul x0, x0, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w0, w0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmadd +fadd d0, d0, d0 +fmadd d0, d1, d2, d0 +fmul d0, d0, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d0, d1, d2 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN saba +mul v0.4s, v0.4s, v0.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sadalp +mul v0.4s, v0.4s, v0.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v0.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sdot +mul v0.4s, v0.4s, v0.4s +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smmla +mul v0.4s, v0.4s, v0.4s +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN mla +mul v0.4s, v0.4s, v0.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smlal2 +mul v0.4s, v0.4s, v0.4s +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ssra +mul v0.4s, v0.4s, v0.4s +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v0.2d, #1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fcmla +fmul 
v0.4s, v0.4s, v0.4s +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v0.2d, v1.2d, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmla +fmul v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fadd v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v0.2d, v1.2d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmlal +fmul v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fadd v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v0.4h, v1.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfdot +fmul v0.2d, v0.2d, v0.2d +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmmla +fmul v0.2d, v0.2d, v0.2d +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul v0.2d, v0.2d, v0.2d +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN crc32cb +mul w0, w0, w0 +crc32cb w0, w0, w1 +crc32cb w0, w0, w1 +crc32cb w0, w0, w0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.s +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b +sdot z0.s, z1.b, z2.b +sdot z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sudot +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z0.b, z1.b[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.d +mul z0.d, p0/m, z0.d, z0.d +sdot z0.d, z1.h, z2.h +sdot z0.d, z1.h, z2.h +sdot z0.d, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z smmla +mul z0.d, p0/m, z0.d, z0.d +smmla z0.s, z1.b, z2.b +smmla z0.s, z1.b, z2.b +smmla z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mla.d +mul z0.d, p0/m, z0.d, z0.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mad.d +mul z0.d, p0/m, z0.d, z0.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END 
+ +# LLVM-MCA-BEGIN Z msb.d +mul z0.d, p0/m, z0.d, z0.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZPmZZ +fmul z0.d, z0.d, z0.d +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z0.d, z1.d, 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZZZI +fmul z0.d, z0.d, z0.d +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z0.s, z1.s[1], 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZPmZZ +fmul z0.d, z0.d, z0.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZZZI +fmul z0.d, z0.d, z0.d +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z0.d, z1.d[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfdot +fmul z0.d, z0.d, z0.d +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfmmla +fmul z0.d, z0.d, z0.d +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul z0.d, z0.d, z0.d +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z0.h, z1.h +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. 
madd x0, x1, x2, x0 +# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [1] Code Region - smaddl + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 3. 
2 9.5 0.0 0.0 smaddl x0, w0, w0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [2] Code Region - fmadd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeER. . . . . . .. fadd d0, d0, d0 +# CHECK-NEXT: [0,1] D==eeeeER . . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,2] D======eeeER . . . . .. fmul d0, d0, d0 +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0 +# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0 +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 
2 22.5 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 15.7 0.1 0.0 + +# CHECK: [3] Code Region - saba + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [4] Code Region - sadalp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [5] Code Region - sdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . 
mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [6] Code Region - smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . 
smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [7] Code Region - mla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . 
mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [8] Code Region - smlal2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . 
smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [9] Code Region - ssra + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [10] Code Region - fcmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [11] Code Region - fmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,2] D=====eeER. . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 
2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 15.3 0.1 0.0 + +# CHECK: [12] Code Region - fmlal + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 2203 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456 + +# CHECK: [0,0] DeeeER . . . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,2] D========eeER . . . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,4] D============eeeeeER. . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,5] D=================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,0] D======================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D=========================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,2] D==============================eeER. . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D================================eeeeeER. .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] D==================================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] D=======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 12.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 
2 15.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 2. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 29.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 20.3 0.1 0.0 + +# CHECK: [13] Code Region - bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [14] Code Region - bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 20.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [15] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfmlalb v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [16] Code Region - crc32cb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul w0, w0, w0 +# CHECK-NEXT: [0,1] D==eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,2] D===eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,3] D=====eeER. .. crc32cb w0, w0, w0 +# CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0 +# CHECK-NEXT: [1,1] D=========eeER .. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,2] D==========eeER.. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,3] D============eeER crc32cb w0, w0, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32cb w0, w0, w0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [17] Code Region - Z sdot.s + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. 
sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [18] Code Region - Z sudot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,2] D==================eeeER .. 
sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [19] Code Region - Z sdot.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeER . 
sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 13.3 0.1 0.0 + +# CHECK: [20] Code Region - Z smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER smmla z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [21] Code Region - Z mla.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [22] Code Region - Z mad.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mad z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [23] Code Region - Z msb.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER msb z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [24] Code Region - Z fcmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [25] Code Region - Z fcmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [26] Code Region - Z fmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [27] Code Region - Z fmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [28] Code Region - Z bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [29] Code Region - Z bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 20.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [30] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 13.0 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s index 1e8df4770d7950..65b73177c7b70a 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s @@ -1365,8 +1365,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 fcmgt s10, s11, s12 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, #0.0 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 2 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 -# CHECK-NEXT: 1 2 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 +# CHECK-NEXT: 1 4 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: 1 4 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 # CHECK-NEXT: 1 2 0.25 fcmle d20, d21, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle s10, s11, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle v0.2d, v0.2d, #0.0 @@ -1651,7 +1651,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 7 8 1.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: 8 8 1.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: 8 8 1.00 * ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: 1 2 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 2 0.25 mov b0, v0.b[15] # CHECK-NEXT: 1 2 0.25 mov d6, v0.d[1] @@ -1673,7 +1673,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: 1 2 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: 1 2 0.25 movi v0.8b, #255 -# CHECK-NEXT: 1 2 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 mvni v0.2s, #0 # CHECK-NEXT: 1 2 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 2 0.25 neg d29, d24 @@ -1780,10 +1780,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 scvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 scvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 scvtf v0.8h, 
v0.8h -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 shadd v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 shl d7, d10, #12 # CHECK-NEXT: 1 2 0.50 shl v0.16b, v0.16b, #3 @@ -1873,26 +1873,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqadd b20, b11, b15 # CHECK-NEXT: 1 2 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 2 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12 # CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12 # CHECK-NEXT: 1 4 0.50 sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13 # CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25 # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3] +# 
CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sqdmull d1, s1, v0.s[1] @@ -1914,34 +1914,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 
0.50 sqrdmlsh h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sqrshl d31, d31, d31 @@ -2124,8 +2124,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 5 4 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], x5 # CHECK-NEXT: 1 2 0.25 sub d15, d5, d16 # CHECK-NEXT: 1 2 0.25 sub v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 2 0.25 sudot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sudot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 suqadd b19, b14 # CHECK-NEXT: 1 2 0.25 suqadd d18, d22 # CHECK-NEXT: 1 2 0.25 suqadd h20, h15 @@ -2222,10 +2222,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 uhadd v0.16b, v0.16b, v0.16b # 
CHECK-NEXT: 1 2 0.25 uhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 uhsub v0.4s, v0.4s, v0.4s @@ -2356,10 +2356,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ursra v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.50 ushl d0, d0, d0 # CHECK-NEXT: 1 2 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2465,7 +2465,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] -# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 394.50 377.00 349.00 331.50 +# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 401.00 370.50 355.50 325.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions: @@ -2892,7 +2892,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 1.00 - - - - 1.00 1.00 1.00 1.00 ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 
- - - - - - - 0.25 0.25 0.25 0.25 mov b0, v0.b[15] # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mov d6, v0.d[1] @@ -2914,7 +2914,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.8b, #255 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.2s, #0 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 neg d29, d24 @@ -3114,26 +3114,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd b20, b11, b15 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d19, s24, s12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s17, h27, h12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d12, s23, s13 # 
CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s14, h12, h25 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmull d1, s1, v0.s[1] @@ -3155,34 +3155,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# 
CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - 
sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 sqrshl d31, d31, d31 From 90627a5a190a99ae2991d524580d866484aaba16 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 11 Oct 2024 14:01:58 +0200 Subject: [PATCH 157/177] Revert "[XRay] Add support for instrumentation of DSOs on x86_64 (#90959)" This reverts commit a4402039bffd788b9af82435fd5a2fb311fdc6e8 and 4451f9f812d458f6b53785b27869674caf01e67b --- clang/include/clang/Basic/CodeGenOptions.def | 2 - clang/include/clang/Driver/Options.td | 5 - clang/include/clang/Driver/XRayArgs.h | 2 - 
clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 +- clang/lib/Driver/XRayArgs.cpp | 21 -- clang/test/Driver/XRay/xray-shared.cpp | 17 - .../cmake/Modules/AllSupportedArchDefs.cmake | 1 - compiler-rt/cmake/config-ix.cmake | 4 - compiler-rt/include/xray/xray_interface.h | 65 +--- compiler-rt/lib/xray/CMakeLists.txt | 86 +----- compiler-rt/lib/xray/xray_dso_init.cpp | 62 ---- compiler-rt/lib/xray/xray_init.cpp | 183 ++--------- compiler-rt/lib/xray/xray_interface.cpp | 291 ++++-------------- .../lib/xray/xray_interface_internal.h | 83 +---- compiler-rt/lib/xray/xray_trampoline_x86_64.S | 24 +- compiler-rt/lib/xray/xray_x86_64.cpp | 23 +- .../xray/TestCases/Posix/basic-mode-dso.cpp | 47 --- .../TestCases/Posix/clang-xray-shared.cpp | 14 - .../test/xray/TestCases/Posix/dlopen.cpp | 107 ------- .../xray/TestCases/Posix/dso-dep-chains.cpp | 197 ------------ .../TestCases/Posix/patch-premain-dso.cpp | 45 --- .../Posix/patching-unpatching-dso.cpp | 75 ----- 22 files changed, 147 insertions(+), 1219 deletions(-) delete mode 100644 clang/test/Driver/XRay/xray-shared.cpp delete mode 100644 compiler-rt/lib/xray/xray_dso_init.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/dlopen.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index e45370bde74a5d..eac831278ee20d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -136,8 +136,6 @@ CODEGENOPT(XRayIgnoreLoops , 1, 0) ///< Emit the XRay function index section. 
CODEGENOPT(XRayFunctionIndex , 1, 1) -///< Set when -fxray-shared is enabled -CODEGENOPT(XRayShared , 1, 0) ///< Set the minimum number of instructions in a function to determine selective ///< XRay instrumentation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4ee16e213d0e13..d306c751505e98 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2946,11 +2946,6 @@ def fxray_selected_function_group : HelpText<"When using -fxray-function-groups, select which group of functions to instrument. Valid range is 0 to fxray-function-groups - 1">, MarshallingInfoInt, "0">; -defm xray_shared : BoolFOption<"xray-shared", - CodeGenOpts<"XRayShared">, DefaultFalse, - PosFlag, - NegFlag>; defm fine_grained_bitfield_accesses : BoolOption<"f", "fine-grained-bitfield-accesses", CodeGenOpts<"FineGrainedBitfieldAccesses">, DefaultFalse, diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index 1b5c4a4c42f12a..bdd3d979547eed 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -27,7 +27,6 @@ class XRayArgs { XRayInstrSet InstrumentationBundle; llvm::opt::Arg *XRayInstrument = nullptr; bool XRayRT = true; - bool XRayShared = false; public: /// Parses the XRay arguments from an argument list. 
@@ -36,7 +35,6 @@ class XRayArgs { llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } - bool needsXRayDSORt() const { return XRayInstrument && XRayRT && XRayShared; } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0a1b7c209563e8..0c6a585c3acffd 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1613,14 +1613,10 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_shared)) { - if (TC.getXRayArgs().needsXRayDSORt()) { - CmdArgs.push_back("--whole-archive"); - CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); - CmdArgs.push_back("--no-whole-archive"); - return true; - } - } else if (TC.getXRayArgs().needsXRayRt()) { + if (Args.hasArg(options::OPT_shared)) + return false; + + if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index d0bb5d4887c184..8c5134e2501358 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,23 +63,6 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } - if (Args.hasFlag(options::OPT_fxray_shared, options::OPT_fno_xray_shared, - false)) { - XRayShared = true; - - // DSO instrumentation is currently limited to x86_64 - if (Triple.getArch() != llvm::Triple::x86_64) { - D.Diag(diag::err_drv_unsupported_opt_for_target) - << "-fxray-shared" << Triple.str(); - } - - 
unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); - if (!PICLvl) { - D.Diag(diag::err_opt_not_valid_without_opt) << "-fxray-shared" - << "-fPIC"; - } - } - // Both XRay and -fpatchable-function-entry use // TargetOpcode::PATCHABLE_FUNCTION_ENTER. if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) @@ -194,10 +177,6 @@ void XRayArgs::addArgs(const ToolChain &TC, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fxray_function_index, options::OPT_fno_xray_function_index); - if (XRayShared) - Args.addOptInFlag(CmdArgs, options::OPT_fxray_shared, - options::OPT_fno_xray_shared); - if (const Arg *A = Args.getLastArg(options::OPT_fxray_instruction_threshold_EQ)) { int Value; diff --git a/clang/test/Driver/XRay/xray-shared.cpp b/clang/test/Driver/XRay/xray-shared.cpp deleted file mode 100644 index 215854e1fc7cef..00000000000000 --- a/clang/test/Driver/XRay/xray-shared.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fpic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-PIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-pic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC - -// On 64 bit darwin, PIC is always enabled -// RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s - -// Check unsupported targets -// RUN: not %clang -### --target=aarch64-pc-freebsd -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s 
--check-prefix=ERR-TARGET -// RUN: not %clang -### --target=arm64-apple-macos -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET - -// CHECK: "-cc1" {{.*}}"-fxray-instrument" {{.*}}"-fxray-shared" -// ERR-TARGET: error: unsupported option '-fxray-shared' for target -// ERR-PIC: error: option '-fxray-shared' cannot be specified without '-fPIC' - diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index 50a4256b82fe4e..809e9277156912 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -104,7 +104,6 @@ else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le ${HEXAGON} ${LOONGARCH64}) endif() -set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 6134c9876b38e9..a93a88a9205001 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -668,9 +668,6 @@ if(APPLE) list_intersect(XRAY_SUPPORTED_ARCH ALL_XRAY_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) - list_intersect(XRAY_DSO_SUPPORTED_ARCH - ALL_XRAY_DSO_SUPPORTED_ARCH - SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH ALL_SHADOWCALLSTACK_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -705,7 +702,6 @@ else() filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH}) filter_available_targets(SCUDO_STANDALONE_SUPPORTED_ARCH ${ALL_SCUDO_STANDALONE_SUPPORTED_ARCH}) filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH}) - filter_available_targets(XRAY_DSO_SUPPORTED_ARCH ${ALL_XRAY_DSO_SUPPORTED_ARCH}) filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH 
${ALL_GWP_ASAN_SUPPORTED_ARCH}) diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 675ea0cbc48c83..727431c04e4f73 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,78 +93,31 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points in all currently loaded -/// objects. See XRayPatchingStatus for possible result values. +/// This tells XRay to patch the instrumentation points. See XRayPatchingStatus +/// for possible result values. extern XRayPatchingStatus __xray_patch(); -/// This tells XRay to patch the instrumentation points in the given object. -/// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); - /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for -/// possible result values. -extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); - -/// This unpacks the given (packed) function id and patches -/// the corresponding function. See XRayPatchingStatus for possible +/// This patches a specific function id. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This patches a specific function in the given object. See XRayPatchingStatus -/// for possible result values. -extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId); - -/// This unpacks the given (packed) function id and unpatches -/// the corresponding function. See XRayPatchingStatus for possible +/// This unpatches a specific function id. See XRayPatchingStatus for possible /// result values. 
extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); -/// This unpatches a specific function in the given object. -/// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId); - -/// This function unpacks the given (packed) function id and returns the address -/// of the corresponding function. We return 0 if we encounter any error, even -/// if 0 may be a valid function address. +/// This function returns the address of the function provided a valid function +/// id. We return 0 if we encounter any error, even if 0 may be a valid function +/// address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the address of the function in the given object -/// provided valid function and object ids. We return 0 if we encounter any -/// error, even if 0 may be a valid function address. -extern uintptr_t __xray_function_address_in_object(int32_t FuncId, - int32_t ObjId); - -/// This function returns the maximum valid function id for the main executable -/// (object id = 0). Returns 0 if we encounter errors (when there are no -/// instrumented functions, etc.). +/// This function returns the maximum valid function id. Returns 0 if we +/// encounter errors (when there are no instrumented functions, etc.). extern size_t __xray_max_function_id(); -/// This function returns the maximum valid function id for the given object. -/// Returns 0 if we encounter errors (when there are no instrumented functions, -/// etc.). -extern size_t __xray_max_function_id_in_object(int32_t ObjId); - -/// This function returns the number of previously registered objects -/// (executable + loaded DSOs). Returns 0 if XRay has not been initialized. -extern size_t __xray_num_objects(); - -/// Unpacks the function id from the given packed id. -extern int32_t __xray_unpack_function_id(int32_t PackedId); - -/// Unpacks the object id from the given packed id. 
-extern int32_t __xray_unpack_object_id(int32_t PackedId); - -/// Creates and returns a packed id from the given function and object ids. -/// If the ids do not fit within the reserved number of bits for each part, the -/// high bits are truncated. -extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); - /// Initialize the required XRay data structures. This is useful in cases where /// users want to control precisely when the XRay instrumentation data /// structures are initialized, for example when the XRay library is built with diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index f38c07420c9abf..cf7b5062aae32d 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -10,10 +10,6 @@ set(XRAY_SOURCES xray_utils.cpp ) -set(XRAY_DSO_SOURCES - xray_dso_init.cpp - ) - # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cpp @@ -37,11 +33,6 @@ set(x86_64_SOURCES xray_trampoline_x86_64.S ) -set(x86_64_DSO_SOURCES - xray_trampoline_x86_64.S - ) - - set(arm_SOURCES xray_arm.cpp xray_trampoline_arm.S @@ -137,12 +128,10 @@ set(XRAY_IMPL_HEADERS # consumption by tests. set(XRAY_ALL_SOURCE_FILES ${XRAY_SOURCES} - ${XRAY_DSO_SOURCES} ${XRAY_FDR_MODE_SOURCES} ${XRAY_BASIC_MODE_SOURCES} ${XRAY_PROFILING_MODE_SOURCES} ${x86_64_SOURCES} - ${x86_64_DSO_SOURCES} ${arm_SOURCES} ${armhf_SOURCES} ${hexagon_SOURCES} @@ -173,9 +162,6 @@ set(XRAY_CFLAGS ${COMPILER_RT_CXX_CFLAGS}) set(XRAY_COMMON_DEFINITIONS SANITIZER_COMMON_NO_REDEFINE_BUILTINS XRAY_HAS_EXCEPTIONS=1) -# DSO trampolines need to be compiled with GOT addressing -set(XRAY_COMMON_DEFINITIONS_DSO ${XRAY_COMMON_DEFINITIONS} XRAY_PIC) - # Too many existing bugs, needs cleanup. 
append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format XRAY_CFLAGS) @@ -215,16 +201,7 @@ if (APPLE) CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) - add_compiler_rt_object_libraries(RTXrayDSO - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - SOURCES ${XRAY_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) set(XRAY_RTXRAY_ARCH_LIBS "") - set(XRAY_DSO_RTXRAY_ARCH_LIBS "") foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT ${arch} IN_LIST XRAY_SOURCE_ARCHS) continue() @@ -238,17 +215,6 @@ if (APPLE) DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) list(APPEND XRAY_RTXRAY_ARCH_LIBS RTXray_${arch}) - if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) - add_compiler_rt_object_libraries(RTXrayDSO_${arch} - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - SOURCES ${${arch}_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) - list(APPEND XRAY_DSO_RTXRAY_ARCH_LIBS RTXrayDSO_${arch}) - endif() endforeach() add_compiler_rt_object_libraries(RTXrayFDR OS ${XRAY_SUPPORTED_OS} @@ -286,17 +252,6 @@ if (APPLE) LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) - add_compiler_rt_runtime(clang_rt.xray-dso - STATIC - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - OBJECT_LIBS RTXrayDSO ${XRAY_DSO_RTXRAY_ARCH_LIBS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS} - LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - PARENT_TARGET xray) - add_compiler_rt_runtime(clang_rt.xray-fdr STATIC OS ${XRAY_SUPPORTED_OS} @@ -391,37 +346,16 @@ else() # not Apple DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) - # Profiler Mode runtime - add_compiler_rt_runtime(clang_rt.xray-profiling - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS 
${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayPROFILING - PARENT_TARGET xray) - - if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) - # TODO: Only implemented for X86 at the moment - add_compiler_rt_object_libraries(RTXrayDSO - ARCHS ${arch} - SOURCES ${XRAY_DSO_SOURCES} ${${arch}_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) - # DSO runtime archive - add_compiler_rt_runtime(clang_rt.xray-dso - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS ${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayDSO - PARENT_TARGET xray) - endif() + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) endforeach() endif() # not Apple diff --git a/compiler-rt/lib/xray/xray_dso_init.cpp b/compiler-rt/lib/xray/xray_dso_init.cpp deleted file mode 100644 index eb754db54c64fa..00000000000000 --- a/compiler-rt/lib/xray/xray_dso_init.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of XRay, a dynamic runtime instrumentation system. -// -// XRay initialisation logic for DSOs. 
-//===----------------------------------------------------------------------===// - -#include "sanitizer_common/sanitizer_atomic.h" -#include "xray_defs.h" -#include "xray_flags.h" -#include "xray_interface_internal.h" - -using namespace __sanitizer; - -extern "C" { -extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)) -__attribute__((visibility("hidden"))); - -#if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on -// Darwin, but it will probably not work at runtime. -extern const XRaySledEntry __start_xray_instr_map[] = {}; -extern const XRaySledEntry __stop_xray_instr_map[] = {}; -extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; -extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; -#endif -} - -// Handler functions to call in the patched entry/exit sled. -extern atomic_uintptr_t XRayPatchedFunction; -extern atomic_uintptr_t XRayArgLogger; -extern atomic_uintptr_t XRayPatchedCustomEvent; -extern atomic_uintptr_t XRayPatchedTypedEvent; - -static int __xray_object_id{-1}; - -// Note: .preinit_array initialization does not work for DSOs -__attribute__((constructor(0))) static void -__xray_init_dso() XRAY_NEVER_INSTRUMENT { - // Register sleds in main XRay runtime. - __xray_object_id = - __xray_register_dso(__start_xray_instr_map, __stop_xray_instr_map, - __start_xray_fn_idx, __stop_xray_fn_idx, {}); -} - -__attribute__((destructor(0))) static void -__xray_finalize_dso() XRAY_NEVER_INSTRUMENT { - // Inform the main runtime that this DSO is no longer used. 
- __xray_deregister_dso(__xray_object_id); -} diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index 53c93be89cd148..f22a31b95686d0 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -16,8 +16,6 @@ #include #include "sanitizer_common/sanitizer_common.h" -#include "xray/xray_interface.h" -#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" @@ -30,7 +28,7 @@ extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); #if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on +// HACK: This is a temporary workaround to make XRay build on // Darwin, but it will probably not work at runtime. const XRaySledEntry __start_xray_instr_map[] = {}; extern const XRaySledEntry __stop_xray_instr_map[] = {}; @@ -45,16 +43,14 @@ using namespace __xray; // the weak symbols defined above (__start_xray_inst_map and // __stop_xray_instr_map) to initialise the instrumentation map that XRay uses // for runtime patching/unpatching of instrumentation points. +// +// FIXME: Support DSO instrumentation maps too. The current solution only works +// for statically linked executables. atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. SpinMutex XRayInstrMapMutex; - -// Contains maps for the main executable as well as DSOs. -XRaySledMap *XRayInstrMaps; - -// Number of binary objects registered. -atomic_uint32_t XRayNumObjects{0}; +XRaySledMap XRayInstrMap; // Global flag to determine whether the flags have been initialized. atomic_uint8_t XRayFlagsInitialized{0}; @@ -62,63 +58,6 @@ atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. 
SpinMutex XRayInitMutex; -// Registers XRay sleds and trampolines coming from the main executable or one -// of the linked DSOs. -// Returns the object ID if registration is successful, -1 otherwise. -int32_t -__xray_register_sleds(const XRaySledEntry *SledsBegin, - const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, bool FromDSO, - XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { - if (!SledsBegin || !SledsEnd) { - Report("Invalid XRay sleds.\n"); - return -1; - } - XRaySledMap SledMap; - SledMap.FromDSO = FromDSO; - SledMap.Loaded = true; - SledMap.Trampolines = Trampolines; - SledMap.Sleds = SledsBegin; - SledMap.Entries = SledsEnd - SledsBegin; - if (FnIndexBegin != nullptr) { - SledMap.SledsIndex = FnIndexBegin; - SledMap.Functions = FnIndexEnd - FnIndexBegin; - } else { - size_t CountFunctions = 0; - uint64_t LastFnAddr = 0; - - for (std::size_t I = 0; I < SledMap.Entries; I++) { - const auto &Sled = SledMap.Sleds[I]; - const auto Function = Sled.function(); - if (Function != LastFnAddr) { - CountFunctions++; - LastFnAddr = Function; - } - } - SledMap.SledsIndex = nullptr; - SledMap.Functions = CountFunctions; - } - if (SledMap.Functions >= XRayMaxFunctions) { - Report("Too many functions! Maximum is %ld\n", XRayMaxFunctions); - return -1; - } - - if (Verbosity()) - Report("Registering %d new functions!\n", SledMap.Functions); - - { - SpinMutexLock Guard(&XRayInstrMapMutex); - auto Idx = atomic_fetch_add(&XRayNumObjects, 1, memory_order_acq_rel); - if (Idx >= XRayMaxObjects) { - Report("Too many objects registered! Maximum is %ld\n", XRayMaxObjects); - return -1; - } - XRayInstrMaps[Idx] = std::move(SledMap); - return Idx; - } -} - // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. 
void __xray_init() XRAY_NEVER_INSTRUMENT { @@ -141,21 +80,29 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { return; } - atomic_store(&XRayNumObjects, 0, memory_order_release); - - // Pre-allocation takes up approx. 5kB for XRayMaxObjects=64. - XRayInstrMaps = allocateBuffer(XRayMaxObjects); - - int MainBinaryId = - __xray_register_sleds(__start_xray_instr_map, __stop_xray_instr_map, - __start_xray_fn_idx, __stop_xray_fn_idx, false, {}); + { + SpinMutexLock Guard(&XRayInstrMapMutex); + XRayInstrMap.Sleds = __start_xray_instr_map; + XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; + if (__start_xray_fn_idx != nullptr) { + XRayInstrMap.SledsIndex = __start_xray_fn_idx; + XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; + } else { + size_t CountFunctions = 0; + uint64_t LastFnAddr = 0; + + for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { + const auto &Sled = XRayInstrMap.Sleds[I]; + const auto Function = Sled.function(); + if (Function != LastFnAddr) { + CountFunctions++; + LastFnAddr = Function; + } + } - // The executable should always get ID 0. - if (MainBinaryId != 0) { - Report("Registering XRay sleds failed.\n"); - return; + XRayInstrMap.Functions = CountFunctions; + } } - atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT @@ -164,84 +111,6 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } -// Registers XRay sleds and trampolines of an instrumented DSO. -// Returns the object ID if registration is successful, -1 otherwise. -// -// Default visibility is hidden, so we have to explicitly make it visible to -// DSO. -SANITIZER_INTERFACE_ATTRIBUTE int32_t __xray_register_dso( - const XRaySledEntry *SledsBegin, const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, - XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { - // Make sure XRay has been initialized in the main executable. 
- __xray_init(); - - if (__xray_num_objects() == 0) { - if (Verbosity()) - Report("No XRay instrumentation map in main executable. Not initializing " - "XRay for DSO.\n"); - return -1; - } - - // Register sleds in global map. - int ObjId = __xray_register_sleds(SledsBegin, SledsEnd, FnIndexBegin, - FnIndexEnd, true, Trampolines); - -#ifndef XRAY_NO_PREINIT - if (ObjId >= 0 && flags()->patch_premain) - __xray_patch_object(ObjId); -#endif - - return ObjId; -} - -// Deregisters a DSO from the main XRay runtime. -// Called from the DSO-local runtime when the library is unloaded (e.g. if -// dlclose is called). -// Returns true if the object ID is valid and the DSO was successfully -// deregistered. -SANITIZER_INTERFACE_ATTRIBUTE bool -__xray_deregister_dso(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - - if (!atomic_load(&XRayInitialized, memory_order_acquire)) { - if (Verbosity()) - Report("XRay has not been initialized. Cannot deregister DSO.\n"); - return false; - } - - if (ObjId <= 0 || ObjId >= __xray_num_objects()) { - if (Verbosity()) - Report("Can't deregister object with ID %d: ID is invalid.\n", ObjId); - return false; - } - - { - SpinMutexLock Guard(&XRayInstrMapMutex); - auto &Entry = XRayInstrMaps[ObjId]; - if (!Entry.FromDSO) { - if (Verbosity()) - Report("Can't deregister object with ID %d: object does not correspond " - "to a shared library.\n", - ObjId); - return false; - } - if (!Entry.Loaded) { - if (Verbosity()) - Report("Can't deregister object with ID %d: object is not loaded.\n", - ObjId); - return true; - } - // Mark DSO as unloaded. No need to unpatch. - Entry.Loaded = false; - } - - if (Verbosity()) - Report("Deregistered object with ID %d.\n", ObjId); - - return true; -} - // FIXME: Make check-xray tests work on FreeBSD without // SANITIZER_CAN_USE_PREINIT_ARRAY. // See sanitizer_internal_defs.h where the macro is defined. 
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 402fc3d07b4e2a..5839043fcb93a8 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -36,8 +36,7 @@ extern __sanitizer::SpinMutex XRayInstrMapMutex; extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap *XRayInstrMaps; -extern __sanitizer::atomic_uint32_t XRayNumObjects; +extern __xray::XRaySledMap XRayInstrMap; namespace __xray { @@ -62,16 +61,16 @@ static const int16_t cSledLength = 20; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -atomic_uintptr_t XRayPatchedFunction SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedFunction{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -atomic_uintptr_t XRayArgLogger SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayArgLogger{0}; // This is the function to call when we encounter a custom event log call. -atomic_uintptr_t XRayPatchedCustomEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedCustomEvent{0}; // This is the function to call when we encounter a typed event log call. -atomic_uintptr_t XRayPatchedTypedEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedTypedEvent{0}; // This is the global status to determine whether we are currently // patching/unpatching. 
@@ -151,42 +150,27 @@ class MProtectHelper { namespace { -bool isObjectLoaded(int32_t ObjId) { - SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - return false; - } - return XRayInstrMaps[ObjId].Loaded; -} - -bool patchSled(const XRaySledEntry &Sled, bool Enable, int32_t FuncId, - const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { +bool patchSled(const XRaySledEntry &Sled, bool Enable, + int32_t FuncId) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: - Success = - patchFunctionEntry(Enable, FuncId, Sled, Trampolines.EntryTrampoline); + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); break; case XRayEntryType::EXIT: - Success = - patchFunctionExit(Enable, FuncId, Sled, Trampolines.ExitTrampoline); + Success = patchFunctionExit(Enable, FuncId, Sled); break; case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled, - Trampolines.TailExitTrampoline); + Success = patchFunctionTailExit(Enable, FuncId, Sled); break; case XRayEntryType::LOG_ARGS_ENTRY: - Success = - patchFunctionEntry(Enable, FuncId, Sled, Trampolines.LogArgsTrampoline); + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); break; case XRayEntryType::CUSTOM_EVENT: - Success = patchCustomEvent(Enable, FuncId, Sled, - Trampolines.CustomEventTrampoline); + Success = patchCustomEvent(Enable, FuncId, Sled); break; case XRayEntryType::TYPED_EVENT: - Success = - patchTypedEvent(Enable, FuncId, Sled, Trampolines.TypedEventTrampoline); + Success = patchTypedEvent(Enable, FuncId, Sled); break; default: Report("Unsupported sled kind '%" PRIu64 "' @%04x\n", Sled.Address, @@ -221,9 +205,10 @@ findFunctionSleds(int32_t FuncId, return Index; } -XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, +XRayPatchingStatus patchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { - if 
(!atomic_load(&XRayInitialized, memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; @@ -235,24 +220,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch function: invalid sled map index: %d", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } // If we don't have an index, we can't patch individual functions. if (InstrMap.Functions == 0) return XRayPatchingStatus::NOT_INITIALIZED; - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Invalid function id provided: %d\n", FuncId); - return XRayPatchingStatus::NOT_INITIALIZED; - } - // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) { @@ -260,8 +234,6 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, return XRayPatchingStatus::FAILED; } - auto PackedId = __xray::MakePackedId(FuncId, ObjId); - // Now we patch ths sleds for this specific function. 
XRayFunctionSledIndex SledRange; if (InstrMap.SledsIndex) { @@ -270,13 +242,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, } else { SledRange = findFunctionSleds(FuncId, InstrMap); } - auto *f = SledRange.Begin; bool SucceedOnce = false; for (size_t i = 0; i != SledRange.Size; ++i) - SucceedOnce |= patchSled(f[i], Enable, PackedId, InstrMap.Trampolines); + SucceedOnce |= patchSled(f[i], Enable, FuncId); - atomic_store(&XRayPatching, false, memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -289,31 +261,32 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -// This function should only be called after ensuring that XRay is initialized -// and no other thread is currently patching. -XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + uint8_t PatchingSuccess = false; + auto XRayPatchingStatusResetter = + at_scope_exit([&PatchingSuccess] { + if (!PatchingSuccess) + atomic_store(&XRayPatching, false, + memory_order_release); + }); + XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch functions: invalid sled map index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; - if (Verbosity()) - Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); - - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Object is not loaded at index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - uint32_t FuncId = 1; uint64_t CurFun = 0; @@ -363,96 +336,20 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { ++FuncId; CurFun = F; } - auto PackedId = __xray::MakePackedId(FuncId, ObjId); - patchSled(Sled, Enable, PackedId, InstrMap.Trampolines); + patchSled(Sled, Enable, FuncId); } - atomic_store(&XRayPatching, false, memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); + PatchingSuccess = true; return XRayPatchingStatus::SUCCESS; } -// Controls patching for all registered objects. -// Returns: SUCCESS, if patching succeeds for all objects. -// NOT_INITIALIZED, if one or more objects returned NOT_INITIALIZED -// but none failed. -// FAILED, if patching of one or more objects failed. -XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. 
- - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, - memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - auto XRayPatchingStatusResetter = at_scope_exit( - [] { atomic_store(&XRayPatching, false, memory_order_release); }); - - unsigned NumObjects = __xray_num_objects(); - - XRayPatchingStatus CombinedStatus{NOT_INITIALIZED}; - for (unsigned I = 0; I < NumObjects; ++I) { - if (!isObjectLoaded(I)) - continue; - auto LastStatus = controlPatchingObjectUnchecked(Enable, I); - switch (LastStatus) { - case SUCCESS: - if (CombinedStatus == NOT_INITIALIZED) - CombinedStatus = SUCCESS; - break; - case FAILED: - // Report failure, but try to patch the remaining objects - CombinedStatus = FAILED; - break; - case NOT_INITIALIZED: - // XRay has been initialized but there are no sleds available for this - // object. Try to patch remaining objects. - if (CombinedStatus != FAILED) - CombinedStatus = NOT_INITIALIZED; - break; - case ONGOING: - UNREACHABLE("Status ONGOING should not appear at this point"); - default: - UNREACHABLE("Unhandled patching status"); - } - } - return CombinedStatus; -} - -// Controls patching for one object. -XRayPatchingStatus controlPatching(bool Enable, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - - if (!atomic_load(&XRayInitialized, memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, - memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. 
- - auto XRayPatchingStatusResetter = at_scope_exit( - [] { atomic_store(&XRayPatching, false, memory_order_release); }); - - return controlPatchingObjectUnchecked(Enable, ObjId); -} - -XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch function: invalid sled map index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; - } - - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Object is not loaded at index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; + InstrMap = XRayInstrMap; } // FuncId must be a positive number, less than the number of functions @@ -501,7 +398,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; } - return patchFunction(FuncId, ObjId, Enable); + return patchFunction(FuncId, Enable); } } // namespace @@ -515,10 +412,12 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -526,9 +425,11 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast(entry), 
memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -536,9 +437,11 @@ int __xray_set_customevent_handler(void (*entry)(void *, size_t)) int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedTypedEvent, - reinterpret_cast(entry), memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -571,78 +474,39 @@ XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } -XRayPatchingStatus __xray_patch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return controlPatching(true, ObjId); -} - XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { return controlPatching(false); } -XRayPatchingStatus __xray_unpatch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return controlPatching(false, ObjId); -} - XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(FuncId); - auto ObjId = Ids.first; - auto FnId = Ids.second; - return mprotectAndPatchFunction(FnId, ObjId, true); -} - -XRayPatchingStatus -__xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, ObjId, true); + return mprotectAndPatchFunction(FuncId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(FuncId); - auto ObjId = Ids.first; - auto FnId = Ids.second; - return mprotectAndPatchFunction(FnId, ObjId, false); -} - -XRayPatchingStatus -__xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, ObjId, false); + return mprotectAndPatchFunction(FuncId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, 
XRayEntryType, uint64_t)) { - if (!atomic_load(&XRayInitialized, memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. atomic_store(&XRayArgLogger, reinterpret_cast(entry), - memory_order_release); + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } -uintptr_t -__xray_function_address(int32_t CombinedFuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(CombinedFuncId); - return __xray_function_address_in_object(Ids.second, Ids.first); -} - -uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) - XRAY_NEVER_INSTRUMENT { +uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - auto count = atomic_load(&XRayNumObjects, memory_order_acquire); - if (ObjId < 0 || ObjId >= count) { - Report("Unable to determine function address: invalid sled map index %d " - "(size is %d)\n", - ObjId, (int)count); - return 0; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) @@ -661,29 +525,6 @@ uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { - return __xray_max_function_id_in_object(0); -} - -size_t __xray_max_function_id_in_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) - return 0; - return XRayInstrMaps[ObjId].Functions; -} - -size_t __xray_num_objects() XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); - return atomic_load(&XRayNumObjects, 
memory_order_acquire); -} - -int32_t __xray_unpack_function_id(int32_t PackedId) { - return __xray::UnpackId(PackedId).second; -} - -int32_t __xray_unpack_object_id(int32_t PackedId) { - return __xray::UnpackId(PackedId).first; -} - -int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId) { - return __xray::MakePackedId(FuncId, ObjId); + return XRayInstrMap.Functions; } diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index 5fbaa9c3f315b1..80c07c167f6461 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -18,18 +18,6 @@ #include "xray/xray_interface.h" #include #include -#include - -extern "C" { -// The following functions have to be defined in assembler, on a per-platform -// basis. See xray_trampoline_*.S files for implementations. -extern void __xray_FunctionEntry(); -extern void __xray_FunctionExit(); -extern void __xray_FunctionTailExit(); -extern void __xray_ArgLoggerEntry(); -extern void __xray_CustomEvent(); -extern void __xray_TypedEvent(); -} extern "C" { @@ -79,77 +67,36 @@ struct XRayFunctionSledIndex { uintptr_t(Begin)); } }; - -struct XRayTrampolines { - void (*EntryTrampoline)(); - void (*ExitTrampoline)(); - void (*TailExitTrampoline)(); - void (*LogArgsTrampoline)(); - void (*CustomEventTrampoline)(); - void (*TypedEventTrampoline)(); - - XRayTrampolines() { - // These resolve to the definitions in the respective executable or DSO. 
- EntryTrampoline = __xray_FunctionEntry; - ExitTrampoline = __xray_FunctionExit; - TailExitTrampoline = __xray_FunctionTailExit; - LogArgsTrampoline = __xray_ArgLoggerEntry; - CustomEventTrampoline = __xray_CustomEvent; - TypedEventTrampoline = __xray_TypedEvent; - } -}; - -extern int32_t __xray_register_dso(const XRaySledEntry *SledsBegin, - const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, - XRayTrampolines Trampolines); - -extern bool __xray_deregister_dso(int32_t ObjId); } namespace __xray { -constexpr uint32_t XRayNFnBits = 24; -constexpr uint32_t XRayNObjBits = 8; - -constexpr uint32_t XRayFnBitMask = 0x00FFFFFF; -constexpr uint32_t XRayObjBitMask = 0xFF000000; - -constexpr size_t XRayMaxFunctions = 1 << XRayNFnBits; -constexpr size_t XRayMaxObjects = 1 << XRayNObjBits; - -inline int32_t MakePackedId(int32_t FnId, int32_t ObjId) { - return ((ObjId << XRayNFnBits) & XRayObjBitMask) | (FnId & XRayFnBitMask); -} - -inline std::pair UnpackId(int32_t PackedId) { - uint32_t ObjId = (PackedId & XRayObjBitMask) >> XRayNFnBits; - uint32_t FnId = PackedId & XRayFnBitMask; - return {ObjId, FnId}; -} - struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; const XRayFunctionSledIndex *SledsIndex; size_t Functions; - XRayTrampolines Trampolines; - bool FromDSO; - bool Loaded; }; bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); -bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); + const 
XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} + #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 0f480547b52cc6..01098f60eeab8b 100644 --- a/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -107,16 +107,6 @@ .section __TEXT,__text #endif -.macro LOAD_HANDLER_ADDR handler -#if !defined(XRAY_PIC) - movq ASM_SYMBOL(\handler)(%rip), %rax -#else - movq ASM_SYMBOL(\handler)@GOTPCREL(%rip), %rax - movq (%rax), %rax -#endif -.endm - - //===----------------------------------------------------------------------===// .globl ASM_SYMBOL(__xray_FunctionEntry) @@ -131,7 +121,7 @@ ASM_SYMBOL(__xray_FunctionEntry): // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. 
- LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je LOCAL_LABEL(tmp0) @@ -169,7 +159,7 @@ ASM_SYMBOL(__xray_FunctionExit): movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(tmp2) @@ -205,7 +195,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): SAVE_REGISTERS ALIGN_STACK_16B - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(tmp4) @@ -234,12 +224,12 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): ALIGN_STACK_16B // Again, these function pointer loads must be atomic; MOV is fine. - LOAD_HANDLER_ADDR _ZN6__xray13XRayArgLoggerE + movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax testq %rax, %rax jne LOCAL_LABEL(arg1entryLog) // If [arg1 logging handler] not set, defer to no-arg logging. - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je LOCAL_LABEL(arg1entryFail) @@ -278,7 +268,7 @@ ASM_SYMBOL(__xray_CustomEvent): // We take two arguments to this trampoline, which should be in rdi and rsi // already. - LOAD_HANDLER_ADDR _ZN6__xray22XRayPatchedCustomEventE + movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(customEventCleanup) @@ -303,7 +293,7 @@ ASM_SYMBOL(__xray_TypedEvent): // We pass three arguments to this trampoline, which should be in rdi, rsi // and rdx without our intervention. 
- LOAD_HANDLER_ADDR _ZN6__xray21XRayPatchedTypedEventE + movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(typedEventCleanup) diff --git a/compiler-rt/lib/xray/xray_x86_64.cpp b/compiler-rt/lib/xray/xray_x86_64.cpp index 663a51b2686614..b9666a40861d48 100644 --- a/compiler-rt/lib/xray/xray_x86_64.cpp +++ b/compiler-rt/lib/xray/xray_x86_64.cpp @@ -170,8 +170,7 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -193,11 +192,11 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, // Prerequisite is to compute the relative offset fo the // __xray_FunctionExit function's address. const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(Trampoline) - + int64_t TrampolineOffset = reinterpret_cast(__xray_FunctionExit) - (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(Trampoline), + reinterpret_cast(__xray_FunctionExit), reinterpret_cast(Address)); return false; } @@ -218,16 +217,16 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, } bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the tail call sled with a similar // sequence as the entry sled, but calls the tail exit sled instead. 
const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(Trampoline) - - (static_cast(Address) + 11); + int64_t TrampolineOffset = + reinterpret_cast(__xray_FunctionTailExit) - + (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(Trampoline), + reinterpret_cast(__xray_FunctionTailExit), reinterpret_cast(Address)); return false; } @@ -248,8 +247,7 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, } bool patchCustomEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -277,8 +275,7 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, } bool patchTypedEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp deleted file mode 100644 index 31c615bd1f81bf..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Testing shared library support in basic logging mode. 
- -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s -// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s -// RUN: rm basic-mode-dso-* - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); - sleep(1); -} - -extern void instrumented_in_dso(); - -int main() { - // Explicit patching to ensure the DSO has been loaded - __xray_patch(); - instrumented_in_executable(); - // CHECK: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called -} - -//--- testlib.cpp - -#include -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} - -// ACCOUNT: funcid,count,min,median,90%ile,99%ile,max,sum,debug,function -// ACCOUNT-NEXT: 1,1,{{.*}} -// ACCOUNT-NEXT: 16777217,1,{{.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp deleted file mode 100644 index 92f3c29e970d42..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// Test that the DSO-local runtime library has been linked if -fxray-shared is passed. 
-// -// RUN: %clangxx -fxray-instrument -fxray-shared %s -shared -o %t.so -// RUN: llvm-nm %t.so | FileCheck %s --check-prefix ENABLED - -// RUN: %clangxx -fxray-instrument %s -shared -o %t.so -// RUN: llvm-nm %t.so | FileCheck %s --check-prefix DISABLED -// -// REQUIRES: target=x86_64{{.*}} - -[[clang::xray_always_instrument]] int always_instrumented() { return 42; } - -// ENABLED: __start_xray_instr_map -// DISABLED-NOT: __start_xray_instr_map diff --git a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp deleted file mode 100644 index 9db411d5ff1c6e..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Check that we can patch and un-patch DSOs loaded with dlopen. -// - -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp -o %t/main.o -// -// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlib.so 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -typedef void (*dso_func_type)(); - -int main(int argc, char **argv) { - if (argc < 2) { - printf("Shared library argument missing\n"); - // CHECK-NOT: Shared library argument missing - return 1; - } - - const char *dso_path = argv[1]; - - void *dso_handle = dlopen(dso_path, RTLD_LAZY); - if (!dso_handle) { - printf("Failed to load shared library\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - return 1; - } - return 1; - } - - dso_func_type instrumented_in_dso = - 
(dso_func_type)dlsym(dso_handle, "_Z19instrumented_in_dsov"); - if (!instrumented_in_dso) { - printf("Failed to find symbol\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - return 1; - } - return 1; - } - - __xray_set_handler(test_handler); - - instrumented_in_executable(); - // CHECK: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - - auto status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 - - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - - status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - - instrumented_in_executable(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - - dlclose(dso_handle); - - status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} diff --git a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp deleted file mode 100644 index 89da2764c35cee..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// Check that loading libraries with different modes (RTLD_LOCAL/RTLD_GLOBAL) -// and dependencies on other 
DSOs work correctly. -// - -// RUN: split-file %s %t -// -// Build shared libs with dependencies b->c and e->f -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testliba.cpp -o %t/testliba.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibc.cpp -o %t/testlibc.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibb.cpp %t/testlibc.so -o %t/testlibb.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibd.cpp -o %t/testlibd.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibf.cpp -o %t/testlibf.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibe.cpp %t/testlibf.so -o %t/testlibe.so -// -// Executable links with a and b explicitly and loads d and e at runtime. -// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testliba.so %t/testlibb.so -o %t/main.o -// -// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlibd.so %t/testlibe.so 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -[[clang::xray_never_instrument]] void test_handler(int32_t fid, - XRayEntryType type) { - printf("called: %d, object=%d, fn=%d, type=%d\n", fid, (fid >> 24) & 0xFF, - fid & 0x00FFFFFF, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -typedef void (*dso_func_type)(); - -[[clang::xray_never_instrument]] void *load_dso(const char *path, int mode) { - void *dso_handle = dlopen(path, mode); - if (!dso_handle) { - printf("failed to load shared library\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - } - return nullptr; - } - return dso_handle; -} - 
-[[clang::xray_never_instrument]] void find_and_call(void *dso_handle, - const char *fn) { - dso_func_type dso_fn = (dso_func_type)dlsym(dso_handle, fn); - if (!dso_fn) { - printf("failed to find symbol\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - } - return; - } - dso_fn(); -} - -extern void a(); -extern void b(); - -int main(int argc, char **argv) { - - if (argc < 3) { - printf("Shared library arguments missing\n"); - // CHECK-NOT: Shared library arguments missing - return 1; - } - - const char *dso_path_d = argv[1]; - const char *dso_path_e = argv[2]; - - __xray_set_handler(test_handler); - - instrumented_in_executable(); - // CHECK: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=1 - - a(); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: a called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1]], fn=1, type=1 - - // Make sure this object ID does not appear again - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ1]] - - b(); // b calls c - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: b called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: c called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ3]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ2]] - - // Now check explicit loading with RTLD_LOCAL - - void *dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_LOCAL); - void *dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_LOCAL); - // CHECK-NOT: failed to load shared library - - find_and_call(dso_handle_d, "_Z1dv"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ4:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: d called - // CHECK-NEXT: called: 
{{[0-9]+}}, object=[[OBJ4]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ4]] - - find_and_call(dso_handle_e, "_Z1ev"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: e called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: f called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ6]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ5]] - - // Unload DSOs - dlclose(dso_handle_d); - dlclose(dso_handle_e); - - // Repeat test with RTLD_GLOBAL - dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_GLOBAL); - dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_GLOBAL); - // CHECK-NOT: failed to load shared library - - find_and_call(dso_handle_d, "_Z1dv"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: d called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ7]] - - find_and_call(dso_handle_e, "_Z1ev"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: e called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: f called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ9]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ8]] - - auto status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 - - dlclose(dso_handle_d); - dlclose(dso_handle_e); -} - -//--- libgenmacro.inc -#include -// Helper macros to quickly generate libraries containing a single function. 
-#define GENERATE_LIB(NAME) \ - [[clang::xray_always_instrument]] void NAME() { printf(#NAME " called\n"); } - -#define GENERATE_LIB_WITH_CALL(NAME, FN) \ - extern void FN(); \ - [[clang::xray_always_instrument]] void NAME() { \ - printf(#NAME " called\n"); \ - FN(); \ - } - -//--- testliba.cpp -#include "libgenmacro.inc" -GENERATE_LIB(a) - -//--- testlibb.cpp -#include "libgenmacro.inc" -GENERATE_LIB_WITH_CALL(b, c) - -//--- testlibc.cpp -#include "libgenmacro.inc" -GENERATE_LIB(c) - -//--- testlibd.cpp -#include "libgenmacro.inc" -GENERATE_LIB(d) - -//--- testlibe.cpp -#include "libgenmacro.inc" -GENERATE_LIB_WITH_CALL(e, f) - -//--- testlibf.cpp -#include "libgenmacro.inc" -GENERATE_LIB(f) diff --git a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp deleted file mode 100644 index 0708d0383439d0..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Checking that DSOs are automatically patched upon load, if patch_premain is passed. 
- -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=true,verbosity=1" %run %t/main.o 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -extern void instrumented_in_dso(); - -int main() { - __xray_set_handler(test_handler); - instrumented_in_executable(); - // CHECK: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp deleted file mode 100644 index d3e992dd497725..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Check that we can patch and un-patch on demand, and that logging gets invoked -// appropriately. 
-// - -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=false" %run %t/main.o 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include - -bool called = false; - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); - called = true; -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -extern void instrumented_in_dso(); - -int main() { - __xray_set_handler(test_handler); - instrumented_in_executable(); - // CHECK: instrumented_in_executable called - instrumented_in_dso(); - // CHECK: instrumented_in_dso called - auto status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - instrumented_in_executable(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - status = __xray_unpatch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - __xray_remove_handler(); - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - status = 
__xray_unpatch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} From 14705a912f6296700cef4d2aa7eb100f71dfbd0a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 11 Oct 2024 16:16:12 +0400 Subject: [PATCH 158/177] CodeGen: Remove redundant REQUIRES registered-target from tests (#111982) These are already in target specific test directories. --- llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir | 1 - llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll | 1 - llvm/test/CodeGen/X86/tls-align.ll | 1 - 3 files changed, 3 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir index c1ddc9c14d814b..51e9ed6fef2d3a 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir +++ b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction -verify-each %s -o - | FileCheck %s -# REQUIRES: aarch64-registered-target # Verify that the register class is correctly constrained after the twoaddress replacement --- diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll index d0fd6685df3d73..cca70005b4cdc1 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature ; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -; REQUIRES: amdgpu-registered-target target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/X86/tls-align.ll b/llvm/test/CodeGen/X86/tls-align.ll index e996c00dbf1d4a..94f9b9045cf24c 100644 --- a/llvm/test/CodeGen/X86/tls-align.ll +++ b/llvm/test/CodeGen/X86/tls-align.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86-registered-target ; RUN: opt -passes=instcombine -S < %s | FileCheck %s %class.Arr = type <{ [160 x %class.Derived], i32, [4 x i8] }> From 900ea21ffb38ba5b783b20f394c43c6c89d58086 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:25:12 -0700 Subject: [PATCH 159/177] [NFC][CodingStandard] Add additional example for if-else brace rule (#111733) Add example to document that single statement `else` needs a brace if the associated `if` needs a brace. --- llvm/docs/CodingStandards.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst index 63df5af2523db6..87bbb3d127ad51 100644 --- a/llvm/docs/CodingStandards.rst +++ b/llvm/docs/CodingStandards.rst @@ -1713,6 +1713,14 @@ would help to avoid running into a "dangling else" situation. handleOtherDecl(D); } + // Use braces for the `else` block to keep it uniform with the `if` block. + if (isa(D)) { + verifyFunctionDecl(D); + handleFunctionDecl(D); + } else { + handleOtherDecl(D); + } + // This should also omit braces. The `for` loop contains only a single // statement, so it shouldn't have braces. The `if` also only contains a // single simple statement (the `for` loop), so it also should omit braces. 
From fa789dffb1e12c2aece0187aeacc48dfb1768340 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:26:03 -0700 Subject: [PATCH 160/177] [NFC] Rename `Intrinsic::getDeclaration` to `getOrInsertDeclaration` (#111752) Rename the function to reflect its correct behavior and to be consistent with `Module::getOrInsertFunction`. This is also in preparation of adding a new `Intrinsic::getDeclaration` that will have behavior similar to `Module::getFunction` (i.e, just lookup, no creation). --- clang/lib/CodeGen/CGBuiltin.cpp | 11 +- clang/lib/CodeGen/CGDecl.cpp | 8 +- clang/lib/CodeGen/CGException.cpp | 4 +- clang/lib/CodeGen/CodeGenFunction.cpp | 4 +- clang/lib/CodeGen/CodeGenModule.cpp | 4 +- clang/lib/CodeGen/Targets/SystemZ.cpp | 4 +- llvm/examples/BrainF/BrainF.cpp | 4 +- llvm/include/llvm-c/Core.h | 4 +- llvm/include/llvm/IR/IntrinsicInst.h | 6 +- llvm/include/llvm/IR/Intrinsics.h | 9 +- llvm/include/llvm/IR/MatrixBuilder.h | 8 +- llvm/lib/AsmParser/LLParser.cpp | 2 +- llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 2 +- llvm/lib/CodeGen/ExpandMemCmp.cpp | 2 +- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 14 +- llvm/lib/CodeGen/HardwareLoops.cpp | 12 +- llvm/lib/CodeGen/IntrinsicLowering.cpp | 2 +- llvm/lib/CodeGen/SafeStack.cpp | 3 +- llvm/lib/CodeGen/SjLjEHPrepare.cpp | 22 +- llvm/lib/CodeGen/StackProtector.cpp | 5 +- llvm/lib/CodeGen/WasmEHPrepare.cpp | 15 +- llvm/lib/IR/AutoUpgrade.cpp | 318 +++++++++--------- llvm/lib/IR/Core.cpp | 2 +- llvm/lib/IR/DIBuilder.cpp | 8 +- llvm/lib/IR/DebugProgramInstruction.cpp | 8 +- llvm/lib/IR/IRBuilder.cpp | 96 +++--- llvm/lib/IR/IntrinsicInst.cpp | 29 +- llvm/lib/IR/Intrinsics.cpp | 5 +- llvm/lib/IR/Module.cpp | 9 +- llvm/lib/IR/VectorBuilder.cpp | 4 +- .../Target/AArch64/AArch64ISelLowering.cpp | 29 +- .../Target/AArch64/AArch64StackTagging.cpp | 18 +- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/Target/AArch64/SMEABIPass.cpp | 14 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 24 +- 
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 15 +- .../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 2 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 17 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 13 +- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 8 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 14 +- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 11 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 4 +- .../Target/AMDGPU/SIAnnotateControlFlow.cpp | 16 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 25 +- llvm/lib/Target/ARM/ARMParallelDSP.cpp | 13 +- llvm/lib/Target/ARM/MVETailPredication.cpp | 2 +- .../Target/BPF/BPFAbstractMemberAccess.cpp | 2 +- llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 2 +- .../Target/BPF/BPFPreserveStaticOffset.cpp | 2 +- llvm/lib/Target/DirectX/DXILOpLowering.cpp | 4 +- llvm/lib/Target/Hexagon/HexagonGenExtract.cpp | 2 +- .../Target/Hexagon/HexagonISelLowering.cpp | 4 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 3 +- .../Target/Hexagon/HexagonVectorCombine.cpp | 11 +- .../LoongArch/LoongArchISelLowering.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 3 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8 +- .../Target/PowerPC/PPCLowerMASSVEntries.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 30 +- .../Target/SPIRV/SPIRVPrepareFunctions.cpp | 8 +- llvm/lib/Target/SystemZ/SystemZTDC.cpp | 4 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 2 +- .../WebAssemblyLowerRefTypesIntPtrConv.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +- .../Target/X86/X86InstCombineIntrinsic.cpp | 6 +- llvm/lib/Target/X86/X86PartialReduction.cpp | 2 +- llvm/lib/Target/X86/X86WinEHState.cpp | 16 +- .../Target/XCore/XCoreLowerThreadLocal.cpp | 4 +- .../AggressiveInstCombine.cpp | 11 +- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 5 +- llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 3 +- 
.../lib/Transforms/IPO/SampleProfileProbe.cpp | 2 +- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 12 +- .../InstCombine/InstCombineAddSub.cpp | 7 +- .../InstCombine/InstCombineAndOrXor.cpp | 14 +- .../InstCombine/InstCombineCalls.cpp | 29 +- .../InstCombine/InstCombineCasts.cpp | 15 +- .../InstCombine/InstCombineCompares.cpp | 18 +- .../InstCombine/InstCombineSelect.cpp | 19 +- .../InstCombineSimplifyDemanded.cpp | 2 +- .../InstCombine/InstCombineVectorOps.cpp | 8 +- .../InstCombine/InstructionCombining.cpp | 6 +- .../Instrumentation/AddressSanitizer.cpp | 4 +- .../Instrumentation/BoundsChecking.cpp | 2 +- .../Instrumentation/HWAddressSanitizer.cpp | 4 +- llvm/lib/Transforms/Instrumentation/KCFI.cpp | 3 +- .../Instrumentation/MemorySanitizer.cpp | 6 +- .../Instrumentation/PGOInstrumentation.cpp | 23 +- .../Instrumentation/SanitizerCoverage.cpp | 2 +- .../Instrumentation/ThreadSanitizer.cpp | 7 +- .../ObjCARC/ARCRuntimeEntryPoints.h | 2 +- .../Transforms/Scalar/InferAddressSpaces.cpp | 14 +- .../Transforms/Scalar/LoopDataPrefetch.cpp | 2 +- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 4 +- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 4 +- .../Transforms/Scalar/LowerGuardIntrinsic.cpp | 2 +- .../Scalar/LowerMatrixIntrinsics.cpp | 2 +- .../Transforms/Scalar/MakeGuardsExplicit.cpp | 2 +- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 4 +- .../Scalar/RewriteStatepointsForGC.cpp | 4 +- llvm/lib/Transforms/Scalar/Scalarizer.cpp | 5 +- .../Transforms/Utils/AssumeBundleBuilder.cpp | 3 +- llvm/lib/Transforms/Utils/CloneFunction.cpp | 4 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 3 +- .../Utils/EntryExitInstrumenter.cpp | 2 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 7 +- llvm/lib/Transforms/Utils/IntegerDivision.cpp | 4 +- llvm/lib/Transforms/Utils/Local.cpp | 3 +- .../lib/Transforms/Utils/LowerGlobalDtors.cpp | 4 +- .../Transforms/Utils/MemoryTaggingSupport.cpp | 6 +- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 4 +- .../Utils/PromoteMemoryToRegister.cpp | 
2 +- .../Utils/RelLookupTableConverter.cpp | 2 +- .../Utils/ScalarEvolutionExpander.cpp | 4 +- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- .../llvm-reduce/deltas/ReduceOpcodes.cpp | 2 +- .../Analysis/AssumeBundleQueriesTest.cpp | 3 +- llvm/unittests/Analysis/MemorySSATest.cpp | 2 +- llvm/unittests/Analysis/ValueTrackingTest.cpp | 4 +- llvm/unittests/IR/BasicBlockTest.cpp | 8 +- llvm/unittests/IR/DebugInfoTest.cpp | 3 +- llvm/unittests/IR/IRBuilderTest.cpp | 5 +- llvm/unittests/IR/IntrinsicsTest.cpp | 2 +- llvm/unittests/IR/PatternMatch.cpp | 2 +- llvm/unittests/IR/VPIntrinsicTest.cpp | 4 +- .../Transforms/Vectorize/VPlanTest.cpp | 3 +- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 6 +- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 5 +- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 7 +- polly/lib/CodeGen/IslExprBuilder.cpp | 12 +- polly/lib/CodeGen/PerfMonitor.cpp | 2 +- 137 files changed, 721 insertions(+), 642 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ff678ee04f9c2a..059c75fae284dd 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13648,7 +13648,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue()); // Built the IR for the preserve_field_info intrinsic. 
- llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration( + llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info, {FieldAddr->getType()}); return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind}); @@ -13670,10 +13670,10 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, llvm::Function *FnDecl; if (BuiltinID == BPF::BI__builtin_btf_type_id) - FnDecl = llvm::Intrinsic::getDeclaration( + FnDecl = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {}); else - FnDecl = llvm::Intrinsic::getDeclaration( + FnDecl = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {}); CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue}); Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); @@ -13708,7 +13708,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue()); Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++); - llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration( + llvm::Function *IntrinsicFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {}); CallInst *Fn = Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue}); @@ -18895,7 +18895,8 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); - return EmitRuntimeCall(Intrinsic::getDeclaration(&CGM.getModule(), ID)); + return EmitRuntimeCall( + Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 563f728e29d781..30af9268b30e2e 100644 --- 
a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2509,8 +2509,8 @@ void CodeGenFunction::pushRegularPartialArrayCleanup(llvm::Value *arrayBegin, llvm::Function *CodeGenModule::getLLVMLifetimeStartFn() { if (LifetimeStartFn) return LifetimeStartFn; - LifetimeStartFn = llvm::Intrinsic::getDeclaration(&getModule(), - llvm::Intrinsic::lifetime_start, AllocaInt8PtrTy); + LifetimeStartFn = llvm::Intrinsic::getOrInsertDeclaration( + &getModule(), llvm::Intrinsic::lifetime_start, AllocaInt8PtrTy); return LifetimeStartFn; } @@ -2518,8 +2518,8 @@ llvm::Function *CodeGenModule::getLLVMLifetimeStartFn() { llvm::Function *CodeGenModule::getLLVMLifetimeEndFn() { if (LifetimeEndFn) return LifetimeEndFn; - LifetimeEndFn = llvm::Intrinsic::getDeclaration(&getModule(), - llvm::Intrinsic::lifetime_end, AllocaInt8PtrTy); + LifetimeEndFn = llvm::Intrinsic::getOrInsertDeclaration( + &getModule(), llvm::Intrinsic::lifetime_end, AllocaInt8PtrTy); return LifetimeEndFn; } diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index bb2ed237ee9f35..44a45413dbc45a 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -1843,7 +1843,7 @@ Address CodeGenFunction::recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF, std::make_pair(ParentAlloca, ParentCGF.EscapedLocals.size())); int FrameEscapeIdx = InsertPair.first->second; // call ptr @llvm.localrecover(ptr @parentFn, ptr %fp, i32 N) - llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameRecoverFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localrecover); RecoverCall = Builder.CreateCall( FrameRecoverFn, {ParentCGF.CurFn, ParentFP, @@ -1942,7 +1942,7 @@ void CodeGenFunction::EmitCapturedLocals(CodeGenFunction &ParentCGF, // %1 = call ptr @llvm.localrecover(@"?fin$0@0@main@@",..) 
// %2 = load ptr, ptr %1, align 8 // ==> %2 is the frame-pointer of outermost host function - llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameRecoverFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localrecover); ParentFP = Builder.CreateCall( FrameRecoverFn, {ParentCGF.CurFn, ParentFP, diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index e1fd9b72b8d7b2..f3023c7a20c405 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -463,7 +463,7 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { EscapeArgs.resize(EscapedLocals.size()); for (auto &Pair : EscapedLocals) EscapeArgs[Pair.second] = Pair.first; - llvm::Function *FrameEscapeFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameEscapeFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localescape); CGBuilderTy(*this, AllocaInsertPt).CreateCall(FrameEscapeFn, EscapeArgs); } @@ -3130,7 +3130,7 @@ void CodeGenFunction::emitAlignmentAssumptionCheck( llvm::Instruction *Assumption) { assert(isa_and_nonnull(Assumption) && cast(Assumption)->getCalledOperand() == - llvm::Intrinsic::getDeclaration( + llvm::Intrinsic::getOrInsertDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), llvm::Intrinsic::assume) && "Assumption should be a call to llvm.assume()."); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 5ba098144a74e7..7a7dea4668ad09 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -6218,8 +6218,8 @@ void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) { llvm::Function *CodeGenModule::getIntrinsic(unsigned IID, ArrayRef Tys) { - return llvm::Intrinsic::getDeclaration(&getModule(), (llvm::Intrinsic::ID)IID, - Tys); + return llvm::Intrinsic::getOrInsertDeclaration(&getModule(), + (llvm::Intrinsic::ID)IID, Tys); } 
static llvm::StringMapEntry & diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index 56129622f48dbd..23c96fa5cf98cb 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -110,8 +110,8 @@ class SystemZTargetCodeGenInfo : public TargetCodeGenInfo { if (Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isFP128Ty()) { llvm::Module &M = CGM.getModule(); auto &Ctx = M.getContext(); - llvm::Function *TDCFunc = - llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::s390_tdc, Ty); + llvm::Function *TDCFunc = llvm::Intrinsic::getOrInsertDeclaration( + &M, llvm::Intrinsic::s390_tdc, Ty); unsigned TDCBits = 0; switch (BuiltinID) { case Builtin::BI__builtin_isnan: diff --git a/llvm/examples/BrainF/BrainF.cpp b/llvm/examples/BrainF/BrainF.cpp index ac01961735e137..e62cc7bd591a3f 100644 --- a/llvm/examples/BrainF/BrainF.cpp +++ b/llvm/examples/BrainF/BrainF.cpp @@ -67,8 +67,8 @@ void BrainF::header(LLVMContext& C) { //declare void @llvm.memset.p0i8.i32(i8 *, i8, i32, i1) Type *Tys[] = {PointerType::getUnqual(C), Type::getInt32Ty(C)}; - Function *memset_func = Intrinsic::getDeclaration(module, Intrinsic::memset, - Tys); + Function *memset_func = + Intrinsic::getOrInsertDeclaration(module, Intrinsic::memset, Tys); //declare i32 @getchar() getchar_func = diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 28dc270ca368d2..55649d89a6b8f4 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -2807,10 +2807,10 @@ unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen); unsigned LLVMGetIntrinsicID(LLVMValueRef Fn); /** - * Create or insert the declaration of an intrinsic. For overloaded intrinsics, + * Get or insert the declaration of an intrinsic. For overloaded intrinsics, * parameter types must be provided to uniquely identify an overload. 
* - * @see llvm::Intrinsic::getDeclaration() + * @see llvm::Intrinsic::getOrInsertDeclaration() */ LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, unsigned ID, diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 4458126ffa759d..920eed01374c83 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -568,9 +568,9 @@ class VPIntrinsic : public IntrinsicInst { /// \brief Declares a llvm.vp.* intrinsic in \p M that matches the parameters /// \p Params. Additionally, the load and gather intrinsics require /// \p ReturnType to be specified. - static Function *getDeclarationForParams(Module *M, Intrinsic::ID, - Type *ReturnType, - ArrayRef Params); + static Function *getOrInsertDeclarationForParams(Module *M, Intrinsic::ID, + Type *ReturnType, + ArrayRef Params); static std::optional getMaskParamPos(Intrinsic::ID IntrinsicID); static std::optional getVectorLengthParamPos( diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index b251036247c5c0..8c37925732a83a 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -87,14 +87,15 @@ namespace Intrinsic { /// Return the attributes for an intrinsic. AttributeList getAttributes(LLVMContext &C, ID id); - /// Create or insert an LLVM Function declaration for an intrinsic, and return - /// it. + /// Look up the Function declaration of the intrinsic \p id in the Module + /// \p M. If it does not exist, add a declaration and return it. Otherwise, + /// return the existing declaration. /// - /// The Tys parameter is for intrinsics with overloaded types (e.g., those + /// The \p Tys parameter is for intrinsics with overloaded types (e.g., those /// using iAny, fAny, vAny, or iPTRAny). For a declaration of an overloaded /// intrinsic, Tys must provide exactly one type for each overloaded type in /// the intrinsic. 
- Function *getDeclaration(Module *M, ID id, ArrayRef Tys = {}); + Function *getOrInsertDeclaration(Module *M, ID id, ArrayRef Tys = {}); /// Looks up Name in NameTable via binary search. NameTable must be sorted /// and all entries must start with "llvm.". If NameTable contains an exact diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index dbf2cfb7c5e966..3a04ca87f2b558 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -72,7 +72,7 @@ class MatrixBuilder { B.getInt32(Columns)}; Type *OverloadedTypes[] = {RetType, Stride->getType()}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes); CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -95,7 +95,7 @@ class MatrixBuilder { B.getInt32(Rows), B.getInt32(Columns)}; Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes); CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -115,7 +115,7 @@ class MatrixBuilder { Type *OverloadedTypes[] = {ReturnType}; Value *Ops[] = {Matrix, B.getInt32(Rows), B.getInt32(Columns)}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_transpose, OverloadedTypes); return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -136,7 +136,7 @@ class MatrixBuilder { B.getInt32(RHSColumns)}; Type *OverloadedTypes[] = {ReturnType, LHSType, RHSType}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_multiply, OverloadedTypes); return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); } diff --git 
a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index c3b4a8235ce637..5b9bddeb7cfe82 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -360,7 +360,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { OverloadTys)) return error(Info.second, "invalid intrinsic signature"); - U.set(Intrinsic::getDeclaration(M, IID, OverloadTys)); + U.set(Intrinsic::getOrInsertDeclaration(M, IID, OverloadTys)); } Info.first->eraseFromParent(); diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp index 11f123aa5bed85..0a3d0cf8ec9300 100644 --- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp +++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp @@ -356,7 +356,7 @@ static void expandIToFP(Instruction *IToFP) { Entry->getTerminator()->eraseFromParent(); Function *CTLZ = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, IntTy); + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, IntTy); ConstantInt *True = Builder.getTrue(); // entry: diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 04222d5b4afd4c..6d626de0b4e635 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -355,7 +355,7 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType, // Swap bytes if required. 
if (BSwapSizeType) { - Function *Bswap = Intrinsic::getDeclaration( + Function *Bswap = Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::bswap, BSwapSizeType); Lhs = Builder.CreateCall(Bswap, Lhs); Rhs = Builder.CreateCall(Bswap, Rhs); diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index ffe879ff049648..32ba3e91822ddb 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -237,7 +237,7 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, if (ElemCount.isScalable()) { auto *M = Builder.GetInsertBlock()->getModule(); Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount); - Function *ActiveMaskFunc = Intrinsic::getDeclaration( + Function *ActiveMaskFunc = Intrinsic::getOrInsertDeclaration( M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()}); // `get_active_lane_mask` performs an implicit less-than comparison. Value *ConstZero = Builder.getInt32(0); @@ -299,7 +299,7 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::umin: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -308,7 +308,7 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::bswap: case Intrinsic::bitreverse: { Value *Op = VPI.getOperand(0); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -327,7 +327,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::fabs: case Intrinsic::sqrt: { Value *Op0 = 
VPI.getOperand(0); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -337,7 +337,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::minnum: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -350,7 +350,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); Value *Op2 = VPI.getOperand(2); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp; if (Intrinsic::isConstrainedFPIntrinsic(UnpredicatedIntrinsicID)) @@ -594,7 +594,7 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { // TODO add caching auto *M = VPI.getModule(); Function *VScaleFunc = - Intrinsic::getDeclaration(M, Intrinsic::vscale, Int32Ty); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, Int32Ty); IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale"); diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index 9205eabcf5684e..c8a63304a3b63b 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -512,7 +512,7 @@ Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { : Intrinsic::test_set_loop_iterations) : (UsePhi ? 
Intrinsic::start_loop_iterations : Intrinsic::set_loop_iterations); - Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty); + Function *LoopIter = Intrinsic::getOrInsertDeclaration(M, ID, Ty); Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit); // Use the return value of the intrinsic to control the entry of the loop. @@ -541,9 +541,8 @@ void HardwareLoop::InsertLoopDec() { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::loop_decrement, - LoopDecrement->getType()); + Function *DecFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::loop_decrement, LoopDecrement->getType()); Value *Ops[] = { LoopDecrement }; Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops); Value *OldCond = ExitBranch->getCondition(); @@ -566,9 +565,8 @@ Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg, - { EltsRem->getType() }); + Function *DecFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::loop_decrement_reg, {EltsRem->getType()}); Value *Ops[] = { EltsRem, LoopDecrement }; Value *Call = CondBuilder.CreateCall(DecFunc, Ops); diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp index 256c081b46e262..f799a8cfc1ba7e 100644 --- a/llvm/lib/CodeGen/IntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp @@ -474,7 +474,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) { // Okay, we can do this xform, do so now. 
Module *M = CI->getModule(); - Function *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty); + Function *Int = Intrinsic::getOrInsertDeclaration(M, Intrinsic::bswap, Ty); Value *Op = CI->getArgOperand(0); Op = CallInst::Create(Int, Op, CI->getName(), CI->getIterator()); diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index e41d1bfb0e530d..a50909af8bfcfb 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -368,7 +368,8 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { if (!StackGuardVar) { TL.insertSSPDeclarations(*M); - return IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard)); + return IRB.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); } return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard"); diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 054f7d7215962e..c4ad9f0b2172fc 100644 --- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -508,17 +508,19 @@ bool SjLjEHPrepareImpl::runOnFunction(Function &F) { PointerType *AllocaPtrTy = M.getDataLayout().getAllocaPtrType(M.getContext()); - FrameAddrFn = - Intrinsic::getDeclaration(&M, Intrinsic::frameaddress, {AllocaPtrTy}); - StackAddrFn = - Intrinsic::getDeclaration(&M, Intrinsic::stacksave, {AllocaPtrTy}); - StackRestoreFn = - Intrinsic::getDeclaration(&M, Intrinsic::stackrestore, {AllocaPtrTy}); + FrameAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::frameaddress, + {AllocaPtrTy}); + StackAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::stacksave, + {AllocaPtrTy}); + StackRestoreFn = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::stackrestore, {AllocaPtrTy}); BuiltinSetupDispatchFn = - Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); - LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); - CallSiteFn = Intrinsic::getDeclaration(&M, 
Intrinsic::eh_sjlj_callsite); - FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); + LSDAAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_lsda); + CallSiteFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_callsite); + FuncCtxFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); bool Res = setupEntryBlockAndCallSites(F); return Res; diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 1f23838b2de0ca..a192161bbd9481 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -519,7 +519,8 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M, if (SupportsSelectionDAGSP) *SupportsSelectionDAGSP = true; TLI->insertSSPDeclarations(*M); - return B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard)); + return B.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); } /// Insert code into the entry block that stores the stack guard @@ -540,7 +541,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc, AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot"); Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP); - B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), + B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector), {GuardSlot, AI}); return SupportsSelectionDAGSP; } diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 7514d49fb18a98..1701b0d04425d2 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -196,7 +196,7 @@ bool WasmEHPrepareImpl::prepareThrows(Function &F) { bool Changed = false; // wasm.throw() intinsic, which will be lowered to wasm 'throw' instruction. 
- ThrowF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_throw); + ThrowF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_throw); // Insert an unreachable instruction after a call to @llvm.wasm.throw and // delete all following instructions within the BB, and delete all the dead // children of the BB as well. @@ -260,18 +260,21 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { 0, 2, "selector_gep"); // wasm.landingpad.index() intrinsic, which is to specify landingpad index - LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index); + LPadIndexF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_landingpad_index); // wasm.lsda() intrinsic. Returns the address of LSDA table for the current // function. - LSDAF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_lsda); + LSDAF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_lsda); // wasm.get.exception() and wasm.get.ehselector() intrinsics. Calls to these // are generated in clang. - GetExnF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_exception); - GetSelectorF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_ehselector); + GetExnF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_get_exception); + GetSelectorF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_get_ehselector); // wasm.catch() will be lowered down to wasm 'catch' instruction in // instruction selection. 
- CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch); + CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); // _Unwind_CallPersonality() wrapper function, which calls the personality CallPersonalityF = M.getOrInsertFunction("_Unwind_CallPersonality", diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 215bfc8c6cfe3e..477b77a6dd5335 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -65,7 +65,7 @@ static bool upgradePTESTIntrinsic(Function *F, Intrinsic::ID IID, // Yes, it's old, replace it with new version. rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -81,7 +81,7 @@ static bool upgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, // Move this function aside and map down. rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -94,7 +94,7 @@ static bool upgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -104,7 +104,7 @@ static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -114,7 +114,7 @@ static bool upgradeX86BF16DPIntrinsic(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -502,8 +502,8 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::x86_rdtscp); + NewFn = 
Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::x86_rdtscp); return true; } @@ -609,14 +609,15 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other 'x86.xop.*' } if (Name == "seh.recoverfp") { - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_recoverfp); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::eh_recoverfp); return true; } @@ -630,15 +631,15 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, Function *&NewFn) { if (Name.starts_with("rbit")) { // '(arm|aarch64).rbit'. - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::bitreverse, F->arg_begin()->getType()); return true; } if (Name == "thread.pointer") { // '(arm|aarch64).thread.pointer'. - NewFn = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::thread_pointer); return true; } @@ -663,7 +664,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, std::array Tys{ {F->getReturnType(), FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)}}; - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return true; } return false; // No other '(arm|aarch64).neon.bfdot.*'. 
@@ -688,7 +689,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalt) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other '(arm|aarch64).neon.bfm*.v16i8'. @@ -712,8 +713,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .StartsWith("vqsubu.", Intrinsic::usub_sat) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } @@ -733,10 +734,10 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, auto fArgs = F->getFunctionType()->params(); Type *Tys[] = {fArgs[0], fArgs[1]}; if (Groups[1].size() == 1) - NewFn = Intrinsic::getDeclaration(F->getParent(), - StoreInts[fArgs.size() - 3], Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), StoreInts[fArgs.size() - 3], Tys); else - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), StoreLaneInts[fArgs.size() - 5], Tys); return true; } @@ -810,8 +811,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .StartsWith("rbit", Intrinsic::bitreverse) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } @@ -821,8 +822,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // Invalid IR. 
VectorType *Ty = dyn_cast(F->getReturnType()); if (Ty && Ty->getElementType()->isFloatingPointTy()) { - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::aarch64_neon_faddp, Ty); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_neon_faddp, Ty); return true; } } @@ -840,7 +841,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .Case("mlalt", Intrinsic::aarch64_sve_bfmlalt_lane_v2) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other 'aarch64.sve.bf*.lane'. @@ -861,8 +862,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, auto Args = F->getFunctionType()->params(); Type *Tys[] = {F->getReturnType(), Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::aarch64_sve_faddqv, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_sve_faddqv, Tys); return true; } @@ -880,8 +881,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, Intrinsic::aarch64_sve_ld3_sret, Intrinsic::aarch64_sve_ld4_sret, }; - NewFn = Intrinsic::getDeclaration(F->getParent(), - LoadIDs[Name[0] - '2'], Ty); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + LoadIDs[Name[0] - '2'], Ty); return true; } return false; // No other 'aarch64.sve.ld*'. @@ -892,8 +893,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, if (Name.starts_with("get")) { // 'aarch64.sve.tuple.get*'. 
Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_extract, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_extract, Tys); return true; } @@ -901,8 +902,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // 'aarch64.sve.tuple.set*'. auto Args = F->getFunctionType()->params(); Type *Tys[] = {Args[0], Args[2], Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_insert, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_insert, Tys); return true; } @@ -911,8 +912,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // 'aarch64.sve.tuple.create*'. auto Args = F->getFunctionType()->params(); Type *Tys[] = {F->getReturnType(), Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_insert, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_insert, Tys); return true; } return false; // No other 'aarch64.sve.tuple.*'. 
@@ -1026,8 +1027,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("amdgcn.")) { if (Name == "alignbit") { // Target specific intrinsic became redundant - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr, - {F->getReturnType()}); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::fshr, {F->getReturnType()}); return true; } @@ -1056,9 +1057,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.starts_with("ldexp.")) { // Target specific intrinsic became redundant - NewFn = Intrinsic::getDeclaration( - F->getParent(), Intrinsic::ldexp, - {F->getReturnType(), F->getArg(1)->getType()}); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::ldexp, + {F->getReturnType(), F->getArg(1)->getType()}); return true; } break; // No other 'amdgcn.*' @@ -1074,15 +1075,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } } if (F->arg_size() == 2 && Name == "coro.end") { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::coro_end); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::coro_end); return true; } @@ -1105,7 +1107,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // converted to DbgVariableRecords later. if (Name == "addr" || (Name == "value" && F->arg_size() == 4)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::dbg_value); return true; } break; // No other 'dbg.*'. 
@@ -1135,7 +1138,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // Inserting overloads the inserted type. Tys.push_back(FT->getParamType(1)); rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return true; } @@ -1171,8 +1174,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { rename(F); auto Args = F->getFunctionType()->params(); - NewFn = - Intrinsic::getDeclaration(F->getParent(), ID, {Args[V2 ? 1 : 0]}); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + {Args[V2 ? 1 : 0]}); return true; } break; // No other 'expermental.vector.reduce.*'. @@ -1182,15 +1185,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("experimental.stepvector.")) { Intrinsic::ID ID = Intrinsic::stepvector; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->getFunctionType()->getReturnType()); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), ID, F->getFunctionType()->getReturnType()); return true; } break; // No other 'e*'. 
case 'f': if (Name.starts_with("flt.rounds")) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::get_rounding); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::get_rounding); return true; } break; @@ -1200,8 +1204,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, auto Args = F->getFunctionType()->params(); Type* ObjectPtr[1] = {Args[0]}; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::launder_invariant_group, ObjectPtr); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::launder_invariant_group, ObjectPtr); return true; } break; @@ -1218,7 +1222,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // Get the types of dest, src, and len ArrayRef ParamTypes = F->getFunctionType()->params().slice(0, 3); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ParamTypes); + NewFn = + Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ParamTypes); return true; } } @@ -1230,8 +1235,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, FT->getParamType(0), // Dest FT->getParamType(2) // len }; - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::memset, - ParamTypes); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::memset, ParamTypes); return true; } break; @@ -1247,8 +1252,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.i", Intrinsic::ctpop) .Default(Intrinsic::not_intrinsic); if (IID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), IID, - {F->getReturnType()}); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + {F->getReturnType()}); return true; } } @@ -1316,8 +1321,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys, F->getParent())) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), 
Intrinsic::objectsize, - Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::objectsize, Tys); return true; } } @@ -1326,7 +1331,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'p': if (Name.starts_with("ptr.annotation.") && F->arg_size() == 4) { rename(F); - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::ptr_annotation, {F->arg_begin()->getType(), F->getArg(1)->getType()}); return true; @@ -1345,7 +1350,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { if (!F->getFunctionType()->getParamType(2)->isIntegerTy(32)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. @@ -1359,7 +1364,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (!F->getFunctionType()->getParamType(2)->isIntegerTy(32) || F->getFunctionType()->getReturnType()->isIntegerTy(64)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. @@ -1376,7 +1381,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { if (F->getFunctionType()->getReturnType()->isIntegerTy(64)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. 
@@ -1395,7 +1400,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'v': { if (Name == "var.annotation" && F->arg_size() == 4) { rename(F); - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::var_annotation, {{F->arg_begin()->getType(), F->getArg(1)->getType()}}); return true; @@ -1413,8 +1418,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = - Intrinsic::getDeclaration(F->getParent(), ID, F->getReturnType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->getReturnType()); return true; } @@ -1426,7 +1431,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other 'wasm.dot.i8x16.i7x16.*'. @@ -1740,8 +1745,8 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, if (!IndexForm) std::swap(Args[0], Args[1]); - Value *V = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), - Args); + Value *V = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); @@ -1753,7 +1758,7 @@ static Value *upgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); Value *Op1 = CI.getOperand(1); - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Op0, Op1}); if (CI.arg_size() == 4) { // For masked intrinsics. 
@@ -1780,7 +1785,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Src, Src, Amt}); if (CI.arg_size() == 4) { // For masked intrinsics. @@ -1850,7 +1855,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsShiftRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Op0, Op1, Amt}); unsigned NumArgs = CI.arg_size(); @@ -1911,7 +1916,8 @@ static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr, static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); - Function *F = Intrinsic::getDeclaration(CI.getModule(), Intrinsic::abs, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(CI.getModule(), Intrinsic::abs, Ty); Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)}); if (CI.arg_size() == 3) Res = emitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); @@ -2004,7 +2010,7 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, // Replace a masked intrinsic with an older unmasked intrinsic. 
static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID); Value *Rep = Builder.CreateCall(Intrin, { CI.getArgOperand(0), CI.getArgOperand(1) }); return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); @@ -2263,8 +2269,8 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, SmallVector Args(CI.args()); Args.pop_back(); Args.pop_back(); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); unsigned NumArgs = CI.arg_size(); Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); @@ -2320,8 +2326,8 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. Value *Arg = CI->getArgOperand(0); Value *Ctlz = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, + {Arg->getType()}), {Arg, Builder.getFalse()}, "ctlz"); Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); } else if (Name == "popc.ll") { @@ -2329,15 +2335,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, // i64. 
Value *Arg = CI->getArgOperand(0); Value *Popc = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop, + {Arg->getType()}), Arg, "ctpop"); Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (Name == "h2f") { - Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); + Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}), + CI->getArgOperand(0), "h2f"); } else if (Name.consume_front("bitcast.") && (Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll")) { @@ -2373,7 +2379,7 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, if (IID != Intrinsic::not_intrinsic && !F->getReturnType()->getScalarType()->isBFloatTy()) { rename(F); - Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + Function *NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); SmallVector Args; for (size_t I = 0; I < NewFn->arg_size(); ++I) { Value *Arg = CI->getArgOperand(I); @@ -2480,15 +2486,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd") { Value *Vec = CI->getArgOperand(0); Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); - Function *Intr = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt, - Elt0->getType()); + Function *Intr = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::sqrt, Elt0->getType()); Elt0 = Builder.CreateCall(Intr, Elt0); Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); } else if (Name.starts_with("avx.sqrt.p") || Name.starts_with("sse2.sqrt.p") || Name.starts_with("sse.sqrt.p")) { Rep = - Builder.CreateCall(Intrinsic::getDeclaration( + 
Builder.CreateCall(Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::sqrt, CI->getType()), {CI->getArgOperand(0)}); } else if (Name.starts_with("avx512.mask.sqrt.p")) { @@ -2499,13 +2505,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, : Intrinsic::x86_avx512_sqrt_pd_512; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); } else { - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), - Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt, + CI->getType()), + {CI->getArgOperand(0)}); } Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); @@ -2629,8 +2635,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, break; } - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); @@ -2652,8 +2659,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else llvm_unreachable("Unexpected intrinsic"); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.cmp.p")) { SmallVector 
Args(CI->args()); @@ -2681,8 +2689,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, std::swap(Mask, Args.back()); Args.push_back(Mask); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args); } else if (Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -2776,8 +2784,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, cast(CI->getArgOperand(3))->getZExtValue() != 4)) { Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round : Intrinsic::x86_avx512_sitofp_round; - Function *F = - Intrinsic::getDeclaration(CI->getModule(), IID, {DstTy, SrcTy}); + Function *F = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID, + {DstTy, SrcTy}); Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)}); } else { Rep = IsUnsigned ? 
Builder.CreateUIToFP(Rep, DstTy, "cvt") @@ -2819,7 +2827,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); - Function *ELd = Intrinsic::getDeclaration( + Function *ELd = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::masked_expandload, ResultTy); Rep = Builder.CreateCall(ELd, {Ptr, MaskVec, CI->getOperand(1)}); } else if (Name.starts_with("avx512.mask.compress.store.")) { @@ -2834,7 +2842,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, getX86MaskVec(Builder, CI->getArgOperand(2), cast(ResultTy)->getNumElements()); - Function *CSt = Intrinsic::getDeclaration( + Function *CSt = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::masked_compressstore, ResultTy); Rep = Builder.CreateCall(CSt, {CI->getArgOperand(1), Ptr, MaskVec}); } else if (Name.starts_with("avx512.mask.compress.") || @@ -2847,7 +2855,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool IsCompress = Name[12] == 'c'; Intrinsic::ID IID = IsCompress ? 
Intrinsic::x86_avx512_mask_compress : Intrinsic::x86_avx512_mask_expand; - Function *Intr = Intrinsic::getDeclaration(F->getParent(), IID, ResultTy); + Function *Intr = + Intrinsic::getOrInsertDeclaration(F->getParent(), IID, ResultTy); Rep = Builder.CreateCall(Intr, {CI->getOperand(0), CI->getOperand(1), MaskVec}); } else if (Name.starts_with("xop.vpcom")) { @@ -2910,7 +2919,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); } else if (Name == "sse42.crc32.64.8") { - Function *CRC32 = Intrinsic::getDeclaration( + Function *CRC32 = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::x86_sse42_crc32_32_8); Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); @@ -3405,7 +3414,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_add_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3421,7 +3430,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_div_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3437,7 +3446,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_mul_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), 
CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3453,7 +3462,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_sub_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3471,13 +3480,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.lzcnt.")) { Rep = - Builder.CreateCall(Intrinsic::getDeclaration( + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::ctlz, CI->getType()), {CI->getArgOperand(0), Builder.getInt1(false)}); Rep = @@ -3723,10 +3732,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (NegAcc) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()), + Ops); if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); @@ -3738,10 +3747,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Rep = 
Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()), + Ops); Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); @@ -3781,11 +3790,11 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_vfmadd_f64; else IID = Intrinsic::x86_avx512_vfmadd_f32; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID); + Function *FMA = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID); Rep = Builder.CreateCall(FMA, Ops); } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - A->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, A->getType()); Rep = Builder.CreateCall(FMA, {A, B, C}); } @@ -3837,11 +3846,12 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {A, B, C, CI->getArgOperand(4)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {A, B, C, CI->getArgOperand(4)}); } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - A->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, A->getType()); Rep = Builder.CreateCall(FMA, {A, B, C}); } @@ -3868,8 +3878,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = - Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); } else if 
(Name.starts_with("avx512.mask.vfmaddsub.p") || Name.starts_with("avx512.mask3.vfmaddsub.p") || Name.starts_with("avx512.maskz.vfmaddsub.p") || @@ -3892,16 +3902,16 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (IsSubAdd) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); } else { int NumElts = cast(CI->getType())->getNumElements(); Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, Ops[0]->getType()); Value *Odd = Builder.CreateCall(FMA, Ops); Ops[2] = Builder.CreateFNeg(Ops[2]); Value *Even = Builder.CreateCall(FMA, Ops); @@ -3944,8 +3954,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); @@ -3972,8 +3982,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4008,8 +4018,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4038,8 +4048,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4062,7 +4072,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Value *NewCall = Builder.CreateCall( - Intrinsic::getDeclaration(CI->getModule(), IID), Args); + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); // Extract the second result and store it. 
Value *Data = Builder.CreateExtractValue(NewCall, 1); @@ -4108,7 +4118,7 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); - Function *NewF = Intrinsic::getDeclaration(CI->getModule(), NewID); + Function *NewF = Intrinsic::getOrInsertDeclaration(CI->getModule(), NewID); return Builder.CreateCall(NewF, Args, CI->getName()); } @@ -4117,16 +4127,17 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the // correct type. - Value *VCTP = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::arm_mve_vctp64), - CI->getArgOperand(0), CI->getName()); + Value *VCTP = + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::arm_mve_vctp64), + CI->getArgOperand(0), CI->getName()); Value *C1 = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_v2i, {VectorType::get(Builder.getInt1Ty(), 2, false)}), VCTP); return Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_i2v, {VectorType::get(Builder.getInt1Ty(), 4, false)}), C1); @@ -4188,19 +4199,19 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, Type *Ty = Op->getType(); if (Ty->getScalarSizeInBits() == 1) { Value *C1 = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_v2i, {VectorType::get(Builder.getInt1Ty(), 4, false)}), Op); Op = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), - Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), + Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), C1); } Ops.push_back(Op); } - Function *Fn 
= Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Function *Fn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return Builder.CreateCall(Fn, Ops, CI->getName()); } llvm_unreachable("Unknown function for ARM CallBase upgrade."); @@ -5088,7 +5099,8 @@ void llvm::UpgradeARCRuntime(Module &M) { if (!Fn) return; - Function *NewFn = llvm::Intrinsic::getDeclaration(&M, IntrinsicFunc); + Function *NewFn = + llvm::Intrinsic::getOrInsertDeclaration(&M, IntrinsicFunc); for (User *U : make_early_inc_range(Fn->users())) { CallInst *CI = dyn_cast(U); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index ee084e870263d0..1cf998c6850068 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2468,7 +2468,7 @@ LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, size_t ParamCount) { ArrayRef Tys(unwrap(ParamTypes), ParamCount); auto IID = llvm_map_to_intrinsic_id(ID); - return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys)); + return wrap(llvm::Intrinsic::getOrInsertDeclaration(unwrap(Mod), IID, Tys)); } const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) { diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 0db82cdd6373c8..447a9d65174636 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -991,7 +991,7 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LLVMContext &Ctx = LinkedInstr->getContext(); Module *M = LinkedInstr->getModule(); if (!AssignFn) - AssignFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign); + AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); std::array Args = { MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), @@ -1060,7 +1060,7 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) { } static Function *getDeclareIntrin(Module &M) { - return Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare); + return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare); } 
DbgInstPtr DIBuilder::insertDbgValueIntrinsic( @@ -1074,7 +1074,7 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic( } if (!ValueFn) - ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); + ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB, InsertBefore); } @@ -1175,7 +1175,7 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, } if (!LabelFn) - LabelFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_label); + LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 0db908211b553c..b37dbd534092c3 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -413,13 +413,13 @@ DbgVariableRecord::createDebugIntrinsic(Module *M, // Work out what sort of intrinsic we're going to produce. 
switch (getType()) { case DbgVariableRecord::LocationType::Declare: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); break; case DbgVariableRecord::LocationType::Value: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_value); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_value); break; case DbgVariableRecord::LocationType::Assign: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); break; case DbgVariableRecord::LocationType::End: case DbgVariableRecord::LocationType::Any: @@ -459,7 +459,7 @@ DbgVariableRecord::createDebugIntrinsic(Module *M, DbgLabelInst * DbgLabelRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { - auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); + auto *LabelFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_label); Value *Args[] = { MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; DbgLabelInst *DbgLabel = cast( diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 8bf695e835c368..3654bf9a9e70b5 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -91,8 +91,8 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) { if (cast(Scaling)->isZero()) return Scaling; Module *M = GetInsertBlock()->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, + {Scaling->getType()}); CallInst *CI = CreateCall(TheFn, {}, {}, Name); return cast(Scaling)->isOne() ? 
CI : CreateMul(CI, Scaling); } @@ -142,7 +142,8 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)}; Type *Tys[] = { Ptr->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -170,7 +171,8 @@ CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; Type *Tys[] = {Dst->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset_inline, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -197,7 +199,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memset_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -227,7 +229,7 @@ CallInst *IRBuilderBase::CreateMemTransferInst( Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, IntrID, Tys); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, IntrID, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -265,7 +267,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - 
Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memcpy_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -381,7 +383,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memmove_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -411,23 +413,23 @@ CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Src}; Type *Tys[] = { Src->getType() }; - auto Decl = Intrinsic::getDeclaration(M, ID, Tys); + auto Decl = Intrinsic::getOrInsertDeclaration(M, ID, Tys); return CreateCall(Decl, Ops); } CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fadd, - {Src->getType()}); + auto Decl = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reduce_fadd, {Src->getType()}); return CreateCall(Decl, Ops); } CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fmul, - {Src->getType()}); + auto Decl = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reduce_fmul, {Src->getType()}); return CreateCall(Decl, Ops); } @@ -489,8 +491,8 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { "lifetime.start requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); - Function *TheFn = - 
Intrinsic::getDeclaration(M, Intrinsic::lifetime_start, {Ptr->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::lifetime_start, {Ptr->getType()}); return CreateCall(TheFn, Ops); } @@ -504,8 +506,8 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { "lifetime.end requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::lifetime_end, {Ptr->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::lifetime_end, {Ptr->getType()}); return CreateCall(TheFn, Ops); } @@ -523,8 +525,8 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { // Fill in the single overloaded type: memory object type. Type *ObjectPtr[1] = {Ptr->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::invariant_start, ObjectPtr); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::invariant_start, ObjectPtr); return CreateCall(TheFn, Ops); } @@ -556,13 +558,13 @@ IRBuilderBase::CreateAssumption(Value *Cond, Value *Ops[] = { Cond }; Module *M = BB->getParent()->getParent(); - Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + Function *FnAssume = Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); return CreateCall(FnAssume, Ops, OpBundles); } Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) { Module *M = BB->getModule(); - auto *FnIntrinsic = Intrinsic::getDeclaration( + auto *FnIntrinsic = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_noalias_scope_decl, {}); return CreateCall(FnIntrinsic, {Scope}); } @@ -615,7 +617,7 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef OverloadedTypes, const Twine &Name) { Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Id, 
OverloadedTypes); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Id, OverloadedTypes); return CreateCall(TheFn, Ops, {}, Name); } @@ -765,9 +767,9 @@ static CallInst *CreateGCStatepointCallCommon( const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - {ActualCallee.getCallee()->getType()}); + Function *FnStatepoint = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_statepoint, + {ActualCallee.getCallee()->getType()}); std::vector Args = getStatepointArgs( *Builder, ID, NumPatchBytes, ActualCallee.getCallee(), Flags, CallArgs); @@ -820,9 +822,9 @@ static InvokeInst *CreateGCStatepointInvokeCommon( const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - {ActualInvokee.getCallee()->getType()}); + Function *FnStatepoint = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_statepoint, + {ActualInvokee.getCallee()->getType()}); std::vector Args = getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee.getCallee(), @@ -875,7 +877,7 @@ CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, Intrinsic::ID ID = Intrinsic::experimental_gc_result; Module *M = BB->getParent()->getParent(); Type *Types[] = {ResultType}; - Function *FnGCResult = Intrinsic::getDeclaration(M, ID, Types); + Function *FnGCResult = Intrinsic::getOrInsertDeclaration(M, ID, Types); Value *Args[] = {Statepoint}; return CreateCall(FnGCResult, Args, {}, Name); @@ -886,8 +888,8 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint, Type *ResultType, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *Types[] = 
{ResultType}; - Function *FnGCRelocate = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); + Function *FnGCRelocate = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_relocate, Types); Value *Args[] = {Statepoint, getInt32(BaseOffset), getInt32(DerivedOffset)}; return CreateCall(FnGCRelocate, Args, {}, Name); @@ -897,7 +899,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerBase(Value *DerivedPtr, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCFindBase = Intrinsic::getDeclaration( + Function *FnGCFindBase = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_gc_get_pointer_base, {PtrTy, PtrTy}); return CreateCall(FnGCFindBase, {DerivedPtr}, {}, Name); } @@ -906,7 +908,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerOffset(Value *DerivedPtr, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCGetOffset = Intrinsic::getDeclaration( + Function *FnGCGetOffset = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_gc_get_pointer_offset, {PtrTy}); return CreateCall(FnGCGetOffset, {DerivedPtr}, {}, Name); } @@ -915,7 +917,7 @@ CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, {V->getType()}); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {V->getType()}); return createCallHelper(Fn, {V}, Name, FMFSource); } @@ -923,7 +925,7 @@ Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() }); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {LHS->getType()}); if (Value *V = Folder.FoldBinaryIntrinsic(ID, LHS, RHS, Fn->getReturnType(), FMFSource)) return V; 
@@ -936,7 +938,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, Types); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, Types); return createCallHelper(Fn, Args, Name, FMFSource); } @@ -963,7 +965,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID, "Wrong types for intrinsic!"); // TODO: Handle varargs intrinsics. - Function *Fn = Intrinsic::getDeclaration(M, ID, OverloadTys); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, OverloadTys); return createCallHelper(Fn, Args, Name, FMFSource); } @@ -1120,7 +1122,7 @@ Value *IRBuilderBase::CreateLaunderInvariantGroup(Value *Ptr) { "launder.invariant.group only applies to pointers."); auto *PtrType = Ptr->getType(); Module *M = BB->getParent()->getParent(); - Function *FnLaunderInvariantGroup = Intrinsic::getDeclaration( + Function *FnLaunderInvariantGroup = Intrinsic::getOrInsertDeclaration( M, Intrinsic::launder_invariant_group, {PtrType}); assert(FnLaunderInvariantGroup->getReturnType() == PtrType && @@ -1137,7 +1139,7 @@ Value *IRBuilderBase::CreateStripInvariantGroup(Value *Ptr) { auto *PtrType = Ptr->getType(); Module *M = BB->getParent()->getParent(); - Function *FnStripInvariantGroup = Intrinsic::getDeclaration( + Function *FnStripInvariantGroup = Intrinsic::getOrInsertDeclaration( M, Intrinsic::strip_invariant_group, {PtrType}); assert(FnStripInvariantGroup->getReturnType() == PtrType && @@ -1152,7 +1154,8 @@ Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) { auto *Ty = cast(V->getType()); if (isa(Ty)) { Module *M = BB->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, Ty); return Insert(CallInst::Create(F, V), Name); } // Keep the original behaviour for fixed vector @@ -1171,7 
+1174,8 @@ Value *IRBuilderBase::CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, if (auto *VTy = dyn_cast(V1->getType())) { Module *M = BB->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_splice, VTy); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_splice, VTy); Value *Ops[] = {V1, V2, getInt32(Imm)}; return Insert(CallInst::Create(F, Ops), Name); @@ -1225,7 +1229,7 @@ Value *IRBuilderBase::CreatePreserveArrayAccessIndex( Type *ResultType = GetElementPtrInst::getGEPReturnType(Base, IdxList); Module *M = BB->getParent()->getParent(); - Function *FnPreserveArrayAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveArrayAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_array_access_index, {ResultType, BaseType}); Value *DimV = getInt32(Dimension); @@ -1246,7 +1250,7 @@ Value *IRBuilderBase::CreatePreserveUnionAccessIndex( auto *BaseType = Base->getType(); Module *M = BB->getParent()->getParent(); - Function *FnPreserveUnionAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveUnionAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_union_access_index, {BaseType, BaseType}); Value *DIIndex = getInt32(FieldIndex); @@ -1271,7 +1275,7 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( GetElementPtrInst::getGEPReturnType(Base, {Zero, GEPIndex}); Module *M = BB->getParent()->getParent(); - Function *FnPreserveStructAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveStructAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_struct_access_index, {ResultType, BaseType}); Value *DIIndex = getInt32(FieldIndex); @@ -1288,8 +1292,8 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( Value *IRBuilderBase::createIsFPClass(Value *FPNum, unsigned Test) { ConstantInt *TestV = getInt32(Test); Module *M = BB->getParent()->getParent(); - Function *FnIsFPClass = - Intrinsic::getDeclaration(M, Intrinsic::is_fpclass, 
{FPNum->getType()}); + Function *FnIsFPClass = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::is_fpclass, {FPNum->getType()}); return CreateCall(FnIsFPClass, {FPNum, TestV}); } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 0a6c93fde6302f..002bab8e079e50 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -629,9 +629,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { return false; } -Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, - Type *ReturnType, - ArrayRef Params) { +Function *VPIntrinsic::getOrInsertDeclarationForParams( + Module *M, Intrinsic::ID VPID, Type *ReturnType, ArrayRef Params) { assert(isVPIntrinsic(VPID) && "not a VP intrinsic"); Function *VPFunc; switch (VPID) { @@ -641,7 +640,7 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, OverloadTy = Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType(); - VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, OverloadTy); break; } case Intrinsic::vp_trunc: @@ -658,43 +657,43 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, case Intrinsic::vp_lrint: case Intrinsic::vp_llrint: case Intrinsic::vp_cttz_elts: - VPFunc = - Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[0]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration( + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_is_fpclass: - VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[0]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, {Params[0]->getType()}); break; case Intrinsic::vp_merge: case Intrinsic::vp_select: - VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, {Params[1]->getType()}); break; case Intrinsic::vp_load: - VPFunc = Intrinsic::getDeclaration( + VPFunc = 
Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::experimental_vp_strided_load: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::vp_gather: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_store: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::experimental_vp_strided_store: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType(), Params[2]->getType()}); break; case Intrinsic::vp_scatter: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::experimental_vp_splat: - VPFunc = Intrinsic::getDeclaration(M, VPID, ReturnType); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, ReturnType); break; } assert(VPFunc && "Could not declare VP intrinsic"); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ef26b1926b9767..ff8b4b7a020c2f 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -713,7 +713,8 @@ Intrinsic::ID Intrinsic::lookupIntrinsicID(StringRef Name) { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_ATTRIBUTES -Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { +Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, + ArrayRef Tys) { // There can never be multiple globals with the same name of different types, // because intrinsics must be a specific type. 
auto *FT = getType(M->getContext(), id, Tys); @@ -1078,7 +1079,7 @@ std::optional Intrinsic::remangleIntrinsicFunction(Function *F) { // invalid and we'll get an error. ExistingGV->setName(WantedName + ".renamed"); } - return Intrinsic::getDeclaration(F->getParent(), ID, ArgTys); + return Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ArgTys); }(); NewDecl->setCallingConv(F->getCallingConv()); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 704bc8d339bc57..ab48d3e4101b72 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -89,21 +89,22 @@ Module::~Module() { void Module::removeDebugIntrinsicDeclarations() { auto *DeclareIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_declare); assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && "Debug declare intrinsic should have had uses removed."); DeclareIntrinsicFn->eraseFromParent(); auto *ValueIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_value); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_value); assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && "Debug value intrinsic should have had uses removed."); ValueIntrinsicFn->eraseFromParent(); auto *AssignIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_assign); assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && "Debug assign intrinsic should have had uses removed."); AssignIntrinsicFn->eraseFromParent(); - auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); + auto *LabelntrinsicFn = + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_label); assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && "Debug label intrinsic should have had uses removed."); LabelntrinsicFn->eraseFromParent(); diff --git a/llvm/lib/IR/VectorBuilder.cpp 
b/llvm/lib/IR/VectorBuilder.cpp index f42948ba89042f..737f49b1334d76 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -108,8 +108,8 @@ Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID, if (VLenPosOpt) IntrinParams[*VLenPosOpt] = &requestEVL(); - auto *VPDecl = VPIntrinsic::getDeclarationForParams(&getModule(), VPID, - ReturnTy, IntrinParams); + auto *VPDecl = VPIntrinsic::getOrInsertDeclarationForParams( + &getModule(), VPID, ReturnTy, IntrinParams); return Builder.CreateCall(VPDecl, IntrinParams, Name); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a217cd1ec5cf9..ae96e277b5fc69 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16454,8 +16454,8 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); if (Parts.size() == 4) { - auto *F = Intrinsic::getDeclaration(TI->getModule(), - Intrinsic::aarch64_neon_tbl4, VecTy); + auto *F = Intrinsic::getOrInsertDeclaration( + TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); Parts.clear(); @@ -16484,7 +16484,7 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { break; } - auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); + auto *F = Intrinsic::getOrInsertDeclaration(TI->getModule(), TblID, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); } @@ -16765,9 +16765,10 @@ static Function *getStructuredLoadFunction(Module *M, unsigned Factor, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; if (Scalable) - return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy}); + return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy}); - 
return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy}); + return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2], + {LDVTy, PtrTy}); } static Function *getStructuredStoreFunction(Module *M, unsigned Factor, @@ -16781,9 +16782,10 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; if (Scalable) - return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy}); + return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy}); - return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy}); + return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2], + {STVTy, PtrTy}); } /// Lower an interleaved load into a ldN intrinsic. @@ -27247,7 +27249,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, if (ValueTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -27266,7 +27268,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); @@ -27281,7 +27283,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -27296,7 +27299,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; - Function *Stxr = Intrinsic::getDeclaration(M, Int); + Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Type *Int128Ty = Type::getInt128Ty(M->getContext()); @@ -27311,7 +27314,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, Intrinsic::ID Int = IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; - Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); @@ -27348,7 +27351,7 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset), diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index e62437c28b863f..fe96fedcfb82dc 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -436,10 +436,10 @@ Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size) { - auto SetTagZeroFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero); - auto StgpFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp); + auto SetTagZeroFunc = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_settag_zero); + auto StgpFunc = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::aarch64_stgp); InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc); bool LittleEndian = @@ -481,8 +481,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( assert(PrologueBB); IRBuilder<> IRB(&PrologueBB->front()); - Function *IRG_SP = - Intrinsic::getDeclaration(F->getParent(), 
Intrinsic::aarch64_irg_sp); + Function *IRG_SP = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_irg_sp); Instruction *Base = IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); Base->setName("basetag"); @@ -563,8 +563,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { LI = DeleteLI.get(); } - SetTagFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); + SetTagFunc = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::aarch64_settag); Instruction *Base = insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT); @@ -580,7 +580,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { NextTag = (NextTag + 1) % 16; // Replace alloca with tagp(alloca). IRBuilder<> IRB(Info.AI->getNextNode()); - Function *TagP = Intrinsic::getDeclaration( + Function *TagP = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); Instruction *TagPCall = IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7b74bb2a03a642..91ab3fcfc4c70e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1637,7 +1637,7 @@ static std::optional instCombineSVEAllActive(IntrinsicInst &II, return std::nullopt; auto *Mod = II.getModule(); - auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()}); + auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()}); II.setCalledFunction(NewDecl); return &II; diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 174d95333d918d..2ee16a873e33b8 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -71,7 +71,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { // A save to 
TPIDR2 should be followed by clearing TPIDR2_EL0. Function *WriteIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, Builder.getInt64(0)); } @@ -114,7 +114,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. Builder.SetInsertPoint(PreludeBB); Function *TPIDR2Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, {}, "tpidr2"); auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, @@ -128,20 +128,20 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); Function *EnableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_za_enable); Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); } if (FnAttrs.isNewZA()) { Function *ZeroIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_zero); Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, Builder.getInt32(0xff)); } if (FnAttrs.isNewZT0()) { Function *ClearZT0Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_zero_zt); Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr, {Builder.getInt32(0)}); } @@ -153,8 +153,8 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, if (!T || !isa(T)) continue; Builder.SetInsertPoint(T); - Function *DisableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Function *DisableZAIntr = 
Intrinsic::getOrInsertDeclaration( + M, Intrinsic::aarch64_sme_za_disable); Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index f408a013d7a379..ea88ed424dc597 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -407,8 +407,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, Value *const Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); // Reduce within each row of 16 lanes. for (unsigned Idx = 0; Idx < 4; Idx++) { @@ -439,8 +439,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
- Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *ReadLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_readlane, AtomicTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); return buildNonAtomicBinOp(B, Op, Lane0, Lane32); @@ -453,8 +453,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, Value *Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -513,18 +513,18 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, Value *Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. V = B.CreateCall(UpdateDPP, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy); + Function *ReadLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *WriteLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_writelane, AtomicTy); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 052e1140533f3f..7d3164c79089e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -119,8 +119,8 @@ class AMDGPUCodeGenPrepareImpl return SqrtF32; LLVMContext &Ctx = Mod->getContext(); - SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt, - {Type::getFloatTy(Ctx)}); + SqrtF32 = Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_sqrt, + {Type::getFloatTy(Ctx)}); return SqrtF32; } @@ -129,7 +129,7 @@ class AMDGPUCodeGenPrepareImpl return LdexpF32; LLVMContext &Ctx = Mod->getContext(); - LdexpF32 = Intrinsic::getDeclaration( + LdexpF32 = Intrinsic::getOrInsertDeclaration( Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); return LdexpF32; } @@ -577,7 +577,7 @@ bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( Type *I32Ty = getI32Ty(Builder, I.getType()); Function *I32 = - Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bitreverse, {I32Ty}); Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); Value *LShrOp = @@ -1260,8 +1260,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) : Builder.CreateUIToFP(IB,F32Ty); - Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, - Builder.getFloatTy()); + Function *RcpDecl = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::amdgcn_rcp, Builder.getFloatTy()); Value *RCP = Builder.CreateCall(RcpDecl, { FB }); Value *FQM = Builder.CreateFMul(FA, RCP); @@ -1455,7 +1455,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, // Initial estimate of inv(y). 
Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); - Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); + Function *Rcp = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast(0x4F7FFFFE)); Value *ScaledY = Builder.CreateFMul(RcpY, Scale); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp index 45207c06a788a2..e48fed025857fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -237,7 +237,7 @@ bool optimizeSection(ArrayRef> MergeableInsts) { else NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; - Function *NewIntrin = Intrinsic::getDeclaration( + Function *NewIntrin = Intrinsic::getOrInsertDeclaration( IIList.front()->getModule(), NewIntrinID, OverloadTys); Args[ImageDimIntr->DMaskIndex] = ConstantInt::get(DMask->getType(), NewMaskVal); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index ecb4d4fa5d5c39..6a5a48778197e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -130,7 +130,8 @@ static std::optional modifyIntrinsicCall( // Modify arguments and types Func(Args, ArgTys); - Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys); + Function *I = + Intrinsic::getOrInsertDeclaration(OldIntr.getModule(), NewIntr, ArgTys); CallInst *NewCall = IC.Builder.CreateCall(I, Args); NewCall->takeName(&OldIntr); @@ -502,7 +503,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp)) break; - Function *NewDecl = Intrinsic::getDeclaration( + Function *NewDecl = Intrinsic::getOrInsertDeclaration( 
SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()}); InnerFMF |= FMF; @@ -527,7 +528,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // f16 amdgcn.sqrt is identical to regular sqrt. if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) { - Function *NewDecl = Intrinsic::getDeclaration( + Function *NewDecl = Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::sqrt, {II.getType()}); II.setCalledFunction(NewDecl); return &II; @@ -614,7 +615,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Src1 = II.getArgOperand(1); const ConstantInt *CMask = dyn_cast(Src1); if (CMask) { - II.setCalledOperand(Intrinsic::getDeclaration( + II.setCalledOperand(Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::is_fpclass, Src0->getType())); // Clamp any excess bits, as they're illegal for the generic intrinsic. @@ -890,7 +891,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // register (which contains the bitmask of live threads). So a // comparison that always returns true is the same as a read of the // EXEC register. 
- Function *NewF = Intrinsic::getDeclaration( + Function *NewF = Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::read_register, II.getType()); Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")}; MDNode *MD = MDNode::get(II.getContext(), MDArgs); @@ -989,7 +990,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) break; - Function *NewF = Intrinsic::getDeclaration( + Function *NewF = Intrinsic::getOrInsertDeclaration( II.getModule(), NewIID, {II.getType(), SrcLHS->getType()}); Value *Args[] = {SrcLHS, SrcRHS, ConstantInt::get(CC->getType(), SrcPred)}; @@ -1205,7 +1206,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // If we can prove we don't have one of the special cases then we can use a // normal fma instead. if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) { - II.setCalledOperand(Intrinsic::getDeclaration( + II.setCalledOperand(Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::fma, II.getType())); return &II; } @@ -1401,7 +1402,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask); } - Function *NewIntrin = Intrinsic::getDeclaration( + Function *NewIntrin = Intrinsic::getOrInsertDeclaration( II.getModule(), II.getIntrinsicID(), OverloadTys); CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); NewCall->takeName(&II); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 53628981e12409..800bdbe04cf70d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1555,8 +1555,8 @@ bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { MIB.addImm(MFI->getLDSSize()); } else { Module *M = MF->getFunction().getParent(); - const GlobalValue *GV - 
= Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize); + const GlobalValue *GV = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize); MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index e01c9dc66a3f1f..eb553ae4eb80ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -753,7 +753,7 @@ bool AMDGPULibCalls::fold(CallInst *CI) { CI->setArgOperand(1, SplatArg1); } - CI->setCalledFunction(Intrinsic::getDeclaration( + CI->setCalledFunction(Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::ldexp, {CI->getType(), CI->getArgOperand(1)->getType()})); return true; @@ -1034,7 +1034,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) FunctionCallee ExpExpr; if (ShouldUseIntrinsic) - ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()}); + ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2, + {FPOp->getType()}); else { ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); if (!ExpExpr) @@ -1108,8 +1109,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, if (needlog) { FunctionCallee LogExpr; if (ShouldUseIntrinsic) { - LogExpr = - Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()}); + LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2, + {FPOp->getType()}); } else { LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); if (!LogExpr) @@ -1298,8 +1299,8 @@ void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, } } - CI->setCalledFunction( - Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()})); + CI->setCalledFunction(Intrinsic::getOrInsertDeclaration( + CI->getModule(), IntrID, {CI->getType()})); } bool 
AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 51a5b7702c0093..ff5eb81490106f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -285,8 +285,8 @@ class AMDGPULowerModuleLDS { BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); - Function *Decl = - Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + Func->getParent(), Intrinsic::donothing, {}); Value *UseInstance[1] = { Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; @@ -529,8 +529,8 @@ class AMDGPULowerModuleLDS { // block to spare deduplicating it later. auto [It, Inserted] = tableKernelIndexCache.try_emplace(F); if (Inserted) { - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::amdgcn_lds_kernel_id, {}); auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); IRBuilder<> Builder(&*InsertAt); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 24bfbff41ec5c0..63da3443479be3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -973,10 +973,10 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { - Function *LocalSizeYFn = - Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); - Function *LocalSizeZFn = - Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); + Function *LocalSizeYFn = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::r600_read_local_size_y); + Function *LocalSizeZFn = Intrinsic::getOrInsertDeclaration( 
+ Mod, Intrinsic::r600_read_local_size_z); CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); @@ -1022,7 +1022,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { // } hsa_kernel_dispatch_packet_t // Function *DispatchPtrFn = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); DispatchPtr->addRetAttr(Attribute::NoAlias); @@ -1082,7 +1082,7 @@ Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, llvm_unreachable("invalid dimension"); } - Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); + Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); ST.makeLIDRangeMetadata(CI); F->removeFnAttr(AttrName); @@ -1564,7 +1564,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); - Function *ObjectSize = Intrinsic::getDeclaration( + Function *ObjectSize = Intrinsic::getOrInsertDeclaration( Mod, Intrinsic::objectsize, {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4669bb45473cb0..cfce56f0bfe968 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -336,8 +336,8 @@ static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); - Function *Decl = - Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(), + Intrinsic::donothing, {}); Value *UseInstance[1] = { 
Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; @@ -922,7 +922,8 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, StringRef("__asan_free_impl"), FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); Value *ReturnAddr = IRB.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), IRB.getInt32(0)); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), + IRB.getInt32(0)); Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); @@ -1055,8 +1056,8 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( SetVector LDSInstructions; getLDSMemoryInstructions(Func, LDSInstructions); - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::amdgcn_lds_kernel_id, {}); auto *KernelId = IRB.CreateCall(Decl, {}); GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index d701bf037fdfa6..5d7ca89571b27b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1112,8 +1112,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS)) return nullptr; Module *M = II->getModule(); - Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(), - {DestTy, SrcTy, DestTy}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return II; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 3758c768b8673f..59cc61e347bc0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -295,8 +295,8 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, // Remove and delete the unreachable inst. UnreachableBlock->getTerminator()->eraseFromParent(); - Function *UnreachableIntrin = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); + Function *UnreachableIntrin = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::amdgcn_unreachable); // Insert a call to an intrinsic tracking that this is an unreachable // point, in case we want to kill the active lanes or something later. diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index edd881c84078c6..a7f2b66e3cd116 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -117,13 +117,15 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { BoolUndef = PoisonValue::get(Boolean); IntMaskZero = ConstantInt::get(IntMask, 0); - If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask }); - Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else, - { IntMask, IntMask }); - IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, - { IntMask }); - Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); + If = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if, {IntMask}); + Else = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_else, + {IntMask, IntMask}); + IfBreak = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if_break, + {IntMask}); + Loop = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_loop, {IntMask}); + EndCf = 
Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_end_cf, + {IntMask}); } /// Is the branch condition uniform or did the StructurizeCFG pass diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index bf757edfa85890..a35582bebb08a3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21149,7 +21149,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { - Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); + Function *MCR = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; @@ -21160,7 +21160,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { - Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + Function *DMB = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); @@ -21417,7 +21417,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, if (ValueTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; - Function *Ldrex = Intrinsic::getDeclaration(M, Int); + Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int); Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); @@ -21433,7 +21433,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; - Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); CallInst *CI = Builder.CreateCall(Ldrex, Addr); CI->addParamAttr( @@ -21446,7 +21446,8 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex)); } Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -21461,7 +21462,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; - Function *Strex = Intrinsic::getDeclaration(M, Int); + Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int32Ty = Type::getInt32Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); @@ -21473,7 +21474,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; Type *Tys[] = { Addr->getType() }; - Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); + Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); CallInst *CI = Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( @@ -21601,8 +21602,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + Function *VldnFunc = Intrinsic::getOrInsertDeclaration( + LI->getModule(), LoadInts[Factor - 2], Tys); SmallVector Ops; Ops.push_back(BaseAddr); @@ -21617,7 +21618,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, PtrTy}; Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); + Intrinsic::getOrInsertDeclaration(LI->getModule(), LoadInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); @@ -21762,7 +21763,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; - Function *VstNFunc = Intrinsic::getDeclaration( + Function *VstNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), StoreInts[Factor - 2], Tys); SmallVector Ops; @@ -21778,7 +21779,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; Function *VstNFunc = - Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); + Intrinsic::getOrInsertDeclaration(SI->getModule(), StoreInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 861d60d3bcce95..7804725ce77319 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ 
b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -630,13 +630,14 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; if (Exchange) - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx); + SMLAD = + Acc->getType()->isIntegerTy(32) + ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smladx) + : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlaldx); else - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); + SMLAD = Acc->getType()->isIntegerTy(32) + ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlad) + : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlald); IRBuilder Builder(InsertAfter->getParent(), BasicBlock::iterator(InsertAfter)); diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index e554e4d428d46f..60211db8a61ae3 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -401,7 +401,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; } - Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Function *VCTP = Intrinsic::getOrInsertDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); ActiveLaneMask->replaceAllUsesWith(VCTPCall); diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 4be6220b358ba3..7921518166f97d 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -103,7 +103,7 @@ uint32_t BPFCoreSharedInfo::SeqNum; Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock 
*BB, Instruction *Input, Instruction *Before) { - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::bpf_passthrough, {Input->getType(), Input->getType()}); Constant *SeqNumVal = ConstantInt::get(Type::getInt32Ty(BB->getContext()), BPFCoreSharedInfo::SeqNum++); diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp index 4ab0cbcc924779..4ca7bbe9c2a8c4 100644 --- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp +++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp @@ -126,7 +126,7 @@ bool BPFAdjustOptImpl::adjustICmpToBuiltin() { Constant *Opcode = ConstantInt::get(Type::getInt32Ty(BB.getContext()), Op); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::bpf_compare, {Op0->getType(), ConstOp1->getType()}); auto *NewInst = CallInst::Create(Fn, {Opcode, Op0, ConstOp1}); NewInst->insertBefore(&I); diff --git a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp index 5d8339b4a44cec..9f7e3414beb8e3 100644 --- a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp @@ -163,7 +163,7 @@ static CallInst *makeIntrinsicCall(Module *M, ArrayRef Types, ArrayRef Args) { - Function *Fn = Intrinsic::getDeclaration(M, Intrinsic, Types); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic, Types); return CallInst::Create(Fn, Args); } diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index c0f8d433833ee7..99df4850872078 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -134,8 +134,8 @@ class OpLowerer { /// piecemeal way - we can add the casts in to avoid updating all of the uses /// or defs, and by the end all of the casts will be redundant. 
Value *createTmpHandleCast(Value *V, Type *Ty) { - Function *CastFn = Intrinsic::getDeclaration(&M, Intrinsic::dx_cast_handle, - {Ty, V->getType()}); + Function *CastFn = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::dx_cast_handle, {Ty, V->getType()}); CallInst *Cast = OpBuilder.getIRB().CreateCall(CastFn, {V}); CleanupCasts.push_back(Cast); return Cast; diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 3274f9162b543a..65bbb1364488f7 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -212,7 +212,7 @@ bool HexagonGenExtract::convert(Instruction *In) { Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); - Function *ExtF = Intrinsic::getDeclaration(Mod, IntId); + Function *ExtF = Intrinsic::getOrInsertDeclaration(Mod, IntId); Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); if (SL != 0) NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 856c952e785dac..03c12f5ce44707 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3865,7 +3865,7 @@ Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); Intrinsic::ID IntID = (SZ == 32) ? 
Intrinsic::hexagon_L2_loadw_locked : Intrinsic::hexagon_L4_loadd_locked; - Function *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Value *Call = Builder.CreateCall(Fn, Addr, "larx"); @@ -3886,7 +3886,7 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked : Intrinsic::hexagon_S4_stored_locked; - Function *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Val = Builder.CreateBitCast(Val, CastTy); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 4ef009c87a1e63..705e1f43851f7a 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1532,7 +1532,8 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At, ParsedValues &PV) { IRBuilder<> B(&*At); Module *M = At->getParent()->getParent()->getParent(); - Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw); + Function *PMF = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::hexagon_M4_pmpyw); Value *P = PV.P, *Q = PV.Q, *P0 = P; unsigned IC = PV.IterCount; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index f4e495266eae3f..d2cfd3851e711d 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -2390,8 +2390,8 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo, Type *Int64Ty = Type::getInt64Ty(F.getContext()); Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst"); Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst"); - Function *FI = Intrinsic::getDeclaration(F.getParent(), - 
Intrinsic::hexagon_S2_valignrb); + Function *FI = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::hexagon_S2_valignrb); Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup"); return Builder.CreateBitCast(Call, Lo->getType(), "cst"); } @@ -2587,12 +2587,13 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder, unsigned HwLen = HST.getVectorLength(); Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast : Intrinsic::hexagon_V6_pred_typecast_128B; - Function *FI = - Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()}); + Function *FI = Intrinsic::getOrInsertDeclaration(F.getParent(), TC, + {DestTy, Val->getType()}); return Builder.CreateCall(FI, {Val}, "cup"); }; - Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys); + Function *IntrFn = + Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys); FunctionType *IntrTy = IntrFn->getFunctionType(); SmallVector IntrArgs; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index bfafb331752108..8edca34624e9b2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5808,7 +5808,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = - Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering}); Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); @@ -5838,7 +5838,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *Ordering = Builder.getIntN(GRLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; - Function *LlwOpScwLoop 
= Intrinsic::getDeclaration( + Function *LlwOpScwLoop = Intrinsic::getOrInsertDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(GRLen, AI->getOperation()), Tys); diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 082546c4dd72f8..1e30e0113e43c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -282,7 +282,7 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam, [](Value *Addr, Instruction *OriginalUser) -> Value * { PointerType *ReturnTy = PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC); - Function *CvtToGen = Intrinsic::getDeclaration( + Function *CvtToGen = Intrinsic::getOrInsertDeclaration( OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen, {ReturnTy, PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_PARAM)}); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 9a8ea8f87896ad..b141229dcfc733 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -360,7 +360,8 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { // type argument, equal to that of the nvvm intrinsic's argument. Type *Tys[] = {II->getArgOperand(0)->getType()}; return CallInst::Create( - Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys), + Args); } // Simplify to target-generic binary op. 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index d9847a21489e63..911d92f0c4846b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12181,7 +12181,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, Id); + Function *Func = Intrinsic::getOrInsertDeclaration(M, Id); return Builder.CreateCall(Func, {}); } @@ -12206,7 +12206,7 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa(Inst)) return Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), Intrinsic::ppc_cfence, {Inst->getType()}), {Inst}); @@ -19005,7 +19005,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = Incr->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); - Function *RMW = Intrinsic::getDeclaration( + Function *RMW = Intrinsic::getOrInsertDeclaration( M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo"); @@ -19028,7 +19028,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Type *ValTy = CmpVal->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *IntCmpXchg = - Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::ppc_cmpxchg_i128); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo"); Value *CmpHi = diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp 
b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp index d10fe11bb5877b..9c2b58a47392f9 100644 --- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp +++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp @@ -123,7 +123,7 @@ bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func, return false; CI->setCalledFunction( - Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType())); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::pow, CI->getType())); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 230ccd8209e1f2..1f9fc984515cf6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20608,7 +20608,7 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *Ordering = Builder.getIntN(XLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; - Function *LrwOpScwLoop = Intrinsic::getDeclaration( + Function *LrwOpScwLoop = Intrinsic::getOrInsertDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys); @@ -20672,7 +20672,7 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( } Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = - Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); if (XLen == 64) @@ -21170,7 +21170,7 @@ bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const { static Value *useTpOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getModule(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset); } 
@@ -21287,9 +21287,9 @@ bool RISCVTargetLowering::lowerInterleavedLoad( auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - Function *VlsegNFunc = - Intrinsic::getDeclaration(LI->getModule(), FixedVlsegIntrIds[Factor - 2], - {VTy, LI->getPointerOperandType(), XLenTy}); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + LI->getModule(), FixedVlsegIntrIds[Factor - 2], + {VTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); @@ -21341,9 +21341,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - Function *VssegNFunc = - Intrinsic::getDeclaration(SI->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, SI->getPointerOperandType(), XLenTy}); + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, SI->getPointerOperandType(), XLenTy}); auto Mask = SVI->getShuffleMask(); SmallVector Ops; @@ -21388,7 +21388,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(ResVTy)) { - Function *VlsegNFunc = Intrinsic::getDeclaration( + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), FixedVlsegIntrIds[Factor - 2], {ResVTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); @@ -21408,7 +21408,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( NumElts * SEW / 8), Factor); - Function *VlsegNFunc = Intrinsic::getDeclaration( + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); @@ -21418,7 +21418,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( SmallVector AggrTypes{Factor, ResVTy}; Return = PoisonValue::get(StructType::get(LI->getContext(), 
AggrTypes)); - Function *VecExtractFunc = Intrinsic::getDeclaration( + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}); for (unsigned i = 0; i < Factor; ++i) { Value *VecExtract = @@ -21454,7 +21454,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(InVTy)) { - Function *VssegNFunc = Intrinsic::getDeclaration( + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, SI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); @@ -21475,12 +21475,12 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( NumElts * SEW / 8), Factor); - Function *VssegNFunc = Intrinsic::getDeclaration( + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); - Function *VecInsertFunc = Intrinsic::getDeclaration( + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}); Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 1872b238d1077a..ecf9b6ddae1fc3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -353,11 +353,11 @@ static void lowerExpectAssume(IntrinsicInst *II) { // We need to lower this into a builtin and then the builtin into a SPIR-V // instruction. 
if (II->getIntrinsicID() == Intrinsic::assume) { - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::SPVIntrinsics::spv_assume); II->setCalledFunction(F); } else if (II->getIntrinsicID() == Intrinsic::expect) { - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::SPVIntrinsics::spv_expect, {II->getOperand(0)->getType()}); II->setCalledFunction(F); @@ -372,12 +372,12 @@ static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, ArrayRef OpNos) { Function *F = nullptr; if (OpNos.empty()) { - F = Intrinsic::getDeclaration(II->getModule(), NewID); + F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); } else { SmallVector Tys; for (unsigned OpNo : OpNos) Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getDeclaration(II->getModule(), NewID, Tys); + F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); } II->setCalledFunction(F); return true; diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp index f62afb8ddfcfae..345327e880ecd5 100644 --- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp @@ -366,8 +366,8 @@ bool SystemZTDCPass::runOnFunction(Function &F) { if (!Worthy) continue; // Call the intrinsic, compare result with 0. 
- Function *TDCFunc = - Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc, V->getType()); + Function *TDCFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::s390_tdc, V->getType()); IRBuilder<> IRB(I); Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask); Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal}); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index c040e560be605f..b999f83507f4ce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1016,7 +1016,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { // wasm.catch() will be lowered down to wasm 'catch' instruction in // instruction selection. - CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch); + CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); // Type for struct __WasmLongjmpArgs LongjmpArgsTy = StructType::get(Int8PtrTy, // env Int32Ty // val diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp index 2594430d1d8f3a..c61aa5eff4a708 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp @@ -72,7 +72,7 @@ bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) { I->replaceAllUsesWith(U); Function *TrapIntrin = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::debugtrap); + Intrinsic::getOrInsertDeclaration(F.getParent(), Intrinsic::debugtrap); CallInst::Create(TrapIntrin, {}, "", I->getIterator()); worklist.insert(&*I); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7a6d20c6a121b6..de88db22279797 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ 
b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31163,12 +31163,14 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { auto *C = cast(I->getOperand(I->getOperand(0) == AI ? 1 : 0)); - BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType()); + BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_C, + AI->getType()); unsigned Imm = llvm::countr_zero(C->getZExtValue()); Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); } else { - BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType()); + BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_I, + AI->getType()); assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); @@ -31328,7 +31330,7 @@ void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( break; } Function *CmpArith = - Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); + Intrinsic::getOrInsertDeclaration(AI->getModule(), IID, AI->getType()); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); Value *Call = Builder.CreateCall( @@ -31444,7 +31446,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return nullptr; Function *MFence = - llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); + llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. 
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 77139f38c977bb..c4374984da4b9e 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -1876,7 +1876,8 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { Value *Args[] = {Op0, CILength, CIIndex}; Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_extrqi); return Builder.CreateCall(F, Args); } } @@ -1975,7 +1976,8 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, Value *Args[] = {Op0, Op1, CILength, CIIndex}; Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_insertqi); return Builder.CreateCall(F, Args); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 5bbfabcbd67bc6..e88702caa9a52b 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -278,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID); + Function *PSADBWFn = Intrinsic::getOrInsertDeclaration(Op->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. 
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 963d613ddbfe7d..05fc6f13129f24 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -334,7 +334,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); Value *FrameAddr = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( TheModule, Intrinsic::frameaddress, Builder.getPtrTy( TheModule->getDataLayout().getAllocaAddrSpace())), @@ -370,7 +370,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { return Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); + Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); } /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls @@ -624,17 +624,17 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // that it can recover the original frame pointer. IRBuilder<> Builder(RegNode->getNextNode()); Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy()); - Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), - {RegNodeI8}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); if (EHGuardNode) { IRBuilder<> Builder(EHGuardNode->getNextNode()); Value *EHGuardNodeI8 = Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy()); - Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard), - {EHGuardNodeI8}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + TheModule, Intrinsic::x86_seh_ehguard), + {EHGuardNodeI8}); } // Calculate state numbers. 
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 95962d1a0a240f..3604774ddf35bf 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -157,8 +157,8 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) { for (User *U : Users) { Instruction *Inst = cast(U); IRBuilder<> Builder(Inst); - Function *GetID = Intrinsic::getDeclaration(GV->getParent(), - Intrinsic::xcore_getid); + Function *GetID = Intrinsic::getOrInsertDeclaration(GV->getParent(), + Intrinsic::xcore_getid); Value *ThreadID = Builder.CreateCall(GetID, {}); Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, {Builder.getInt64(0), ThreadID}); diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 9943c3cbb9fc7d..898d55fab2b00d 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -172,7 +172,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ] // --> // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt) - Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Phi.getModule(), IID, Phi.getType()); Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt})); return true; } @@ -331,7 +332,7 @@ static bool tryToRecognizePopCount(Instruction &I) { m_SpecificInt(Mask55)))) { LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); IRBuilder<> Builder(&I); - Function *Func = Intrinsic::getDeclaration( + Function *Func = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::ctpop, I.getType()); I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); ++NumPopCountRecognized; 
@@ -398,8 +399,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return false; IRBuilder<> Builder(&I); - Function *Fn = Intrinsic::getDeclaration(I.getModule(), Intrinsic::fptosi_sat, - {SatTy, FpTy}); + Function *Fn = Intrinsic::getOrInsertDeclaration( + I.getModule(), Intrinsic::fptosi_sat, {SatTy, FpTy}); Value *Sat = Builder.CreateCall(Fn, In); I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy)); return true; @@ -431,7 +432,7 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, IRBuilderBase::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(Call->getFastMathFlags()); - Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); + Function *Sqrt = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sqrt, Ty); Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); Call->replaceAllUsesWith(NewSqrt); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 1c45bcd7f6a837..45b9767657c66a 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -52,7 +52,8 @@ coro::LowererBase::LowererBase(Module &M) CallInst *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt) { auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index); - auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr); + auto *Fn = + Intrinsic::getOrInsertDeclaration(&TheModule, Intrinsic::coro_subfn_addr); assert(Index >= CoroSubFnInst::IndexFirst && Index < CoroSubFnInst::IndexLast && @@ -183,7 +184,7 @@ void coro::suppressCoroAllocs(LLVMContext &Context, static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, CoroSuspendInst *SuspendInst) { Module *M = SuspendInst->getModule(); - auto *Fn = Intrinsic::getDeclaration(M, Intrinsic::coro_save); + auto *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::coro_save); auto *SaveInst = cast( CallInst::Create(Fn, 
CoroBegin, "", SuspendInst->getIterator())); assert(!SuspendInst->getCoroSave()); diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 91d445dfc4c734..9e5d9ea31af6c4 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -125,7 +125,8 @@ void CrossDSOCFI::buildCFICheck(Module &M) { ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); IRBuilder<> IRBTest(TestBB); - Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test); + Function *BitsetTestFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); Value *Test = IRBTest.CreateCall( BitsetTestFn, {&Addr, MetadataAsValue::get( diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index d84856f71c9de6..543987d5981bab 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -401,7 +401,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { assert(Builder.GetInsertPoint() != BB->end() && "Cannot get the probing point"); Function *ProbeFn = - llvm::Intrinsic::getDeclaration(M, Intrinsic::pseudoprobe); + llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::pseudoprobe); Value *Args[] = {Builder.getInt64(Guid), Builder.getInt64(Index), Builder.getInt32(0), Builder.getInt64(PseudoProbeFullDistributionFactor)}; diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 36a1841b363463..59f986b4ca2664 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -856,7 +856,7 @@ void llvm::updatePublicTypeTestCalls(Module &M, return; if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) { Function *TypeTestFunc = - Intrinsic::getDeclaration(&M, 
Intrinsic::type_test); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) { auto *CI = cast(U.getUser()); auto *NewCI = CallInst::Create( @@ -1187,7 +1187,8 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Instruction *ThenTerm = SplitBlockAndInsertIfThen(Cond, &CB, /*Unreachable=*/false); Builder.SetInsertPoint(ThenTerm); - Function *TrapFn = Intrinsic::getDeclaration(&M, Intrinsic::debugtrap); + Function *TrapFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap); auto *CallTrap = Builder.CreateCall(TrapFn); CallTrap->setDebugLoc(CB.getDebugLoc()); } @@ -1434,8 +1435,8 @@ void DevirtModule::tryICallBranchFunnel( } BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr); - Function *Intr = - Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {}); + Function *Intr = Intrinsic::getOrInsertDeclaration( + &M, llvm::Intrinsic::icall_branch_funnel, {}); auto *CI = CallInst::Create(Intr, JTArgs, "", BB); CI->setTailCallKind(CallInst::TCK_MustTail); @@ -2026,7 +2027,8 @@ void DevirtModule::scanTypeTestUsers( } void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { - Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test); + Function *TypeTestFunc = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); for (Use &U : llvm::make_early_inc_range(TypeCheckedLoadFunc->uses())) { auto *CI = dyn_cast(U.getUser()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index e5c3a20e1a6487..21588aca512758 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1232,7 +1232,8 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { assert(I.getOpcode() == Instruction::Add && "Expecting add instruction"); Type *Ty = I.getType(); auto 
getUAddSat = [&]() { - return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty); + return Intrinsic::getOrInsertDeclaration(I.getModule(), Intrinsic::uadd_sat, + Ty); }; // add (umin X, ~Y), Y --> uaddsat X, Y @@ -2127,7 +2128,7 @@ static Instruction *foldSubOfMinMax(BinaryOperator &I, if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && (Op0->hasOneUse() || Op1->hasOneUse())) { Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); - Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(I.getModule(), InvID, Ty); return CallInst::Create(F, {X, Y}); } @@ -2150,7 +2151,7 @@ static Instruction *foldSubOfMinMax(BinaryOperator &I, if (MinMax->isSigned() && match(Y, m_ZeroInt()) && match(X, m_NSWSub(m_Specific(Op0), m_Value(Z)))) { Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); - Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(I.getModule(), InvID, Ty); return CallInst::Create(F, {Op0, Z}); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 964616a4eb35e2..453071f3f982cd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2269,7 +2269,8 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, Builder.CreateBinOp(I.getOpcode(), X->getOperand(0), Y->getOperand(0)); Value *NewOp1 = Builder.CreateBinOp(I.getOpcode(), X->getOperand(1), Y->getOperand(1)); - Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(I.getModule(), IID, I.getType()); return CallInst::Create(F, {NewOp0, NewOp1, X->getOperand(2)}); } case Intrinsic::bswap: @@ -2280,7 +2281,8 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, : ConstantInt::get(I.getType(), IID == 
Intrinsic::bswap ? RHSC->byteSwap() : RHSC->reverseBits())); - Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(I.getModule(), IID, I.getType()); return CallInst::Create(F, {NewOp0}); } default: @@ -3056,7 +3058,8 @@ InstCombinerImpl::convertOrOfShiftsToFunnelShift(Instruction &Or) { static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) { if (auto Opt = IC.convertOrOfShiftsToFunnelShift(Or)) { auto [IID, FShiftArgs] = *Opt; - Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Or.getModule(), IID, Or.getType()); return CallInst::Create(F, FShiftArgs); } @@ -3095,7 +3098,7 @@ static Instruction *matchOrConcat(Instruction &Or, Value *NewUpper = Builder.CreateZExt(Hi, Ty); NewUpper = Builder.CreateShl(NewUpper, HalfWidth); Value *BinOp = Builder.CreateOr(NewLower, NewUpper); - Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(Or.getModule(), id, Ty); return Builder.CreateCall(F, BinOp); }; @@ -4803,7 +4806,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { match(II->getArgOperand(1), m_One()) && isKnownToBeAPowerOfTwo(II->getArgOperand(0), /*OrZero */ true)) { IID = (IID == Intrinsic::ctlz) ? 
Intrinsic::cttz : Intrinsic::ctlz; - Function *F = Intrinsic::getDeclaration(II->getModule(), IID, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(II->getModule(), IID, Ty); return CallInst::Create(F, {II->getArgOperand(0), Builder.getTrue()}); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index f7a9406791801c..51e09b7e7c1437 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -488,7 +488,8 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // cttz(bitreverse(x)) -> ctlz(x) if (match(Op0, m_BitReverse(m_Value(X)))) { Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz; - Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(II.getModule(), ID, II.getType()); return CallInst::Create(F, {X, II.getArgOperand(1)}); } @@ -647,7 +648,7 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { if (Op0->hasOneUse() && match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) { Function *F = - Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()}); auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); @@ -657,7 +658,7 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { if (match(Op0, m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { Function *F = - Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); return CallInst::Create(F, {X, IC.Builder.getFalse()}); } @@ -1181,7 +1182,8 @@ Instruction *InstCombinerImpl::matchSAddSubSat(IntrinsicInst &MinMax1) { 
return nullptr; // Finally create and return the sat intrinsic, truncated to the new type - Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Function *F = Intrinsic::getOrInsertDeclaration(MinMax1.getModule(), + IntrinsicID, NewTy); Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy); Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy); Value *Sat = Builder.CreateCall(F, {AT, BT}); @@ -1286,8 +1288,8 @@ reassociateMinMaxWithConstantInOperand(IntrinsicInst *II, return nullptr; // max (max X, C), Y --> max (max X, Y), C - Function *MinMax = - Intrinsic::getDeclaration(II->getModule(), MinMaxID, II->getType()); + Function *MinMax = Intrinsic::getOrInsertDeclaration(II->getModule(), + MinMaxID, II->getType()); Value *NewInner = Builder.CreateBinaryIntrinsic(MinMaxID, X, Y); NewInner->takeName(Inner); return CallInst::Create(MinMax, {NewInner, C}); @@ -1346,7 +1348,8 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { return nullptr; Module *Mod = II->getModule(); - Function *MinMax = Intrinsic::getDeclaration(Mod, MinMaxID, II->getType()); + Function *MinMax = + Intrinsic::getOrInsertDeclaration(Mod, MinMaxID, II->getType()); return CallInst::Create(MinMax, { MinMaxOp, ThirdOp }); } @@ -1571,7 +1574,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), CI.getArgOperand(2)->getType() }; - CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); + CI.setCalledFunction( + Intrinsic::getOrInsertDeclaration(M, MemCpyID, Tys)); Changed = true; } } @@ -2095,7 +2099,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); Module *Mod = II->getModule(); - Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); + Function *Fshl = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::fshl, Ty); return CallInst::Create(Fshl, { Op0, 
Op1, LeftShiftC }); } assert(IID == Intrinsic::fshl && @@ -2115,7 +2120,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { Module *Mod = II->getModule(); - Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); + Function *Bswap = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bswap, Ty); return CallInst::Create(Bswap, { Op0 }); } if (Instruction *BitOp = @@ -2824,7 +2830,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { CallArgs.push_back(II->getArgOperand(4)); } - Function *NewFn = Intrinsic::getDeclaration(II->getModule(), NewIntrin); + Function *NewFn = + Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin); return CallInst::Create(NewFn, CallArgs); } case Intrinsic::arm_neon_vtbl1: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 9934c065ebf85f..6c2554ea73b7f8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -95,8 +95,8 @@ Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, default: llvm_unreachable("Unsupported call!"); case Intrinsic::vscale: { - Function *Fn = - Intrinsic::getDeclaration(I->getModule(), Intrinsic::vscale, {Ty}); + Function *Fn = Intrinsic::getOrInsertDeclaration( + I->getModule(), Intrinsic::vscale, {Ty}); Res = CallInst::Create(Fn->getFunctionType(), Fn); break; } @@ -600,7 +600,8 @@ Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) { if (ShVal0 != ShVal1) Y = Builder.CreateTrunc(ShVal1, DestTy); Intrinsic::ID IID = IsFshl ? 
Intrinsic::fshl : Intrinsic::fshr; - Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy); + Function *F = + Intrinsic::getOrInsertDeclaration(Trunc.getModule(), IID, DestTy); return CallInst::Create(F, {X, Y, NarrowShAmt}); } @@ -1912,8 +1913,8 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { // Do unary FP operation on smaller type. // (fptrunc (fabs x)) -> (fabs (fptrunc x)) Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty); - Function *Overload = Intrinsic::getDeclaration(FPT.getModule(), - II->getIntrinsicID(), Ty); + Function *Overload = Intrinsic::getOrInsertDeclaration( + FPT.getModule(), II->getIntrinsicID(), Ty); SmallVector OpBundles; II->getOperandBundlesAsDefs(OpBundles); CallInst *NewCI = @@ -2855,8 +2856,8 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (IntrinsicNum != 0) { assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); assert(match(ShufOp1, m_Undef()) && "Unexpected shuffle op"); - Function *BswapOrBitreverse = - Intrinsic::getDeclaration(CI.getModule(), IntrinsicNum, DestTy); + Function *BswapOrBitreverse = Intrinsic::getOrInsertDeclaration( + CI.getModule(), IntrinsicNum, DestTy); Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); return CallInst::Create(BswapOrBitreverse, {ScalarX}); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index d1eb84b5ca5c10..7129499e0f8f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1125,7 +1125,7 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. 
Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy &Builder = IC.Builder; @@ -4790,11 +4790,11 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { if (MulHadOtherUses) Builder.SetInsertPoint(Mul); - Function *F = Intrinsic::getDeclaration(I.getModule(), - Div->getOpcode() == Instruction::UDiv - ? Intrinsic::umul_with_overflow - : Intrinsic::smul_with_overflow, - X->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + I.getModule(), + Div->getOpcode() == Instruction::UDiv ? Intrinsic::umul_with_overflow + : Intrinsic::smul_with_overflow, + X->getType()); CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul"); // If the multiplication was used elsewhere, to ensure that we don't leave @@ -6334,7 +6334,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder.CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder.CreateZExt(B, MulType); - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); IC.addToWorklist(MulInstr); @@ -7121,8 +7121,8 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, if (auto *I = dyn_cast(V)) I->copyIRFlags(&Cmp); Module *M = Cmp.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3f780285efe423..358563a5fcd537 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1148,8 +1148,8 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) return nullptr; - Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, - II->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + II->getModule(), Intrinsic::cttz, II->getType()); return CallInst::Create(F, {X, II->getArgOperand(1)}); } @@ -2242,8 +2242,8 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { else return nullptr; - Function *F = - Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType()); + Function *F = Intrinsic::getOrInsertDeclaration(SI.getModule(), + NewIntrinsicID, SI.getType()); return CallInst::Create(F, {X, Y}); } @@ -2537,7 +2537,8 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel, // This is a funnel/rotate that avoids shift-by-bitwidth UB in a suboptimal way. // Convert to funnel shift intrinsic. Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr; - Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Sel.getModule(), IID, Sel.getType()); ShAmt = Builder.CreateZExt(ShAmt, Sel.getType()); return CallInst::Create(F, { SV0, SV1, ShAmt }); } @@ -2580,8 +2581,8 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, // Canonicalize the magnitude argument as the positive constant since we do // not care about its sign. 
Value *MagArg = ConstantFP::get(SelType, abs(*TC)); - Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign, - Sel.getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + Sel.getModule(), Intrinsic::copysign, Sel.getType()); return CallInst::Create(F, { MagArg, X }); } @@ -2600,8 +2601,8 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) { if (auto *I = dyn_cast(V)) I->copyIRFlags(&Sel); Module *M = Sel.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 3d4461dc1a87f6..8ca705ae1d364d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -898,7 +898,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, Value *X; if (DemandedMask == 1 && VTy->getScalarSizeInBits() % 2 == 0 && match(II->getArgOperand(0), m_Not(m_Value(X)))) { - Function *Ctpop = Intrinsic::getDeclaration( + Function *Ctpop = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::ctpop, VTy); return InsertNewInstWith(CallInst::Create(Ctpop, {X}), I->getIterator()); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index d9b4faff4c004d..d68ae64f08aa90 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2474,8 +2474,8 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, if (IsFNeg) return UnaryOperator::CreateFNegFMF(NewShuf, S0); - Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), - Intrinsic::fabs, Shuf.getType()); + 
Function *FAbs = Intrinsic::getOrInsertDeclaration( + Shuf.getModule(), Intrinsic::fabs, Shuf.getType()); CallInst *NewF = CallInst::Create(FAbs, {NewShuf}); NewF->setFastMathFlags(S0->getFastMathFlags()); return NewF; @@ -2495,8 +2495,8 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, if (IsFNeg) { NewF = UnaryOperator::CreateFNeg(NewShuf); } else { - Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), - Intrinsic::fabs, Shuf.getType()); + Function *FAbs = Intrinsic::getOrInsertDeclaration( + Shuf.getModule(), Intrinsic::fabs, Shuf.getType()); NewF = CallInst::Create(FAbs, {NewShuf}); } NewF->copyIRFlags(S0); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 1f4a6f793404cf..954c4cf19c2077 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2084,8 +2084,8 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { if (auto *BO = dyn_cast(V)) BO->copyIRFlags(&Inst); Module *M = Inst.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; @@ -3355,7 +3355,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG Module *M = II->getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); + Function *F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), {}, "", II->getParent()); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 2ad89b5ba753a5..02d9fab309d83b 100644 --- 
a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1109,7 +1109,7 @@ struct FunctionStackPoisoner : public InstVisitor { // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for // this purpose. if (!isa(InstBefore)) { - Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + Function *DynamicAreaOffsetFunc = Intrinsic::getOrInsertDeclaration( InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, {IntptrTy}); @@ -1867,7 +1867,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex); Module *M = IRB.GetInsertBlock()->getParent()->getParent(); IRB.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::asan_check_memaccess), + Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess), {IRB.CreatePointerCast(Addr, PtrTy), ConstantInt::get(Int32Ty, AccessInfo.Packed)}); return; diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 618b6fe1aea474..63d580d2b9d512 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -194,7 +194,7 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, IRB.SetInsertPoint(TrapBB); Intrinsic::ID IntrID = DebugTrapBB ? 
Intrinsic::ubsantrap : Intrinsic::trap; - auto *F = Intrinsic::getDeclaration(Fn->getParent(), IntrID); + auto *F = Intrinsic::getOrInsertDeclaration(Fn->getParent(), IntrID); CallInst *TrapCall; if (DebugTrapBB) { diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index cc7f20cffea771..5ec4973ea03d8f 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1042,14 +1042,14 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, if (UseFixedShadowIntrinsic) { IRB.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( M, UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow : Intrinsic::hwasan_check_memaccess_fixedshadow), {Ptr, ConstantInt::get(Int32Ty, AccessInfo), ConstantInt::get(Int64Ty, Mapping.offset())}); } else { - IRB.CreateCall(Intrinsic::getDeclaration( + IRB.CreateCall(Intrinsic::getOrInsertDeclaration( M, UseShortGranules ? 
Intrinsic::hwasan_check_memaccess_shortgranules : Intrinsic::hwasan_check_memaccess), diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index 28dc1c02b661ac..bbe0f4c6178192 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -110,7 +110,8 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { Instruction *ThenTerm = SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights); Builder.SetInsertPoint(ThenTerm); - Builder.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::debugtrap)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap)); ++NumKCFIChecks; } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 07d667434e0710..19ec97c17f31c6 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2853,7 +2853,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)), S2->getType()); Value *V2 = I.getOperand(2); - Function *Intrin = Intrinsic::getDeclaration( + Function *Intrin = Intrinsic::getOrInsertDeclaration( I.getModule(), I.getIntrinsicID(), S2Conv->getType()); Value *Shift = IRB.CreateCall(Intrin, {S0, S1, V2}); setShadow(&I, IRB.CreateOr(Shift, S2Conv)); @@ -3057,7 +3057,7 @@ struct MemorySanitizerVisitor : public InstVisitor { IRBuilder<> IRB(&I); Value *Op = I.getArgOperand(0); Type *OpType = Op->getType(); - Function *BswapFunc = Intrinsic::getDeclaration( + Function *BswapFunc = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::bswap, ArrayRef(&OpType, 1)); setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op))); setOrigin(&I, getOrigin(Op)); @@ -3287,7 +3287,7 @@ struct MemorySanitizerVisitor : public InstVisitor { S2_ext = 
IRB.CreateBitCast(S2_ext, getMMXVectorTy(64)); } - Function *ShadowFn = Intrinsic::getDeclaration( + Function *ShadowFn = Intrinsic::getOrInsertDeclaration( F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); Value *S = diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 10442fa0bb9003..e6e474ed376069 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -919,7 +919,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.cover(i8* , i64 , i32 , // i32 ) Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_cover), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover), {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -931,7 +931,7 @@ void FunctionInstrumenter::instrument() { if (IsCtxProf) { auto *CSIntrinsic = - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_callsite); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. This // is because the llvm.instrprof.callsite intrinsic has an argument (like // the other instrprof intrinsics) capturing the total number of @@ -972,7 +972,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.timestamp(i8* , i64 , i32 , // i32 ) Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_timestamp), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp), {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; @@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() { "Cannot get the Instrumentation point"); // llvm.instrprof.increment(i8* , i64 , i32 , // i32 ) - Builder.CreateCall( - Intrinsic::getDeclaration(&M, PGOBlockCoverage - ? 
Intrinsic::instrprof_cover - : Intrinsic::instrprof_increment), - {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), - Builder.getInt32(I++)}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + &M, PGOBlockCoverage + ? Intrinsic::instrprof_cover + : Intrinsic::instrprof_increment), + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), Builder.getInt32(I++)}); } // Now instrument select instructions: @@ -1038,7 +1038,8 @@ void FunctionInstrumenter::instrument() { SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_value_profile), + Intrinsic::getOrInsertDeclaration(&M, + Intrinsic::instrprof_value_profile), {NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); @@ -1726,7 +1727,7 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), + Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step), {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index db4bf709c9cc9c..719806fdf37f58 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -999,7 +999,7 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. 
Module *M = F.getParent(); - Function *GetFrameAddr = Intrinsic::getDeclaration( + Function *GetFrameAddr = Intrinsic::getOrInsertDeclaration( M, Intrinsic::frameaddress, IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); auto FrameAddrPtr = diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 68cf4e55301314..388addfab181a4 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -571,9 +571,10 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Instrument function entry/exit points if there were instrumented accesses. if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); - Value *ReturnAddress = IRB.CreateCall( - Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), - IRB.getInt32(0)); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::returnaddress), + IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index c11691c613ac78..0dedd0207571bf 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -139,7 +139,7 @@ class ARCRuntimeEntryPoints { if (Decl) return Decl; - return Decl = Intrinsic::getDeclaration(TheModule, IntID); + return Decl = Intrinsic::getOrInsertDeclaration(TheModule, IntID); } }; diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 60fd2a286119b3..9317e0643079ea 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -380,7 +380,8 @@ bool 
InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_load: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); - Function *NewDecl = Intrinsic::getDeclaration(M, IID, {DestTy, SrcTy}); + Function *NewDecl = + Intrinsic::getOrInsertDeclaration(M, IID, {DestTy, SrcTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; @@ -391,7 +392,8 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_gather: { Type *RetTy = II->getType(); Type *NewPtrTy = NewV->getType(); - Function *NewDecl = Intrinsic::getDeclaration(M, IID, {RetTy, NewPtrTy}); + Function *NewDecl = + Intrinsic::getOrInsertDeclaration(M, IID, {RetTy, NewPtrTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; @@ -400,16 +402,16 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_scatter: { Type *ValueTy = II->getOperand(0)->getType(); Type *NewPtrTy = NewV->getType(); - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {ValueTy, NewPtrTy}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {ValueTy, NewPtrTy}); II->setArgOperand(1, NewV); II->setCalledFunction(NewDecl); return true; } case Intrinsic::prefetch: case Intrinsic::is_constant: { - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {NewV->getType()}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {NewV->getType()}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index d85166e518f1db..4043c0e9a7ddc4 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -405,7 +405,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(P.InsertPt); Module 
*M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( + Function *PrefetchFunc = Intrinsic::getOrInsertDeclaration( M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index d5e91d3c1decf8..30369ed7c245cf 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -978,8 +978,8 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, assert(match(Br->getCondition(), m_Zero()) && "Expected branch condition to be false"); IRBuilder<> Builder(Br); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, - FI.OuterTripCount->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::umul_with_overflow, FI.OuterTripCount->getType()); Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount}, "flatten.mul"); FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 740e1e39b9ee77..56006d9ae6924a 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -2122,7 +2122,7 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Type *Tys[] = {Val->getType()}; Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + Function *Func = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); @@ -2136,7 +2136,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Type *Tys[] = {Val->getType()}; Module *M = 
IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, IID, Tys); + Function *Func = Intrinsic::getOrInsertDeclaration(M, IID, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index a59ecdda1746f9..ce35349376c483 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -44,7 +44,7 @@ static bool lowerGuardIntrinsic(Function &F) { if (ToLower.empty()) return false; - auto *DeoptIntrinsic = Intrinsic::getDeclaration( + auto *DeoptIntrinsic = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 0d98e844cf91ea..a4ab288b1bfee8 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1290,7 +1290,7 @@ class LowerMatrixIntrinsics { if (AllowContraction) { // Use fmuladd for floating point operations and let the backend decide // if that's profitable. 
- Function *FMulAdd = Intrinsic::getDeclaration( + Function *FMulAdd = Intrinsic::getOrInsertDeclaration( Func.getParent(), Intrinsic::fmuladd, A->getType()); return Builder.CreateCall(FMulAdd, {A, B, Sum}); } diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index aea17aa82a88a4..b9f88ba4e0780e 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -69,7 +69,7 @@ static bool explicifyGuards(Function &F) { if (GuardIntrinsics.empty()) return false; - auto *DeoptIntrinsic = Intrinsic::getDeclaration( + auto *DeoptIntrinsic = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index b568811dcdbcac..557a75e8946dc3 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1855,8 +1855,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { // If not, then we know we can transform this. Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType()}; - M->setCalledFunction( - Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys)); + M->setCalledFunction(Intrinsic::getOrInsertDeclaration( + M->getModule(), Intrinsic::memcpy, ArgTys)); // For MemorySSA nothing really changes (except that memcpy may imply stricter // aliasing guarantees). 
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index e3c12c971b9ab0..daf8fa28a71e59 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1525,8 +1525,8 @@ static void CreateGCRelocates(ArrayRef LiveVariables, if (auto *VT = dyn_cast(Ty)) NewTy = FixedVectorType::get(NewTy, cast(VT)->getNumElements()); - return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, - {NewTy}); + return Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_relocate, {NewTy}); }; // Lazily populated map from input types to the canonicalized form mentioned diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 72728c0f839e5d..b1e4c7e52d99a0 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -733,7 +733,8 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ValueVector Res(VS->NumFragments); ValueVector ScalarCallOps(NumArgs); - Function *NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Function *NewIntrin = + Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); IRBuilder<> Builder(&CI); // Perform actual scalarization, taking care to preserve any scalar operands. 
@@ -756,7 +757,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { } if (IsRemainder) - NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewIntrin = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); Res[I] = Builder.CreateCall(NewIntrin, ScalarCallOps, CI.getName() + ".i" + Twine(I)); diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 3cf68e07da5be2..e1dd20478fd55f 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -225,7 +225,8 @@ struct AssumeBuilderState { return nullptr; if (!DebugCounter::shouldExecute(BuildAssumeCounter)) return nullptr; - Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + Function *FnAssume = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); LLVMContext &C = M->getContext(); SmallVector OpBundle; for (auto &MapElem : AssumedKnowledgeMap) { diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index fc03643e3542cc..c6ba85bd9e57d4 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -425,8 +425,8 @@ PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) { // Create intrinsic call. 
LLVMContext &Ctx = NewFunc->getContext(); - Function *IFn = - Intrinsic::getDeclaration(NewFunc->getParent(), CIID, TParams); + Function *IFn = Intrinsic::getOrInsertDeclaration(NewFunc->getParent(), + CIID, TParams); SmallVector Args; unsigned NumOperands = OldInst.getNumOperands(); if (isa(OldInst)) diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f58448dd9562d5..a090c5ed749205 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1124,7 +1124,8 @@ static void insertLifetimeMarkersSurroundingCall( TheCall->getFunction()) && "Input memory not defined in original function"); - Function *Func = Intrinsic::getDeclaration(M, MarkerFunc, Mem->getType()); + Function *Func = + Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType()); auto Marker = CallInst::Create(Func, {NegativeOne, Mem}); if (InsertBefore) Marker->insertBefore(TheCall); diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index d12c540f9a4d04..47bb31905d1ac8 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -63,7 +63,7 @@ static void insertCall(Function &CurFn, StringRef Func, Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false)); Instruction *RetAddr = CallInst::Create( - Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), ArrayRef(ConstantInt::get(Type::getInt32Ty(C), 0)), "", InsertionPt); RetAddr->setDebugLoc(DL); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 671b0d0822a5d9..110fd6de5c6968 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2090,7 +2090,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, if 
(IsUnsafeClaimRV) { Builder.SetInsertPoint(II); Function *IFn = - Intrinsic::getDeclaration(Mod, Intrinsic::objc_release); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_release); Builder.CreateCall(IFn, RetOpnd, ""); } II->eraseFromParent(); @@ -2125,7 +2125,8 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // matching autoreleaseRV or an annotated call in the callee. Emit a call // to objc_retain. Builder.SetInsertPoint(RI); - Function *IFn = Intrinsic::getDeclaration(Mod, Intrinsic::objc_retain); + Function *IFn = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_retain); Builder.CreateCall(IFn, RetOpnd, ""); } } @@ -3021,7 +3022,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, }); } else { SmallVector NormalReturns; - Function *NewDeoptIntrinsic = Intrinsic::getDeclaration( + Function *NewDeoptIntrinsic = Intrinsic::getOrInsertDeclaration( Caller->getParent(), Intrinsic::experimental_deoptimize, {Caller->getReturnType()}); diff --git a/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 11956816a6ec3f..e95a7a9ae525ac 100644 --- a/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -157,8 +157,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, BasicBlock *IBB = Builder.GetInsertBlock(); Function *F = IBB->getParent(); - Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - DivTy); + Function *CTLZ = + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, DivTy); // Our CFG is going to look like: // +---------------------+ diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index f3b8623ebb0f8f..06813bac7c781f 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -4141,7 +4141,8 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( else return false; - 
Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy); + Function *F = + Intrinsic::getOrInsertDeclaration(I->getModule(), Intrin, DemandedTy); Value *Provider = Res->Provider; // We may need to truncate the provider. diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp index 55f9400d93d79b..cd79600657032e 100644 --- a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp +++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp @@ -215,8 +215,8 @@ static bool runImpl(Module &M) { // If `__cxa_atexit` hits out-of-memory, trap, so that we don't misbehave. // This should be very rare, because if the process is running out of // memory before main has even started, something is wrong. - CallInst::Create(Intrinsic::getDeclaration(&M, Intrinsic::trap), "", - FailBB); + CallInst::Create(Intrinsic::getOrInsertDeclaration(&M, Intrinsic::trap), + "", FailBB); new UnreachableInst(C, FailBB); ReturnInst::Create(C, RetBB); diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 1cb1a7b396badc..77abf160dc70f9 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -268,7 +268,7 @@ bool isLifetimeIntrinsic(Value *V) { Value *readRegister(IRBuilder<> &IRB, StringRef Name) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ReadRegister = Intrinsic::getDeclaration( + Function *ReadRegister = Intrinsic::getOrInsertDeclaration( M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout())); MDNode *MD = MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)}); @@ -287,7 +287,7 @@ Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { Value *getFP(IRBuilder<> &IRB) { Function *F = IRB.GetInsertBlock()->getParent(); Module *M = F->getParent(); - auto *GetStackPointerFn = Intrinsic::getDeclaration( + auto *GetStackPointerFn = 
Intrinsic::getOrInsertDeclaration( M, Intrinsic::frameaddress, IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); return IRB.CreatePtrToInt( @@ -301,7 +301,7 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) { // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER // in Bionic's libc/private/bionic_tls.h. Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 8 * Slot); } diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 186e17e166ba3d..2415118cad6fb3 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -559,7 +559,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, if (isa(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getDeclaration( + Function *IF = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::ssa_copy, Op->getType()); if (NumDecls != F.getParent()->getNumNamedValues()) PI.CreatedDeclarations.insert(IF); @@ -575,7 +575,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, // directly before it, assume(i1 true) is not a useful fact. 
IRBuilder<> B(PAssume->AssumeInst->getNextNode()); auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getDeclaration( + Function *IF = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::ssa_copy, Op->getType()); if (NumDecls != F.getParent()->getNumNamedValues()) PI.CreatedDeclarations.insert(IF); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 1b7912fdf5e304..656bb1ebd1161e 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -444,7 +444,7 @@ struct PromoteMem2Reg { /// Given a LoadInst LI this adds assume(LI != null) after it. static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) { Function *AssumeIntrinsic = - Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume); + Intrinsic::getOrInsertDeclaration(LI->getModule(), Intrinsic::assume); ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI, Constant::getNullValue(LI->getType())); LoadNotNull->insertAfter(LI); diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 6e84965370b248..2700b4307308cb 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -151,7 +151,7 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { // GEP might not be immediately followed by a LOAD, like it can be hoisted // outside the loop or another instruction might be inserted them in between. 
Builder.SetInsertPoint(Load); - Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration( + Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration( &M, Intrinsic::load_relative, {Index->getType()}); // Create a call to load.relative intrinsic that computes the target address diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1ff3cd78aa9877..de1864ef5b8d9b 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2134,8 +2134,8 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, MulV = TruncTripCount; OfMul = ConstantInt::getFalse(MulV->getContext()); } else { - auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), - Intrinsic::umul_with_overflow, Ty); + auto *MulF = Intrinsic::getOrInsertDeclaration( + Loc->getModule(), Intrinsic::umul_with_overflow, Ty); CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e06ebb691d511c..db2acb9eed0938 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1960,7 +1960,7 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, if (IsIntrinsic) { Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); - Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IID, B.getFloatTy()); R = isBinary ? 
B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e2958c49b8ca9f..5c164075e83259 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15079,7 +15079,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { - CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); + CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl); } SmallVector OpBundles; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ba94cd29587664..2948ecc580edc0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -984,7 +984,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { // Use vector version of the intrinsic. 
Module *M = State.Builder.GetInsertBlock()->getModule(); Function *VectorF = - Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); + Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); auto *CI = cast_or_null(getUnderlyingValue()); diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp index fb8729c36a6f2d..0e2a6decfbc9d5 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp @@ -30,7 +30,7 @@ static bool shouldIgnoreArgument(const Value *V) { static Value *replaceIntrinsic(Module &M, IntrinsicInst *II, Intrinsic::ID NewIID, ArrayRef Tys = {}) { - Function *NewFunc = Intrinsic::getDeclaration(&M, NewIID, Tys); + Function *NewFunc = Intrinsic::getOrInsertDeclaration(&M, NewIID, Tys); II->setCalledFunction(NewFunc); return II; } diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index 6437e0c9491f7f..8ad15ca41510f2 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -430,7 +430,8 @@ static void RunRandTest(uint64_t Seed, int Size, int MinCount, int MaxCount, BB->insertInto(F); Instruction *Ret = ReturnInst::Create(C); Ret->insertInto(BB, BB->begin()); - Function *FnAssume = Intrinsic::getDeclaration(Mod.get(), Intrinsic::assume); + Function *FnAssume = + Intrinsic::getOrInsertDeclaration(Mod.get(), Intrinsic::assume); std::vector ShuffledArgs; BitVector HasArg; diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index 9e6c517242a269..81784bb2360975 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1120,7 +1120,7 @@ TEST_F(MemorySSATest, LifetimeMarkersAreClobbers) { B.CreateStore(B.getInt8(0), Bar); auto GetLifetimeIntrinsic = 
[&](Intrinsic::ID ID) { - return Intrinsic::getDeclaration(&M, ID, {Foo->getType()}); + return Intrinsic::getOrInsertDeclaration(&M, ID, {Foo->getType()}); }; B.CreateCall(GetLifetimeIntrinsic(Intrinsic::lifetime_end), diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 77d966155dceff..0145ee70a14c17 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -2481,8 +2481,8 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsAddWithRange) { TEST_F(ComputeKnownBitsTest, ComputeKnownBitsUnknownVScale) { Module M("", Context); IRBuilder<> Builder(Context); - Function *TheFn = - Intrinsic::getDeclaration(&M, Intrinsic::vscale, {Builder.getInt32Ty()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::vscale, + {Builder.getInt32Ty()}); CallInst *CI = Builder.CreateCall(TheFn, {}, {}, ""); KnownBits Known = computeKnownBits(CI, M.getDataLayout(), /* Depth */ 0); diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index eea2746a352aa6..88ac6611742ce9 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -109,8 +109,10 @@ TEST(BasicBlockTest, TestInstructionsWithoutDebug) { Argument *V = new Argument(Type::getInt32Ty(Ctx)); Function *F = Function::Create(FT, Function::ExternalLinkage, "", M); - Function *DbgDeclare = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); - Function *DbgValue = Intrinsic::getDeclaration(M, Intrinsic::dbg_value); + Function *DbgDeclare = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); + Function *DbgValue = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_value); Value *DIV = MetadataAsValue::get(Ctx, (Metadata *)nullptr); SmallVector Args = {DIV, DIV, DIV}; @@ -174,7 +176,7 @@ class InstrOrderInvalidationTest : public ::testing::Test { protected: void SetUp() override { M.reset(new Module("MyModule", Ctx)); - Nop = 
Intrinsic::getDeclaration(M.get(), Intrinsic::donothing); + Nop = Intrinsic::getOrInsertDeclaration(M.get(), Intrinsic::donothing); FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx), {}, false); Function *F = Function::Create(FT, Function::ExternalLinkage, "foo", *M); BB = BasicBlock::Create(Ctx, "entry", F); diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index 953df224e84dcb..ea20c87d6b09b4 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -693,7 +693,8 @@ TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) { std::unique_ptr BB(BasicBlock::Create(C, "start")); Module *M = new Module("module", C); IRBuilder<> Builder(BB.get()); - Function *DbgDeclare = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); + Function *DbgDeclare = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); Value *DIV = MetadataAsValue::get(C, (Metadata *)nullptr); SmallVector Args = {DIV, DIV, DIV}; Builder.CreateCall(DbgDeclare, Args); diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index d5239f21147cdb..690af62d18020d 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -413,8 +413,9 @@ TEST_F(IRBuilderTest, ConstrainedFPIntrinsics) { Builder.setDefaultConstrainedExcept(fp::ebStrict); Builder.setDefaultConstrainedRounding(RoundingMode::TowardZero); - Function *Fn = Intrinsic::getDeclaration(M.get(), - Intrinsic::experimental_constrained_roundeven, { Type::getDoubleTy(Ctx) }); + Function *Fn = Intrinsic::getOrInsertDeclaration( + M.get(), Intrinsic::experimental_constrained_roundeven, + {Type::getDoubleTy(Ctx)}); V = Builder.CreateConstrainedFPCall(Fn, { VDouble }); CII = cast(V); EXPECT_EQ(Intrinsic::experimental_constrained_roundeven, CII->getIntrinsicID()); diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp index 0c4af28a2ab57b..7fe0bd79b80a60 100644 --- 
a/llvm/unittests/IR/IntrinsicsTest.cpp +++ b/llvm/unittests/IR/IntrinsicsTest.cpp @@ -50,7 +50,7 @@ class IntrinsicsTest : public ::testing::Test { Instruction *makeIntrinsic(Intrinsic::ID ID) const { IRBuilder<> Builder(BB); SmallVector ProcessedArgs; - auto *Decl = Intrinsic::getDeclaration(M.get(), ID); + auto *Decl = Intrinsic::getOrInsertDeclaration(M.get(), ID); for (auto *Ty : Decl->getFunctionType()->params()) { auto *Val = Constant::getNullValue(Ty); ProcessedArgs.push_back(Val); diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 13f121a2b9c7dd..7dc4b9f448d386 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1766,7 +1766,7 @@ TEST_F(PatternMatchTest, IntrinsicMatcher) { Value *Ops[] = {Name, Hash, Num, Index, Step}; Module *M = BB->getParent()->getParent(); Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step); Value *Intrinsic5 = CallInst::Create(TheFn, Ops, "", BB); diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index 925a69bafa07ef..d6ad7599ce4610 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -420,7 +420,7 @@ TEST_F(VPIntrinsicTest, VPToNonPredIntrinsicRoundTrip) { ASSERT_TRUE(IsFullTrip); } -/// Check that VPIntrinsic::getDeclarationForParams works. +/// Check that VPIntrinsic::getOrInsertDeclarationForParams works. 
TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { std::unique_ptr M = createVPDeclarationModule(); assert(M); @@ -436,7 +436,7 @@ TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { Values.push_back(UndefValue::get(ParamTy)); ASSERT_NE(F.getIntrinsicID(), Intrinsic::not_intrinsic); - auto *NewDecl = VPIntrinsic::getDeclarationForParams( + auto *NewDecl = VPIntrinsic::getOrInsertDeclarationForParams( OutM.get(), F.getIntrinsicID(), FuncTy->getReturnType(), Values); ASSERT_TRUE(NewDecl); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 372c5aaea59382..376b00224eb574 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1195,7 +1195,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { // Test for a call to a function without side-effects. LLVMContext C; Module M("", C); - Function *TheFn = Intrinsic::getDeclaration(&M, Intrinsic::thread_pointer); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); auto *Call = CallInst::Create(TheFn->getFunctionType(), TheFn); VPValue Op1; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 5031426033aea1..448a171cf3e412 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -347,7 +347,7 @@ class LLVM_ConstrainedIntrgetType(); }); llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::Function *callee = - llvm::Intrinsic::getDeclaration(module, + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::experimental_constrained_}] # mnem # [{, overloadedTypes); }] # !cond(!gt(hasRoundingMode, 0) : [{ @@ -541,7 +541,7 @@ class LLVM_DbgIntrOp traits = []> llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::LLVMContext &ctx = module->getContext(); 
llvm::Function *fn = - llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::}] + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::}] # !subst(".", "_", name) # [{); builder.CreateCall(fn, { llvm::MetadataAsValue::get(ctx, @@ -594,7 +594,7 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::LLVMContext &ctx = module->getContext(); llvm::Function *fn = - llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::dbg_label); + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::dbg_label); builder.CreateCall(fn, { llvm::MetadataAsValue::get(ctx, moduleTranslation.translateDebugInfo($label)) }); diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 46b7b0a473c692..a8595d14ccf2e5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -99,7 +99,8 @@ getOverloadedDeclaration(CallIntrinsicOp op, llvm::Intrinsic::ID id, } ArrayRef overloadedArgTysRef = overloadedArgTys; - return llvm::Intrinsic::getDeclaration(module, id, overloadedArgTysRef); + return llvm::Intrinsic::getOrInsertDeclaration(module, id, + overloadedArgTysRef); } static llvm::OperandBundleDef @@ -143,7 +144,7 @@ convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, return failure(); fn = *fnOrFailure; } else { - fn = llvm::Intrinsic::getDeclaration(module, id, {}); + fn = llvm::Intrinsic::getOrInsertDeclaration(module, id, {}); } // Check the result type of the call. 
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index a5de90160c4145..add0a31c114f8d 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -839,7 +839,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::IRBuilderBase &builder, llvm::Intrinsic::ID intrinsic, ArrayRef args, ArrayRef tys) { llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *fn = llvm::Intrinsic::getDeclaration(module, intrinsic, tys); + llvm::Function *fn = + llvm::Intrinsic::getOrInsertDeclaration(module, intrinsic, tys); return builder.CreateCall(fn, args); } @@ -886,8 +887,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( for (unsigned overloadedOperandIdx : overloadedOperands) overloadedTypes.push_back(args[overloadedOperandIdx]->getType()); llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *llvmIntr = - llvm::Intrinsic::getDeclaration(module, intrinsic, overloadedTypes); + llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( + module, intrinsic, overloadedTypes); return builder.CreateCall(llvmIntr, args); } diff --git a/polly/lib/CodeGen/IslExprBuilder.cpp b/polly/lib/CodeGen/IslExprBuilder.cpp index aaafac14bf8065..1688c41c624b24 100644 --- a/polly/lib/CodeGen/IslExprBuilder.cpp +++ b/polly/lib/CodeGen/IslExprBuilder.cpp @@ -129,16 +129,16 @@ Value *IslExprBuilder::createBinOp(BinaryOperator::BinaryOps Opc, Value *LHS, Module *M = Builder.GetInsertBlock()->getModule(); switch (Opc) { case Instruction::Add: - F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sadd_with_overflow, + {LHS->getType()}); break; case Instruction::Sub: - F = Intrinsic::getDeclaration(M, Intrinsic::ssub_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ssub_with_overflow, + 
{LHS->getType()}); break; case Instruction::Mul: - F = Intrinsic::getDeclaration(M, Intrinsic::smul_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::smul_with_overflow, + {LHS->getType()}); break; default: llvm_unreachable("No overflow intrinsic for binary operator found!"); diff --git a/polly/lib/CodeGen/PerfMonitor.cpp b/polly/lib/CodeGen/PerfMonitor.cpp index 3cad8537f3ee19..1a791614685443 100644 --- a/polly/lib/CodeGen/PerfMonitor.cpp +++ b/polly/lib/CodeGen/PerfMonitor.cpp @@ -59,7 +59,7 @@ void PerfMonitor::addToGlobalConstructors(Function *Fn) { } Function *PerfMonitor::getRDTSCP() { - return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp); + return Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_rdtscp); } PerfMonitor::PerfMonitor(const Scop &S, Module *M) From c84f75966af79a381e27e6ffc9481c1fae2fcb4f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 11 Oct 2024 05:38:17 -0700 Subject: [PATCH 161/177] [libc] Fix compilation of new trig functions (#111987) --- libc/src/math/generic/cos.cpp | 2 +- libc/src/math/generic/range_reduction_double_common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index 923ea96852d889..568b1254c6f02b 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -93,7 +93,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { } return ans; }; - DoubleDouble sin_k = get_idx_dd(k + 128); + DoubleDouble msin_k = get_idx_dd(k + 128); DoubleDouble cos_k = get_idx_dd(k + 64); #else // Fast look up version, but needs 256-entry table. 
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index e23bbff144bee8..bcab82f6c9c3a8 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -278,6 +278,7 @@ struct LargeRangeReduction { DoubleDouble y_mid; }; +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS static Float128 range_reduction_small_f128(double x) { constexpr Float128 PI_OVER_128_F128 = { Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; @@ -300,7 +301,6 @@ static Float128 range_reduction_small_f128(double x) { return fputil::quick_mul(y, PI_OVER_128_F128); } -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS static constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, 0, 0}, {Sign::POS, -133, 0xc90a'afbd'1b33'efc9'c539'edcb'fda0'cf2c_u128}, From 26b832a9ec03d0a35baaf00d81f607004fe2a8cf Mon Sep 17 00:00:00 2001 From: Daniel Mokeev Date: Fri, 11 Oct 2024 14:41:47 +0200 Subject: [PATCH 162/177] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr X, Y)) into orc.b (#111828) This patch generalizes the DAG combine for `(sub (shl X, 8), X) => (orc.b X)` into the more general form of `(sub (shl X, 8 - Y), (srl X, Y)) => (orc.b X)`. 
Alive2 generalized proof: https://alive2.llvm.org/ce/z/dFcf_n Related issue: https://github.com/llvm/llvm-project/issues/96595 Related PR: https://github.com/llvm/llvm-project/pull/96680 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 44 ++- llvm/test/CodeGen/RISCV/orc-b-patterns.ll | 372 ++++++++++++++++++++ 2 files changed, 408 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/orc-b-patterns.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1f9fc984515cf6..e71c8c3dc1c759 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13569,8 +13569,10 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS); } -// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are -// non-zero. Replace with orc.b. +// Looks for (sub (shl X, 8-Y), (shr X, Y)) where the Y-th bit in each byte is +// potentially set. It is fine for Y to be 0, meaning that (sub (shl X, 8), X) +// is also valid. Replace with (orc.b X). For example, 0b0000_1000_0000_1000 is +// valid with Y=3, while 0b0000_1000_0000_0100 is not. 
static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { if (!Subtarget.hasStdExtZbb()) @@ -13584,18 +13586,44 @@ static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse()) + if (N0->getOpcode() != ISD::SHL) return SDValue(); - auto *ShAmtC = dyn_cast(N0.getOperand(1)); - if (!ShAmtC || ShAmtC->getZExtValue() != 8) + auto *ShAmtCLeft = dyn_cast(N0.getOperand(1)); + if (!ShAmtCLeft) return SDValue(); + unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue(); - APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe)); - if (!DAG.MaskedValueIsZero(N1, Mask)) + if (ShiftedAmount >= 8) return SDValue(); - return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1); + SDValue LeftShiftOperand = N0->getOperand(0); + SDValue RightShiftOperand = N1; + + if (ShiftedAmount != 0) { // Right operand must be a right shift. + if (N1->getOpcode() != ISD::SRL) + return SDValue(); + auto *ShAmtCRight = dyn_cast(N1.getOperand(1)); + if (!ShAmtCRight || ShAmtCRight->getZExtValue() != ShiftedAmount) + return SDValue(); + RightShiftOperand = N1.getOperand(0); + } + + // At least one shift should have a single use. + if (!N0.hasOneUse() && (ShiftedAmount == 0 || !N1.hasOneUse())) + return SDValue(); + + if (LeftShiftOperand != RightShiftOperand) + return SDValue(); + + APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1)); + Mask <<= ShiftedAmount; + // Check that X has indeed the right shape (only the Y-th bit can be set in + // every byte). 
+ if (!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask)) + return SDValue(); + + return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand); } static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll new file mode 100644 index 00000000000000..184e66c14b33fc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll @@ -0,0 +1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32I +; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB + +define i32 @orc_b_i32_mul255(i32 %x) nounwind { +; RV32I-LABEL: orc_b_i32_mul255: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_mul255: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 4112 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %mul = mul nuw nsw i32 %and, 255 + ret i32 %mul +} + + +define i32 @orc_b_i32_sub_shl8x_x_lsb(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 4112 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + +define i32 
@orc_b_i32_sub_shl8x_x_lsb_preshifted(i32 %x){ +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: srli a0, a0, 11 +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: srli a0, a0, 11 +; RV32ZBB-NEXT: lui a1, 16 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %shr = lshr i32 %x, 11 + %and = and i32 %shr, 16843009 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b1(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 8224 +; RV32I-NEXT: addi a1, a1, 514 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 8224 +; RV32ZBB-NEXT: addi a1, a1, 514 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b2(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 16448 +; RV32I-NEXT: addi a1, a1, 1028 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 6 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b2: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 16448 +; RV32ZBB-NEXT: addi a1, a1, 1028 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 67372036 + %shl = shl 
i32 %and, 6 + %shr = lshr exact i32 %and, 2 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b3(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 24672 +; CHECK-NEXT: addi a1, a1, 1542 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 5 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 101058054 + %shl = shl nuw i32 %and, 5 + %shr = lshr i32 %and, 3 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b4(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 32897 +; CHECK-NEXT: addi a1, a1, -2040 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: srli a0, a0, 4 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 134744072 + %shl = shl nuw i32 %and, 4 + %shr = lshr i32 %and, 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b5(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 65793 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 5 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 269488144 + %shl = shl nuw i32 %and, 3 + %shr = lshr i32 %and, 5 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b6(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 131586 +; CHECK-NEXT: addi a1, a1, 32 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 6 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 538976288 + %shl = shl nuw i32 %and, 2 + %shr = lshr i32 %and, 6 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 
@orc_b_i32_sub_shl8x_x_b7(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 263172 +; CHECK-NEXT: addi a1, a1, 64 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: srli a0, a0, 7 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 1077952576 + %shl = shl nuw i32 %and, 1 + %shr = lshr i32 %and, 7 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 8224 +; RV32I-NEXT: addi a2, a2, 514 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: srli a3, a0, 1 +; RV32I-NEXT: sub a0, a2, a3 +; RV32I-NEXT: sw a3, 0(a1) +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a2, 8224 +; RV32ZBB-NEXT: addi a2, a2, 514 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: srli a2, a0, 1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shr, ptr %arr, align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 8224 +; RV32I-NEXT: addi a2, a2, 514 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a2, 8224 +; RV32ZBB-NEXT: addi a2, a2, 514 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: slli a2, a0, 7 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: ret +entry: + %and = and i32 
%x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shl, ptr %arr, align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b1_not_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 8224 +; RV32I-NEXT: addi a1, a1, 514 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 8224 +; RV32ZBB-NEXT: addi a1, a1, 514 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32 %x, ptr %arr){ +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_shl_used: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 4112 +; CHECK-NEXT: addi a2, a2, 257 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 8 +; CHECK-NEXT: sub a0, a2, a0 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %shl = shl i32 %and, 8 + store i32 %shl, ptr %arr, align 4 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32 %x, ptr %arr) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b1_both_used: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 8224 +; CHECK-NEXT: addi a2, a2, 514 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 7 +; CHECK-NEXT: srli a3, a0, 1 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: sub a0, a2, a3 +; CHECK-NEXT: sw a3, 4(a1) +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shl, ptr %arr, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %arr, i32 4 + store i32 %shr, ptr %arrayidx1, 
align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_x_shr8x(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_x_shr8x: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 4112 +; CHECK-NEXT: addi a1, a1, 257 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: srli a1, a0, 8 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %shr = lshr i32 %and, 8 + %sub = sub nsw i32 %and, %shr + ret i32 %sub +} From 9a696b68b735fa01276d16d39370f9102fee4a0b Mon Sep 17 00:00:00 2001 From: Emilio Cota Date: Fri, 11 Oct 2024 08:18:11 -0400 Subject: [PATCH 163/177] Revert "[NVPTX] Prefer prmt.b32 over bfi.b32 (#110766)" This reverts commit 3f9998af4f79e95fe8be615df9d6b898008044b9. It breaks downstream tests with egregious numerical differences. Unfortunately no upstream tests are broken, but the fact that a prior iteration of the commit (pre-optimization) does work with our downstream tests (coming from the Triton repo) supports the claim that the final version of the commit is incorrect. Reverting now so that the original author can evaluate. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 614 +++++++++---------- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 18 +- 3 files changed, 328 insertions(+), 335 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d95f8f214be557..57bc5fe0ac361c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2332,23 +2332,20 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us // to optimize calculation of constant parts. 
if (VT == MVT::v4i8) { - SDValue PRMT__10 = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), - DAG.getConstant(0x3340, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - SDValue PRMT32__ = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), - DAG.getConstant(0x4033, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - SDValue PRMT3210 = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {PRMT__10, PRMT32__, DAG.getConstant(0x5410, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + SDValue C8 = DAG.getConstant(8, DL, MVT::i32); + SDValue E01 = DAG.getNode( + NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); + SDValue E012 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + E01, DAG.getConstant(16, DL, MVT::i32), C8); + SDValue E0123 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + E012, DAG.getConstant(24, DL, MVT::i32), C8); + return DAG.getNode(ISD::BITCAST, DL, VT, E0123); } return Op; } diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 84dde539ce4c47..96a4359d0ec43e 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_add( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, 
[test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b @@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_0( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 
%r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 4; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 3; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 2; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> , %a @@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_1( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 4; +; CHECK-NEXT: add.s16 %rs2, 
%rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 3; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 2; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, @@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_sub( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 
16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = sub <4 x i8> %a, %b @@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 
%r6, %r1, 16, 8; ; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b @@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b @@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: 
bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b @@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, 
<4 x i8> %b) #0 { ; CHECK-LABEL: test_umin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b @@ -407,35 +407,35 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_eq( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b @@ -447,35 +447,35 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_ne( 
; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; 
CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b @@ -487,38 +487,38 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_mul( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; 
CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b @@ -548,13 +548,12 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -614,13 +613,12 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: xor.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -680,13 +678,12 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; -; 
CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: and.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -929,40 +926,40 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-LABEL: test_select_cc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<28>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; ; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; -; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.u32 
%r13, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8; -; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1; -; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 13120; -; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; +; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d @@ -1009,32 +1006,32 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-LABEL: test_select_cc_i8_i32( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; -; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; -; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; -; CHECK-NEXT: setp.ne.s32 %p4, %r6, %r10; -; 
CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8; +; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; +; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; +; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 21520; +; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { @@ -1047,13 +1044,13 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-LABEL: test_trunc_2xi32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; -; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 16435; -; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 13120; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 21520; +; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; +; 
CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; +; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> @@ -1063,19 +1060,19 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-LABEL: test_trunc_2xi64( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; -; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 16435; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 13120; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 21520; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; +; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; +; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; +; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; +; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> @@ -1187,16 +1184,15 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 %r1, 6; -; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 5, 
%r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1259,27 +1255,27 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> @@ -1290,27 +1286,27 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg 
.b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> @@ -1330,33 +1326,33 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; ; 
CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11; ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs10, %r13; -; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs10, %r14; +; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r15; ; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: @@ -1377,7 +1373,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1396,25 +1392,25 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 
0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 13120; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8; ; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: // implicit-def: %r14 -; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 16435; -; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 21520; +; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8; +; CHECK-NEXT: // implicit-def: %r15 +; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } +; CHECK-NEXT: cvt.u16.u32 %rs18, %r14; ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; @@ -1441,25 +1437,25 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; 
CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2; -; CHECK-NEXT: selp.s32 %r15, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index 8b7e5235443f05..f471d47077cf0d 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -33,35 +33,35 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; -; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; ; CHECK-NEXT: and.b16 %rs4, %rs3, 255; ; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; ; 
CHECK-NEXT: and.b16 %rs8, %rs7, 255; ; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; ; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; ; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; -; CHECK-NEXT: selp.s32 %r10, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r9, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r8, 21520; +; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; +; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; entry: From c8da2253f9aa4dff039e9ed766ff0f865632a0eb Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:45:09 -0700 Subject: [PATCH 164/177] [Clang] Replace Intrinsic::getDeclaration with getOrInsertDeclaration (#111990) Fix build failure from the rename change. Looks like one additional reference sneaked in between pre-commit checks and the commit itself. --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 059c75fae284dd..465afd04740d89 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18882,7 +18882,7 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { // for the DirectX intrinsic and the demangled builtin name switch (CGM.getTarget().getTriple().getArch()) { case llvm::Triple::dxil: - return EmitRuntimeCall(Intrinsic::getDeclaration( + return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( &CGM.getModule(), Intrinsic::dx_wave_getlaneindex)); case llvm::Triple::spirv: return EmitRuntimeCall(CGM.CreateRuntimeFunction( From ed7251b3aeb7c471dc50e9409e83a9ec01f40df5 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 11 Oct 2024 14:46:46 +0200 Subject: [PATCH 165/177] Revert "[clang] Implement TTP P0522 pack matching for deduced function template calls. 
(#111457)" See discussion in https://github.com/llvm/llvm-project/pull/111711 This reverts commit 4dadf42c1a74dd4e37db9ffd6fbb3027f59751a7. --- clang/include/clang/Sema/Overload.h | 10 +-- clang/include/clang/Sema/Sema.h | 23 +++---- clang/lib/Sema/SemaLookup.cpp | 1 - clang/lib/Sema/SemaOverload.cpp | 50 ++++++--------- clang/lib/Sema/SemaTemplate.cpp | 23 ++++--- clang/lib/Sema/SemaTemplateDeduction.cpp | 70 +++++++++----------- clang/test/SemaTemplate/cwg2398.cpp | 81 ------------------------ 7 files changed, 69 insertions(+), 189 deletions(-) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index d38278c5041118..c716a25bb673b8 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -925,11 +925,6 @@ class Sema; bool TookAddressOfOverload : 1; - /// Have we matched any packs on the parameter side, versus any non-packs on - /// the argument side, in a context where the opposite matching is also - /// allowed? - bool HasMatchedPackOnParmToNonPackOnArg : 1; - /// True if the candidate was found using ADL. 
CallExpr::ADLCallKind IsADLCandidate : 1; @@ -1004,9 +999,8 @@ class Sema; friend class OverloadCandidateSet; OverloadCandidate() : IsSurrogate(false), IgnoreObjectArgument(false), - TookAddressOfOverload(false), - HasMatchedPackOnParmToNonPackOnArg(false), - IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {} + TookAddressOfOverload(false), IsADLCandidate(CallExpr::NotADL), + RewriteKind(CRK_None) {} }; /// OverloadCandidateSet - A set of overload candidates, used in C++ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f8118ca64ad3f2..66b0846f286a81 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10134,8 +10134,7 @@ class Sema final : public SemaBase { ADLCallKind IsADLCandidate = ADLCallKind::NotADL, ConversionSequenceList EarlyConversions = std::nullopt, OverloadCandidateParamOrder PO = {}, - bool AggregateCandidateDeduction = false, - bool HasMatchedPackOnParmToNonPackOnArg = false); + bool AggregateCandidateDeduction = false); /// Add all of the function declarations in the given function set to /// the overload candidate set. 
@@ -10170,8 +10169,7 @@ class Sema final : public SemaBase { bool SuppressUserConversions = false, bool PartialOverloading = false, ConversionSequenceList EarlyConversions = std::nullopt, - OverloadCandidateParamOrder PO = {}, - bool HasMatchedPackOnParmToNonPackOnArg = false); + OverloadCandidateParamOrder PO = {}); /// Add a C++ member function template as a candidate to the candidate /// set, using template argument deduction to produce an appropriate member @@ -10217,8 +10215,7 @@ class Sema final : public SemaBase { CXXConversionDecl *Conversion, DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, Expr *From, QualType ToType, OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion = true, - bool HasMatchedPackOnParmToNonPackOnArg = false); + bool AllowExplicit, bool AllowResultConversion = true); /// Adds a conversion function template specialization /// candidate to the overload set, using template argument deduction @@ -11641,7 +11638,7 @@ class Sema final : public SemaBase { SourceLocation RAngleLoc, unsigned ArgumentPackIndex, SmallVectorImpl &SugaredConverted, SmallVectorImpl &CanonicalConverted, - CheckTemplateArgumentKind CTAK, bool PartialOrdering, + CheckTemplateArgumentKind CTAK, bool *MatchedPackOnParmToNonPackOnArg); /// Check that the given template arguments can be provided to @@ -11724,8 +11721,7 @@ class Sema final : public SemaBase { /// It returns true if an error occurred, and false otherwise. 
bool CheckTemplateTemplateArgument(TemplateTemplateParmDecl *Param, TemplateParameterList *Params, - TemplateArgumentLoc &Arg, - bool PartialOrdering, + TemplateArgumentLoc &Arg, bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg); void NoteTemplateLocation(const NamedDecl &Decl, @@ -12237,8 +12233,8 @@ class Sema final : public SemaBase { SmallVectorImpl &Deduced, unsigned NumExplicitlySpecified, FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info, - SmallVectorImpl const *OriginalCallArgs, - bool PartialOverloading, bool PartialOrdering, + SmallVectorImpl const *OriginalCallArgs = nullptr, + bool PartialOverloading = false, llvm::function_ref CheckNonDependent = [] { return false; }); /// Perform template argument deduction from a function call @@ -12272,8 +12268,7 @@ class Sema final : public SemaBase { TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info, bool PartialOverloading, bool AggregateDeductionCandidate, - bool PartialOrdering, QualType ObjectType, - Expr::Classification ObjectClassification, + QualType ObjectType, Expr::Classification ObjectClassification, llvm::function_ref)> CheckNonDependent); /// Deduce template arguments when taking the address of a function @@ -12428,7 +12423,7 @@ class Sema final : public SemaBase { bool isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateParameterList *PParam, TemplateDecl *PArg, TemplateDecl *AArg, const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, - bool PartialOrdering, bool *MatchedPackOnParmToNonPackOnArg); + bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg); /// Mark which template parameters are used in a given expression. 
/// diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 60fa195221c938..31422c213ac249 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -3667,7 +3667,6 @@ Sema::LookupLiteralOperator(Scope *S, LookupResult &R, if (CheckTemplateArgument( Params->getParam(0), Arg, FD, R.getNameLoc(), R.getNameLoc(), 0, SugaredChecked, CanonicalChecked, CTAK_Specified, - /*PartialOrdering=*/false, /*MatchedPackOnParmToNonPackOnArg=*/nullptr) || Trap.hasErrorOccurred()) IsTemplate = false; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index f545e9341e1ae6..2cde8131108fbe 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -6864,8 +6864,7 @@ void Sema::AddOverloadCandidate( OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, bool PartialOverloading, bool AllowExplicit, bool AllowExplicitConversions, ADLCallKind IsADLCandidate, ConversionSequenceList EarlyConversions, - OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction, - bool HasMatchedPackOnParmToNonPackOnArg) { + OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { const FunctionProtoType *Proto = dyn_cast(Function->getType()->getAs()); assert(Proto && "Functions without a prototype cannot be overloaded"); @@ -6884,8 +6883,7 @@ void Sema::AddOverloadCandidate( AddMethodCandidate(Method, FoundDecl, Method->getParent(), QualType(), Expr::Classification::makeSimpleLValue(), Args, CandidateSet, SuppressUserConversions, - PartialOverloading, EarlyConversions, PO, - HasMatchedPackOnParmToNonPackOnArg); + PartialOverloading, EarlyConversions, PO); return; } // We treat a constructor like a non-member function, since its object @@ -6928,8 +6926,6 @@ void Sema::AddOverloadCandidate( CandidateSet.getRewriteInfo().getRewriteKind(Function, PO); Candidate.IsADLCandidate = IsADLCandidate; Candidate.ExplicitCallArguments = Args.size(); - 
Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; // Explicit functions are not actually candidates at all if we're not // allowing them in this context, but keep them around so we can point @@ -7457,13 +7453,16 @@ void Sema::AddMethodCandidate(DeclAccessPair FoundDecl, QualType ObjectType, } } -void Sema::AddMethodCandidate( - CXXMethodDecl *Method, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, QualType ObjectType, - Expr::Classification ObjectClassification, ArrayRef Args, - OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, ConversionSequenceList EarlyConversions, - OverloadCandidateParamOrder PO, bool HasMatchedPackOnParmToNonPackOnArg) { +void +Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, QualType ObjectType, + Expr::Classification ObjectClassification, + ArrayRef Args, + OverloadCandidateSet &CandidateSet, + bool SuppressUserConversions, + bool PartialOverloading, + ConversionSequenceList EarlyConversions, + OverloadCandidateParamOrder PO) { const FunctionProtoType *Proto = dyn_cast(Method->getType()->getAs()); assert(Proto && "Methods without a prototype cannot be overloaded"); @@ -7494,8 +7493,6 @@ void Sema::AddMethodCandidate( Candidate.TookAddressOfOverload = CandidateSet.getKind() == OverloadCandidateSet::CSK_AddressOfOverloadSet; Candidate.ExplicitCallArguments = Args.size(); - Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; bool IgnoreExplicitObject = (Method->isExplicitObjectMemberFunction() && @@ -7666,8 +7663,8 @@ void Sema::AddMethodTemplateCandidate( ConversionSequenceList Conversions; if (TemplateDeductionResult Result = DeduceTemplateArguments( MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info, - PartialOverloading, /*AggregateDeductionCandidate=*/false, - /*PartialOrdering=*/false, ObjectType, ObjectClassification, + PartialOverloading, 
/*AggregateDeductionCandidate=*/false, ObjectType, + ObjectClassification, [&](ArrayRef ParamTypes) { return CheckNonDependentConversions( MethodTmpl, ParamTypes, Args, CandidateSet, Conversions, @@ -7705,8 +7702,7 @@ void Sema::AddMethodTemplateCandidate( AddMethodCandidate(cast(Specialization), FoundDecl, ActingContext, ObjectType, ObjectClassification, Args, CandidateSet, SuppressUserConversions, PartialOverloading, - Conversions, PO, - Info.hasMatchedPackOnParmToNonPackOnArg()); + Conversions, PO); } /// Determine whether a given function template has a simple explicit specifier @@ -7752,7 +7748,6 @@ void Sema::AddTemplateOverloadCandidate( if (TemplateDeductionResult Result = DeduceTemplateArguments( FunctionTemplate, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, AggregateCandidateDeduction, - /*PartialOrdering=*/false, /*ObjectType=*/QualType(), /*ObjectClassification=*/Expr::Classification(), [&](ArrayRef ParamTypes) { @@ -7793,8 +7788,7 @@ void Sema::AddTemplateOverloadCandidate( Specialization, FoundDecl, Args, CandidateSet, SuppressUserConversions, PartialOverloading, AllowExplicit, /*AllowExplicitConversions=*/false, IsADLCandidate, Conversions, PO, - Info.AggregateDeductionCandidateHasMismatchedArity, - Info.hasMatchedPackOnParmToNonPackOnArg()); + Info.AggregateDeductionCandidateHasMismatchedArity); } bool Sema::CheckNonDependentConversions( @@ -7916,8 +7910,7 @@ void Sema::AddConversionCandidate( CXXConversionDecl *Conversion, DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, Expr *From, QualType ToType, OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion, - bool HasMatchedPackOnParmToNonPackOnArg) { + bool AllowExplicit, bool AllowResultConversion) { assert(!Conversion->getDescribedFunctionTemplate() && "Conversion function templates use AddTemplateConversionCandidate"); QualType ConvType = Conversion->getConversionType().getNonReferenceType(); @@ 
-7962,8 +7955,6 @@ void Sema::AddConversionCandidate( Candidate.FinalConversion.setAllToTypes(ToType); Candidate.Viable = true; Candidate.ExplicitCallArguments = 1; - Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; // Explicit functions are not actually candidates at all if we're not // allowing them in this context, but keep them around so we can point @@ -8165,8 +8156,7 @@ void Sema::AddTemplateConversionCandidate( assert(Specialization && "Missing function template specialization?"); AddConversionCandidate(Specialization, FoundDecl, ActingDC, From, ToType, CandidateSet, AllowObjCConversionOnExplicit, - AllowExplicit, AllowResultConversion, - Info.hasMatchedPackOnParmToNonPackOnArg()); + AllowExplicit, AllowResultConversion); } void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion, @@ -10519,10 +10509,6 @@ bool clang::isBetterOverloadCandidate( isa(Cand2.Function)) return isa(Cand1.Function); - if (Cand1.HasMatchedPackOnParmToNonPackOnArg != - Cand2.HasMatchedPackOnParmToNonPackOnArg) - return Cand2.HasMatchedPackOnParmToNonPackOnArg; - // -- F1 is a non-template function and F2 is a function template // specialization, or, if not that, bool Cand1IsSpecialization = Cand1.Function && diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 62d0d0914fa306..4f13669c2490c0 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -5179,8 +5179,7 @@ bool Sema::CheckTemplateArgument( unsigned ArgumentPackIndex, SmallVectorImpl &SugaredConverted, SmallVectorImpl &CanonicalConverted, - CheckTemplateArgumentKind CTAK, bool PartialOrdering, - bool *MatchedPackOnParmToNonPackOnArg) { + CheckTemplateArgumentKind CTAK, bool *MatchedPackOnParmToNonPackOnArg) { // Check template type parameters. 
if (TemplateTypeParmDecl *TTP = dyn_cast(Param)) return CheckTemplateTypeArgument(TTP, Arg, SugaredConverted, @@ -5395,7 +5394,8 @@ bool Sema::CheckTemplateArgument( case TemplateArgument::Template: case TemplateArgument::TemplateExpansion: - if (CheckTemplateTemplateArgument(TempParm, Params, Arg, PartialOrdering, + if (CheckTemplateTemplateArgument(TempParm, Params, Arg, + /*IsDeduced=*/CTAK != CTAK_Specified, MatchedPackOnParmToNonPackOnArg)) return true; @@ -5546,11 +5546,10 @@ bool Sema::CheckTemplateArgumentList( if (ArgIdx < NumArgs) { // Check the template argument we were given. - if (CheckTemplateArgument(*Param, NewArgs[ArgIdx], Template, TemplateLoc, - RAngleLoc, SugaredArgumentPack.size(), - SugaredConverted, CanonicalConverted, - CTAK_Specified, /*PartialOrdering=*/false, - MatchedPackOnParmToNonPackOnArg)) + if (CheckTemplateArgument( + *Param, NewArgs[ArgIdx], Template, TemplateLoc, RAngleLoc, + SugaredArgumentPack.size(), SugaredConverted, CanonicalConverted, + CTAK_Specified, MatchedPackOnParmToNonPackOnArg)) return true; CanonicalConverted.back().setIsDefaulted( @@ -5708,7 +5707,7 @@ bool Sema::CheckTemplateArgumentList( // Check the default template argument. 
if (CheckTemplateArgument(*Param, Arg, Template, TemplateLoc, RAngleLoc, 0, SugaredConverted, CanonicalConverted, - CTAK_Specified, /*PartialOrdering=*/false, + CTAK_Specified, /*MatchedPackOnParmToNonPackOnArg=*/nullptr)) return true; @@ -7294,7 +7293,7 @@ static void DiagnoseTemplateParameterListArityMismatch( bool Sema::CheckTemplateTemplateArgument( TemplateTemplateParmDecl *Param, TemplateParameterList *Params, - TemplateArgumentLoc &Arg, bool PartialOrdering, + TemplateArgumentLoc &Arg, bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg) { TemplateName Name = Arg.getArgument().getAsTemplateOrTemplatePattern(); auto [Template, DefaultArgs] = Name.getTemplateDeclAndDefaultArgs(); @@ -7339,8 +7338,8 @@ bool Sema::CheckTemplateTemplateArgument( // A template-argument matches a template template-parameter P when P // is at least as specialized as the template-argument A. if (!isTemplateTemplateParameterAtLeastAsSpecializedAs( - Params, Param, Template, DefaultArgs, Arg.getLocation(), - PartialOrdering, MatchedPackOnParmToNonPackOnArg)) + Params, Param, Template, DefaultArgs, Arg.getLocation(), IsDeduced, + MatchedPackOnParmToNonPackOnArg)) return true; // P2113 // C++20[temp.func.order]p2 diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index e49d315f7186bc..48a39a90f72a8b 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2955,7 +2955,7 @@ Sema::getIdentityTemplateArgumentLoc(NamedDecl *TemplateParm, /// fully-converted template arguments. static bool ConvertDeducedTemplateArgument( Sema &S, NamedDecl *Param, DeducedTemplateArgument Arg, NamedDecl *Template, - TemplateDeductionInfo &Info, bool IsDeduced, bool PartialOrdering, + TemplateDeductionInfo &Info, bool IsDeduced, SmallVectorImpl &SugaredOutput, SmallVectorImpl &CanonicalOutput) { auto ConvertArg = [&](DeducedTemplateArgument Arg, @@ -2976,7 +2976,7 @@ static bool ConvertDeducedTemplateArgument( ? 
(Arg.wasDeducedFromArrayBound() ? Sema::CTAK_DeducedFromArrayBound : Sema::CTAK_Deduced) : Sema::CTAK_Specified, - PartialOrdering, &MatchedPackOnParmToNonPackOnArg); + &MatchedPackOnParmToNonPackOnArg); if (MatchedPackOnParmToNonPackOnArg) Info.setMatchedPackOnParmToNonPackOnArg(); return Res; @@ -3062,9 +3062,9 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( SmallVectorImpl &Deduced, TemplateDeductionInfo &Info, SmallVectorImpl &SugaredBuilder, - SmallVectorImpl &CanonicalBuilder, bool PartialOrdering, - LocalInstantiationScope *CurrentInstantiationScope, - unsigned NumAlreadyConverted, bool *IsIncomplete) { + SmallVectorImpl &CanonicalBuilder, + LocalInstantiationScope *CurrentInstantiationScope = nullptr, + unsigned NumAlreadyConverted = 0, bool *IsIncomplete = nullptr) { TemplateParameterList *TemplateParams = Template->getTemplateParameters(); for (unsigned I = 0, N = TemplateParams->size(); I != N; ++I) { @@ -3107,8 +3107,8 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // We may have deduced this argument, so it still needs to be // checked and converted. if (ConvertDeducedTemplateArgument(S, Param, Deduced[I], Template, Info, - IsDeduced, PartialOrdering, - SugaredBuilder, CanonicalBuilder)) { + IsDeduced, SugaredBuilder, + CanonicalBuilder)) { Info.Param = makeTemplateParameter(Param); // FIXME: These template arguments are temporary. Free them! Info.reset( @@ -3174,8 +3174,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // Check whether we can actually use the default argument. 
if (S.CheckTemplateArgument( Param, DefArg, TD, TD->getLocation(), TD->getSourceRange().getEnd(), - /*ArgumentPackIndex=*/0, SugaredBuilder, CanonicalBuilder, - Sema::CTAK_Specified, /*PartialOrdering=*/false, + 0, SugaredBuilder, CanonicalBuilder, Sema::CTAK_Specified, /*MatchedPackOnParmToNonPackOnArg=*/nullptr)) { Info.Param = makeTemplateParameter( const_cast(TemplateParams->getParam(I))); @@ -3284,9 +3283,7 @@ FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, Partial, IsPartialOrdering, Deduced, Info, SugaredBuilder, - CanonicalBuilder, IsPartialOrdering, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + CanonicalBuilder); Result != TemplateDeductionResult::Success) return Result; @@ -3386,10 +3383,10 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( // explicitly specified, template argument deduction fails. SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( - S, Template, /*IsDeduced=*/PartialOrdering, Deduced, Info, - SugaredBuilder, CanonicalBuilder, PartialOrdering, + S, Template, /*IsDeduced*/ PartialOrdering, Deduced, Info, + SugaredBuilder, CanonicalBuilder, /*CurrentInstantiationScope=*/nullptr, - /*NumAlreadyConverted=*/0U, /*IsIncomplete=*/nullptr); + /*NumAlreadyConverted=*/0U); Result != TemplateDeductionResult::Success) return Result; @@ -3454,9 +3451,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, TD, /*IsDeduced=*/false, Deduced, Info, SugaredBuilder, - CanonicalBuilder, /*PartialOrdering=*/false, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + CanonicalBuilder); Result != TemplateDeductionResult::Success) return Result; @@ -3994,8 +3989,7 @@ TemplateDeductionResult 
Sema::FinishTemplateArgumentDeduction( unsigned NumExplicitlySpecified, FunctionDecl *&Specialization, TemplateDeductionInfo &Info, SmallVectorImpl const *OriginalCallArgs, - bool PartialOverloading, bool PartialOrdering, - llvm::function_ref CheckNonDependent) { + bool PartialOverloading, llvm::function_ref CheckNonDependent) { // Unevaluated SFINAE context. EnterExpressionEvaluationContext Unevaluated( *this, Sema::ExpressionEvaluationContext::Unevaluated); @@ -4018,10 +4012,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( bool IsIncomplete = false; SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( - *this, FunctionTemplate, /*IsDeduced=*/true, Deduced, Info, - SugaredBuilder, CanonicalBuilder, PartialOrdering, - CurrentInstantiationScope, NumExplicitlySpecified, - PartialOverloading ? &IsIncomplete : nullptr); + *this, FunctionTemplate, /*IsDeduced*/ true, Deduced, Info, + SugaredBuilder, CanonicalBuilder, CurrentInstantiationScope, + NumExplicitlySpecified, PartialOverloading ? 
&IsIncomplete : nullptr); Result != TemplateDeductionResult::Success) return Result; @@ -4553,8 +4546,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, FunctionDecl *&Specialization, TemplateDeductionInfo &Info, bool PartialOverloading, bool AggregateDeductionCandidate, - bool PartialOrdering, QualType ObjectType, - Expr::Classification ObjectClassification, + QualType ObjectType, Expr::Classification ObjectClassification, llvm::function_ref)> CheckNonDependent) { if (FunctionTemplate->isInvalidDecl()) return TemplateDeductionResult::Invalid; @@ -4769,8 +4761,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( runWithSufficientStackSpace(Info.getLocation(), [&] { Result = FinishTemplateArgumentDeduction( FunctionTemplate, Deduced, NumExplicitlySpecified, Specialization, Info, - &OriginalCallArgs, PartialOverloading, PartialOrdering, - [&, CallingCtx]() { + &OriginalCallArgs, PartialOverloading, [&, CallingCtx]() { ContextRAII SavedContext(*this, CallingCtx); return CheckNonDependent(ParamTypesForArgChecking); }); @@ -4882,10 +4873,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( TemplateDeductionResult Result; runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = FinishTemplateArgumentDeduction( - FunctionTemplate, Deduced, NumExplicitlySpecified, Specialization, Info, - /*OriginalCallArgs=*/nullptr, /*PartialOverloading=*/false, - /*PartialOrdering=*/true); + Result = FinishTemplateArgumentDeduction(FunctionTemplate, Deduced, + NumExplicitlySpecified, + Specialization, Info); }); if (Result != TemplateDeductionResult::Success) return Result; @@ -5065,10 +5055,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( FunctionDecl *ConversionSpecialized = nullptr; TemplateDeductionResult Result; runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = FinishTemplateArgumentDeduction( - ConversionTemplate, Deduced, 0, ConversionSpecialized, Info, - 
&OriginalCallArgs, /*PartialOverloading=*/false, - /*PartialOrdering=*/false); + Result = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0, + ConversionSpecialized, Info, + &OriginalCallArgs); }); Specialization = cast_or_null(ConversionSpecialized); return Result; @@ -5645,8 +5634,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, FTD, /*IsDeduced=*/true, Deduced, Info, SugaredBuilder, - CanonicalBuilder, /*PartialOrdering=*/true, - /*CurrentInstantiationScope=*/nullptr, + CanonicalBuilder, /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, &IsIncomplete); Result != TemplateDeductionResult::Success) return Result; @@ -6491,8 +6479,8 @@ bool Sema::isMoreSpecializedThanPrimary( bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateParameterList *P, TemplateDecl *PArg, TemplateDecl *AArg, - const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, - bool PartialOrdering, bool *MatchedPackOnParmToNonPackOnArg) { + const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, bool IsDeduced, + bool *MatchedPackOnParmToNonPackOnArg) { // C++1z [temp.arg.template]p4: (DR 150) // A template template-parameter P is at least as specialized as a // template template-argument A if, given the following rewrite to two @@ -6571,7 +6559,7 @@ bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( switch (::DeduceTemplateArguments( *this, A, AArgs, PArgs, Info, Deduced, /*NumberOfArgumentsMustMatch=*/false, /*PartialOrdering=*/true, - PartialOrdering ? PackFold::ArgumentToParameter : PackFold::Both, + IsDeduced ? 
PackFold::ArgumentToParameter : PackFold::Both, /*HasDeducedAnyParam=*/nullptr)) { case clang::TemplateDeductionResult::Success: if (MatchedPackOnParmToNonPackOnArg && diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp index 3825239de4a285..56091e84cf4e95 100644 --- a/clang/test/SemaTemplate/cwg2398.cpp +++ b/clang/test/SemaTemplate/cwg2398.cpp @@ -405,87 +405,6 @@ namespace packs { } // namespace t4 } // namespace packs -namespace fun_tmpl_call { - namespace match_func { - template