From 4cc3522ff2b4593c823115a2b2ace08f8b4d12d8 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Thu, 11 Jan 2024 23:00:53 +0000
Subject: [PATCH] [AMDGPU] Allow buffer intrinsics to be marked volatile at
 the IR level

In order to ensure the correctness of ptr addrspace(7) lowering, we need
a way to flag buffer intrinsics as volatile that can't be dropped (unlike
metadata). To achieve this in a backwards-compatible way, we use bit 31
of the auxiliary immediates of buffer intrinsics as the volatile flag.
When this bit is set, the MachineMemOperand for said intrinsic is marked
volatile. Existing code will ensure that this results in the appropriate
use of flags like glc and dlc.

This commit also harmonizes the handling of the auxiliary immediate for
atomic intrinsics, which now go through extract_cpol like loads and
stores; this masks off the volatile bit.

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 79 ++++++++++++-------
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |  4 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 13 +--
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |  4 +-
 llvm/lib/Target/AMDGPU/BUFInstructions.td     | 39 ++++-----
 llvm/lib/Target/AMDGPU/SIDefines.h            |  4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  8 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  7 +-
 .../llvm.amdgcn.raw.ptr.buffer.load.ll        | 19 +++++
 .../llvm.amdgcn.raw.ptr.buffer.store.ll       | 19 +++++
 .../llvm.amdgcn.raw.ptr.buffer.atomic.ll      | 20 +++++
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 36 +++++++++
 12 files changed, 194 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e5596258847f9f1..2c5c21d3787e0bc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1072,6 +1072,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
   [llvm_v4i32_ty, // rsrc(SGPR)
    llvm_i32_ty,   // byte offset
    llvm_i32_ty],  // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
+                  // Note: volatile bit is **not** permitted here.
   [IntrNoMem, ImmArg>]>,
   AMDGPURsrcIntrinsic<0>;
@@ -1099,6 +1100,10 @@ def int_amdgcn_buffer_store : AMDGPUBufferStore;
 // The versions of these intrinsics that take <4 x i32> arguments are deprecated
 // in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
 // which allow for improved reasoning about memory accesses.
+//
+// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
+// through to final assembly selection and is used to signal that the buffer
+// operation is volatile.
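
As a usage sketch of the new bit (illustrative only, not part of the diff; the intrinsic signature matches the tests later in this patch, and the function name is made up): the constant -2147483648 is the signed-i32 spelling of 0x80000000, i.e. bit 31 alone, and it may be OR'd with the existing glc/slc/dlc/swz bits.

declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32)

define float @volatile_load_sketch(ptr addrspace(8) %rsrc, i32 %voffset) {
  ; aux = bit 31: the load is given a volatile MachineMemOperand, and the
  ; bit itself is stripped before final instruction selection.
  %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
  ret float %v
}
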
class AMDGPURawBufferLoad : DefaultAttrsIntrinsic < [data_ty], [llvm_v4i32_ty, // rsrc(SGPR) @@ -1107,7 +1112,8 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsi llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrReadMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; @@ -1121,7 +1127,9 @@ class AMDGPURawPtrBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) + [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1137,7 +1145,8 @@ class AMDGPUStructBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrReadMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; @@ -1152,7 +1161,8 @@ class AMDGPUStructPtrBufferLoad : DefaultAttrsIn llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1168,7 +1178,8 @@ class AMDGPURawBufferStore : DefaultAttrsIntrins llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; @@ -1183,7 +1194,8 @@ class AMDGPURawPtrBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1200,7 +1212,8 @@ class AMDGPUStructBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; @@ -1216,7 +1229,8 @@ class AMDGPUStructPtrBufferStore : DefaultAttrsI llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1229,7 +1243,7 @@ class AMDGPURawBufferAtomic : Intrinsic < 
llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; @@ -1253,7 +1267,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -1263,7 +1277,7 @@ class AMDGPURawPtrBufferAtomic : Intrinsic < AMDGPUBufferRsrcTy, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [IntrArgMemOnly, NoCapture>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; @@ -1289,7 +1303,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< AMDGPUBufferRsrcTy, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [IntrArgMemOnly, NoCapture>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -1305,7 +1319,7 @@ class AMDGPUStructBufferAtomic : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; @@ -1328,7 +1342,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -1339,7 +1353,7 @@ class AMDGPUStructPtrBufferAtomic : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [IntrArgMemOnly, NoCapture>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; @@ -1363,7 
+1377,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) [IntrArgMemOnly, NoCapture>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -1440,7 +1454,8 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1455,7 +1470,8 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1470,7 +1486,8 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1485,7 +1502,8 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrReadMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1500,7 +1518,8 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1516,7 +1535,8 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1532,7 +1552,8 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1593,7 +1614,8 @@ class AMDGPURawBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // 
bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; @@ -1609,7 +1631,8 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, IntrArgMemOnly, ReadOnly>, NoCapture>, WriteOnly>, NoCapture>, @@ -1629,7 +1652,8 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; @@ -1646,7 +1670,8 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, IntrArgMemOnly, ReadOnly>, NoCapture>, WriteOnly>, NoCapture>, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 2b85024a9b40be0..f1da3995136a716 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -379,8 +379,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">, def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">, GISDNodeXFormEquiv; -def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">, - GISDNodeXFormEquiv; +def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">, + GISDNodeXFormEquiv; def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">, GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ad8dcda93c365a6..677ecc5f2fc989d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); if (BaseOpcode->Atomic) CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization - if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) + if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | + AMDGPU::CPol::VOLATILE)) return false; int NumVAddrRegs = 0; @@ -5496,11 +5497,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, MIB.addImm(Swizzle); } -void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { +void AMDGPUInstructionSelector::renderExtractCpolSetGLC( + MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC); + const uint32_t Cpol = MI.getOperand(OpIdx).getImm() & + (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12); + MIB.addImm(Cpol | AMDGPU::CPol::GLC); } void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index ab7cc0a6beb8c27..12f9b8629464058 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -331,8 +331,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { int OpIdx) const; void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; - void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; + void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 9e99d382ed9b31a..d2769992b3c1a33 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1628,12 +1628,12 @@ multiclass SIBufferAtomicPat_Common(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1641,7 +1641,7 @@ multiclass SIBufferAtomicPat_Common(Inst # "_IDXEN" # InstSuffix) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1649,7 +1649,7 @@ multiclass SIBufferAtomicPat_Common(Inst # "_OFFEN" # InstSuffix) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1657,7 +1657,7 @@ multiclass SIBufferAtomicPat_Common(Inst # "_BOTHEN" # InstSuffix) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1726,35 +1726,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - timm:$offset, timm:$cachepolicy) + timm:$offset, (extract_cpol $auxiliary)) >; def : GCNPat< (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - timm:$offset, timm:$cachepolicy) + timm:$offset, (extract_cpol $auxiliary)) >; def : GCNPat< (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFEN) 
getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - timm:$offset, timm:$cachepolicy) + timm:$offset, (extract_cpol $auxiliary)) >; def : GCNPat< (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy) + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary)) >; } @@ -1791,8 +1791,9 @@ multiclass SIBufferAtomicCmpSwapPat_Common(SIbuffer_atomic_cmpswap # !if(!eq(RtnMode, "ret"), "", "_noret")); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); - defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), - (timm:$cachepolicy)); + defvar CachePolicy = !if(!eq(RtnMode, "ret"), + (extract_cpol_set_glc $auxiliary), + (extract_cpol $auxiliary)); defvar SrcRC = getVregSrcForVT.ret; defvar DataRC = getVregSrcForVT.ret; defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); @@ -1804,7 +1805,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common(CI.getArgOperand(CI.arg_size() - 1)); + if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) + Info.flags |= MachineMemOperand::MOVolatile; Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { unsigned MaxNumLanes = 4; @@ -7639,7 +7642,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue(); if (BaseOpcode->Atomic) CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization - if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) + if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | + AMDGPU::CPol::VOLATILE)) return Op; SmallVector Ops; @@ -8005,6 +8009,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { unsigned CPol = Op.getConstantOperandVal(3); + // s_buffer_load, because of how it's optimized, can't be volatile + // so reject ones with the volatile bit set. if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 04c92155f5aada5..86b679746450b2b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -885,8 +885,11 @@ def extract_swz : SDNodeXFormgetTargetConstant(Swizzle, SDLoc(N), MVT::i8); }]>; -def set_glc : SDNodeXFormgetTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8); +def extract_cpol_set_glc : SDNodeXFormgetZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12); + return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8); }]>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll index cea9a132215790f..7b8b028128dd3d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -270,6 +270,25 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ret float %val } +define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648) + ret float %val +} + ; Natural mapping define amdgpu_ps <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll index 6f35e3bad3eaf10..2c99ce8694bcc1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -327,6 +327,25 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__ ret void } +define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + 
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (volatile dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: S_ENDPGM 0 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648) + ret void +} + define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32 ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll index 2b7ef147cae0f04..e40e6f8410ee756 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll @@ -129,6 +129,26 @@ main_body: ret float %out } +;CHECK-LABEL: {{^}}test_volatile: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}} +;CHECK-DAG: s_waitcnt vmcnt(0) +define amdgpu_ps float @test_volatile(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) { +main_body: + %t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648) + %out = bitcast i32 %t1 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test_volatile_noret: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen{{$}} +define amdgpu_ps void @test_volatile_noret(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) { +main_body: + %t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648) + ret void +} + declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0 declare float @llvm.amdgcn.raw.ptr.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index 6a04a0cfed3551d..1670f41638d500d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -76,6 +76,42 @@ main_body: ret {<4 x float>, <4 x float>, <4 x float>} %r2 } +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(ptr addrspace(8) inreg) { +; PREGFX10-LABEL: buffer_load_volatile: +; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc +; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) +; PREGFX10-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: buffer_load_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc 
dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: buffer_load_volatile: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483648) + %data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483647) + %data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483646) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { ; PREGFX10-LABEL: buffer_load_immoffs: ; PREGFX10: ; %bb.0: ; %main_body
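
Because the atomic intrinsics now share the extract_cpol path, the same bit composes with them. A hedged LLVM IR sketch mirroring the test_volatile test above (the function name is illustrative):

declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32)

define amdgpu_ps float @volatile_atomic_add_sketch(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
  ; Bit 31 of the final operand requests a volatile MachineMemOperand;
  ; extract_cpol masks it off the cache-policy bits during selection.
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
  %out = bitcast i32 %old to float
  ret float %out
}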