Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Allow buffer intrinsics to be marked volatile at the IR level #77847

Merged
merged 1 commit into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 52 additions & 27 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
GISDNodeXFormEquiv<extract_swz>;

def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
GISDNodeXFormEquiv<set_glc>;
def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">,
GISDNodeXFormEquiv<extract_cpol_set_glc>;

def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
Expand Down
13 changes: 8 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return false;

int NumVAddrRegs = 0;
Expand Down Expand Up @@ -5496,11 +5497,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
MIB.addImm(Swizzle);
}

void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
(AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12);
MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
int OpIdx) const;
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;

void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
Expand Down
39 changes: 20 additions & 19 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1628,36 +1628,36 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,

defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
(set_glc $cachepolicy), (timm:$cachepolicy));
(extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary));

let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, 0)),
timm:$offset, timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, CachePolicy)
>;

def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, timm)),
timm:$offset, timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;

def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
(BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)),
(BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;

def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
(BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)),
(BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
Expand Down Expand Up @@ -1726,35 +1726,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0),
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
timm:$offset, (extract_cpol $auxiliary))
>;

def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm),
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
timm:$offset, (extract_cpol $auxiliary))
>;

def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0),
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
timm:$offset, (extract_cpol $auxiliary))
>;

def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm),
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy)
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary))
>;
}

Expand Down Expand Up @@ -1791,8 +1791,9 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
# !if(!eq(RtnMode, "ret"), "", "_noret"));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
(timm:$cachepolicy));
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
(extract_cpol_set_glc $auxiliary),
(extract_cpol $auxiliary));
defvar SrcRC = getVregSrcForVT<vt>.ret;
defvar DataRC = getVregSrcForVT<data_vt>.ret;
defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
Expand All @@ -1804,7 +1805,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
def : GCNPat<
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, 0)),
timm:$offset, timm:$auxiliary, 0)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG OffsetResDag, SubLo),
OffsetResDag)
Expand All @@ -1818,7 +1819,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm)),
timm:$auxiliary, timm)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG IdxenResDag, SubLo),
IdxenResDag)
Expand All @@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, 0,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0)),
timm:$auxiliary, 0)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG OffenResDag, SubLo),
OffenResDag)
Expand All @@ -1846,7 +1847,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm)),
timm:$auxiliary, timm)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG BothenResDag, SubLo),
BothenResDag)
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIDefines.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,10 @@ enum CPol {
TH_TYPE_STORE = 1 << 8, // TH_STORE policy
TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy
TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not

// Volatile (used to preserve/signal operation volatility for buffer
// operations not a real instruction bit)
VOLATILE = 1 << 31,
};

} // namespace CPol
Expand Down
8 changes: 7 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1183,6 +1183,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = RsrcArg;
}

auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
Info.flags |= MachineMemOperand::MOVolatile;
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
unsigned MaxNumLanes = 4;
Expand Down Expand Up @@ -7639,7 +7642,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return Op;

SmallVector<SDValue, 26> Ops;
Expand Down Expand Up @@ -8005,6 +8009,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
unsigned CPol = Op.getConstantOperandVal(3);
// s_buffer_load, because of how it's optimized, can't be volatile
// so reject ones with the volatile bit set.
if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12))
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -885,8 +885,11 @@ def extract_swz : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8);
}]>;

def set_glc : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
def extract_cpol_set_glc : SDNodeXForm<timm, [{
const uint32_t cpol = N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12);
return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
}]>;

//===----------------------------------------------------------------------===//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,25 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s
ret float %val
}

define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
ret float %val
}

; Natural mapping
define amdgpu_ps <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,25 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__
ret void
}

define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (volatile dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
ret void
}

define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
; CHECK: bb.1 (%ir-block.0):
Expand Down
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,26 @@ main_body:
ret float %out
}

;CHECK-LABEL: {{^}}test_volatile:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}}
;CHECK-DAG: s_waitcnt vmcnt(0)
define amdgpu_ps float @test_volatile(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
%t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
%out = bitcast i32 %t1 to float
ret float %out
}

;CHECK-LABEL: {{^}}test_volatile_noret:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen{{$}}
define amdgpu_ps void @test_volatile_noret(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
%t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
ret void
}

declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0
Expand Down
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,42 @@ main_body:
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_volatile:
; PREGFX10: ; %bb.0: ; %main_body
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_volatile:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_volatile:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483648)
%data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483647)
%data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483646)
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_immoffs:
; PREGFX10: ; %bb.0: ; %main_body
Expand Down
Loading