Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Fix setting nontemporal in memory legalizer #83815

Merged
merged 2 commits into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2392,6 +2392,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(

bool Changed = false;

if (IsNonTemporal) {
// Set non-temporal hint for all cache levels.
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
}

if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);

Expand All @@ -2407,11 +2412,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
Position::AFTER);
}

if (IsNonTemporal) {
// Set non-temporal hint for all cache levels.
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
}

return Changed;
}

Expand Down
165 changes: 165 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -684,5 +684,170 @@ entry:
ret void
}

define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX7-LABEL: flat_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
store i32 %val, ptr %out
ret void
}

!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
158 changes: 158 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -674,5 +674,163 @@ entry:
ret void
}

define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX6-LABEL: global_nontemporal_volatile_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s2
; GFX6-NEXT: s_mov_b32 s5, s3
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_volatile_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_volatile_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-WGP-NEXT: s_nop 0
; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-CU-NEXT: s_nop 0
; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
ret void
}

!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
Loading
Loading