[AMDGPU] Add IR-level pass to rewrite away address space 7 #77952

Merged 1 commit on Mar 6, 2024
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
ModulePass *createAMDGPULowerBufferFatPointersPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsPass();

@@ -136,6 +137,18 @@ struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

void initializeAMDGPULowerBufferFatPointersPass(PassRegistry &);
extern char &AMDGPULowerBufferFatPointersID;

struct AMDGPULowerBufferFatPointersPass
: PassInfoMixin<AMDGPULowerBufferFatPointersPass> {
AMDGPULowerBufferFatPointersPass(const TargetMachine &TM) : TM(TM) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

private:
const TargetMachine &TM;
};

void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;

2,012 changes: 2,012 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Large diffs are not rendered by default.
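Since the 2,012-line implementation is collapsed here, a minimal sketch of the representation change it makes (illustrative IR, not lines from this patch): a 160-bit ptr addrspace(7) buffer fat pointer is split into a 128-bit ptr addrspace(8) buffer resource plus a 32-bit offset, so pointer arithmetic reduces to i32 arithmetic on the offset.

; Before the pass: offset arithmetic on a 160-bit fat pointer.
define ptr addrspace(7) @advance(ptr addrspace(7) %p) {
  %q = getelementptr i8, ptr addrspace(7) %p, i32 16
  ret ptr addrspace(7) %q
}

; After the pass (conceptually): the addrspace(8) resource passes
; through untouched and only the 32-bit offset is updated.
define { ptr addrspace(8), i32 } @advance.lowered(ptr addrspace(8) %rsrc, i32 %off) {
  %off.next = add i32 %off, 16
  %r0 = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) %rsrc, 0
  %r1 = insertvalue { ptr addrspace(8), i32 } %r0, i32 %off.next, 1
  ret { ptr addrspace(8), i32 } %r1
}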

29 changes: 29 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -413,6 +414,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
@@ -646,6 +648,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PM.addPass(AMDGPULowerModuleLDSPass(*this));
return true;
}
if (PassName == "amdgpu-lower-buffer-fat-pointers") {
PM.addPass(AMDGPULowerBufferFatPointersPass(*this));
return true;
}
if (PassName == "amdgpu-lower-ctor-dtor") {
PM.addPass(AMDGPUCtorDtorLoweringPass());
return true;
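
With the registration above, the rewrite can also be exercised in isolation through the new pass manager; a minimal sketch of a test RUN line (the triple and the FileCheck use are illustrative):

; RUN: opt -S -mtriple=amdgcn-amd-amdhsa \
; RUN:   -passes=amdgpu-lower-buffer-fat-pointers %s | FileCheck %s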
@@ -1113,6 +1119,29 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());

if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
// that it changes by splitting up fat pointers into their components)
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
//
// FIXME: This should ideally be put after the LoadStoreVectorizer.
// However, due to some annoying facts about ResourceUsageAnalysis
// (especially as exercised in the resource-usage-dead-function test),
// we need all the function passes from codegenprepare all the way through
// said resource usage analysis to run on the call graph produced
// before codegenprepare runs (because codegenprepare will knock some
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
// In accordance with the above FIXME, manually force all the
// function-level passes into a CGSCCPassManager.
addPass(new DummyCGSCCPass());
}

TargetPassConfig::addCodeGenPrepare();

if (isPassEnabled(EnableLoadStoreVectorizer))
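As a sketch of the kind of rewrite whose placement the comment above is justifying: after lowering, memory operations on fat pointers become buffer intrinsics on the split resource/offset pair (illustrative IR, assuming the raw buffer load intrinsic; not output copied from the pass):

; Input: a load through a buffer fat pointer.
define float @load_elt(ptr addrspace(7) %p) {
  %v = load float, ptr addrspace(7) %p
  ret float %v
}

; After lowering (conceptually): a raw buffer load on the
; resource/offset pair that replaces the fat pointer.
declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg)
define float @load_elt.lowered(ptr addrspace(8) %rsrc, i32 %off) {
  %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)
  ret float %v
}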
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -69,6 +69,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULibCalls.cpp
AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp
AMDGPULowerBufferFatPointers.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
14 changes: 9 additions & 5 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15923,7 +15923,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;

if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
if ((AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) &&
Subtarget->hasAtomicFaddNoRtnInsts()) {
if (Subtarget->hasGFX940Insts())
return AtomicExpansionKind::None;
@@ -15935,11 +15936,13 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (HasSystemScope)
return AtomicExpansionKind::CmpXChg;

if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) {
// global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
if ((AS == AMDGPUAS::GLOBAL_ADDRESS ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) &&
Ty->isFloatTy()) {
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
@@ -15994,7 +15997,8 @@
case AtomicRMWInst::Max:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UMax: {
if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
if (RMW->getType()->isFloatTy() &&
unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
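For context on the shouldExpandAtomicRMWInIR changes above, a minimal example of an atomic the new BUFFER_FAT_POINTER cases apply to (illustrative; with system scope, or with unsafe FP atomics disabled, the CmpXChg expansion is still chosen):

define float @buffer_atomic_fadd(ptr addrspace(7) %p, float %x) {
  ; Agent scope, so the HasSystemScope bail-out above does not apply;
  ; on subtargets with atomic fadd instructions this can now stay a
  ; native buffer atomic instead of a compare-exchange loop.
  %old = atomicrmw fadd ptr addrspace(7) %p, float %x syncscope("agent") monotonic
  ret float %old
}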
@@ -1,15 +1,73 @@
; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator < %s
; REQUIRES: asserts

; Confirm that no one's gotten vectors of addrspace(7) pointers to go through the
; IR translator incidentally.
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator < %s | FileCheck %s

define <2 x ptr addrspace(7)> @no_auto_constfold_gep_vector() {
; CHECK-LABEL: name: no_auto_constfold_gep_vector
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[C:%[0-9]+]]:_(p8) = G_CONSTANT i128 0
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p8>) = G_BUILD_VECTOR [[C]](p8), [[C]](p8)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 123
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p8>)
; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<2 x s32>)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32)
; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32)
; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32)
; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32)
; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32)
; CHECK-NEXT: $vgpr8 = COPY [[UV8]](s32)
; CHECK-NEXT: $vgpr9 = COPY [[UV9]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9
%gep = getelementptr i8, <2 x ptr addrspace(7)> zeroinitializer, <2 x i32> <i32 123, i32 123>
ret <2 x ptr addrspace(7)> %gep
}

define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i64 %idx) {
; CHECK-LABEL: name: gep_vector_splat
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p8) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p8) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p8>) = G_BUILD_VECTOR [[MV]](p8), [[MV1]](p8)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s64)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0)
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>)
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s32>) = G_ADD [[BUILD_VECTOR1]], [[TRUNC]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p8>)
; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ADD]](<2 x s32>)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32)
; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32)
; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32)
; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32)
; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32)
; CHECK-NEXT: $vgpr8 = COPY [[UV8]](s32)
; CHECK-NEXT: $vgpr9 = COPY [[UV9]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9
%gep = getelementptr i8, <2 x ptr addrspace(7)> %ptrs, i64 %idx
ret <2 x ptr addrspace(7)> %gep
}
@@ -5,15 +5,14 @@
define ptr addrspace(7) @no_auto_constfold_gep() {
; CHECK-LABEL: name: no_auto_constfold_gep
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: [[C:%[0-9]+]]:_(p7) = G_CONSTANT i160 0
; CHECK-NEXT: [[C:%[0-9]+]]:_(p8) = G_CONSTANT i128 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 123
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[C]], [[C1]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTR_ADD]](p7)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](p8)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32)
; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32)
; CHECK-NEXT: $vgpr4 = COPY [[C1]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4
%gep = getelementptr i8, ptr addrspace(7) null, i32 123
ret ptr addrspace(7) %gep
25 changes: 25 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -51,6 +51,11 @@
; GCN-O0-NEXT: AMDGPU Annotate Kernel Features
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: CallGraph Construction
; GCN-O0-NEXT: Call Graph SCC Pass Manager
; GCN-O0-NEXT: DummyCGSCCPass
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Lazy Value Information Analysis
; GCN-O0-NEXT: Lower SwitchInst's to branches
; GCN-O0-NEXT: Lower invoke and unwind, for unwindless code generators
@@ -229,6 +234,11 @@
; GCN-O1-NEXT: AMDGPU Annotate Kernel Features
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
; GCN-O1-NEXT: DummyCGSCCPass
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: CodeGen Prepare
@@ -513,6 +523,11 @@
; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: CodeGen Prepare
@@ -815,6 +830,11 @@
; GCN-O2-NEXT: AMDGPU Annotate Kernel Features
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
; GCN-O2-NEXT: DummyCGSCCPass
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: CodeGen Prepare
@@ -1131,6 +1151,11 @@
; GCN-O3-NEXT: AMDGPU Annotate Kernel Features
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
; GCN-O3-NEXT: DummyCGSCCPass
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: CodeGen Prepare