[AMDGPU] Add transformation of kills to demotes in simple scenarios
Convert conditional kills to demotes.
Enable this by combining -amdgpu-conditional-discard-transformations
with -amdgpu-transform-discard-to-demote.
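
For example, an invocation mirroring the RUN lines in the updated test
(the input file name here is illustrative):

  llc -march=amdgcn -amdgpu-conditional-discard-transformations=1 \
      -amdgpu-transform-discard-to-demote < kill.ll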

V2: simplify options
V3: remove extraneous change details and improve comments
V4: fix pass naming

Change-Id: Ibe152dadd4728462855fe8a413ea55c41e981f1c
perlfu committed Mar 12, 2020
1 parent ca9afc9 commit 80e004f
Showing 3 changed files with 98 additions and 13 deletions.
35 changes: 28 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUConditionalDiscard.cpp
@@ -45,6 +45,14 @@
/// The pass should ideally be placed after code sinking, because some sinking
/// opportunities get lost after the transformation due to the basic block
/// removal.
///
/// Additionally, this pass can be used to transform kill intrinsics
/// optimized as above into demote operations.
/// This provides a workaround for applications which perform a non-uniform
/// "kill" and later compute (implicit) derivatives.
/// Note that in Vulkan, such applications should be fixed to use demote
/// (OpDemoteToHelperInvocationEXT) instead of kill (OpKill).
///
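/// As an illustrative sketch (not lifted verbatim from a test), a guarded
/// kill of the form:
///
///   entry:
///     br i1 %cond, label %continue, label %kill
///   kill:
///     call void @llvm.amdgcn.kill(i1 false)
///
/// is folded into the predecessor as @llvm.amdgcn.kill(i1 %cond), or as
/// @llvm.amdgcn.wqm.demote(i1 %cond) when demotion is enabled, killing or
/// demoting exactly the lanes for which %cond is false.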

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
@@ -59,6 +67,13 @@
using namespace llvm;
using namespace llvm::AMDGPU;

// Enable transformation of optimized conditional discards to demotes.
static cl::opt<bool> EnableTransformDiscardToDemote(
"amdgpu-transform-discard-to-demote",
cl::desc("Enable transformation of optimized discards to demotes"),
cl::init(false),
cl::Hidden);

namespace {

class AMDGPUConditionalDiscard : public FunctionPass {
@@ -78,10 +93,9 @@ class AMDGPUConditionalDiscard : public FunctionPass {
AU.addRequiredTransitive<LoopInfoWrapperPass>();
}


StringRef getPassName() const override { return "AMDGPUConditionalDiscard"; }

void optimizeBlock(BasicBlock &BB);
void optimizeBlock(BasicBlock &BB, bool ConvertToDemote);
};

} // namespace
@@ -94,7 +108,7 @@ char &llvm::AMDGPUConditionalDiscardID = AMDGPUConditionalDiscard::ID;
// first instruction is a call to amdgcn_kill, with "false" as argument.
// Transform the branch condition of the block's predecessor and mark
// the block for removal. Clone the call to amdgcn_kill to the predecessor.
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB, bool ConvertToDemote) {

if (auto *KillCand = dyn_cast<CallInst>(&BB.front())) {
auto *Callee = KillCand->getCalledFunction();
@@ -111,8 +125,10 @@ void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {
return;

// Skip if the kill is in a loop.
if (LI->getLoopFor(PredBlock))
if (LI->getLoopFor(PredBlock)) {
LLVM_DEBUG(dbgs() << "Cannot optimize " << BB.getName() << " due to loop\n");
return;
}

auto *PredTerminator = PredBlock->getTerminator();
auto *PredBranchInst = dyn_cast<BranchInst>(PredTerminator);
@@ -134,6 +150,11 @@ void AMDGPUConditionalDiscard::optimizeBlock(BasicBlock &BB) {

auto *NewKill = cast<CallInst>(KillCand->clone());

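// Demote instead of kill: helper lanes remain live, so implicit
// derivative computations after this point stay valid.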
if (ConvertToDemote) {
NewKill->setCalledFunction(Intrinsic::getDeclaration(
KillCand->getModule(), Intrinsic::amdgcn_wqm_demote));
}

NewKill->setArgOperand(0, Cond);
NewKill->insertBefore(PredTerminator);

@@ -157,7 +178,7 @@ bool AMDGPUConditionalDiscard::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

for (auto &BB : F)
optimizeBlock(BB);
optimizeBlock(BB, EnableTransformDiscardToDemote);

for (auto *BB : KillBlocksToRemove) {
for (auto *Succ : successors(BB)) {
@@ -173,10 +194,10 @@ }
}

INITIALIZE_PASS_BEGIN(AMDGPUConditionalDiscard, DEBUG_TYPE,
"Transform conditional discard", false, false)
"AMDGPUConditionalDiscard", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUConditionalDiscard, DEBUG_TYPE,
"Transform conditional discard", false, false)
"AMDGPUConditionalDiscard", false, false)

FunctionPass *llvm::createAMDGPUConditionalDiscardPass() {
return new AMDGPUConditionalDiscard();
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -849,6 +849,9 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.

if (EnableConditionalDiscardTransformations)
addPass(createAMDGPUConditionalDiscardPass());

// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -866,9 +869,6 @@ bool GCNPassConfig::addPreISel() {
// outside of the same library needs to be resolved in llvm core code.
addPass(createLCSSAPass());

if (EnableConditionalDiscardTransformations)
addPass(createAMDGPUConditionalDiscardPass());

addPass(createAMDGPUAnnotateUniformValues());
if (!LateCFGStructurize) {
addPass(createSIAnnotateControlFlowPass());
70 changes: 67 additions & 3 deletions llvm/test/CodeGen/AMDGPU/discard-optimization.ll
@@ -1,11 +1,11 @@
; RUN: llc -amdgpu-conditional-discard-transformations=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-conditional-discard-transformations=1 --march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,KILL %s
; RUN: llc -amdgpu-conditional-discard-transformations=1 -amdgpu-transform-discard-to-demote --march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEMOTE %s

; Check that the branch is removed by the discard opt.

; GCN-LABEL: {{^}}if_with_kill_true_cond:
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NOT: branch
define amdgpu_ps void @if_with_kill_true_cond(i32 %arg) {
.entry:
%cmp = icmp eq i32 %arg, 32
@@ -24,7 +24,6 @@ endif:
; GCN-LABEL: {{^}}if_with_kill_false_cond:
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NOT: branch
define amdgpu_ps void @if_with_kill_false_cond(i32 %arg) {
.entry:
%cmp = icmp eq i32 %arg, 32
@@ -127,8 +126,73 @@ endif:
ret void
}


; GCN-LABEL: {{^}}wqm_kill_to_demote1:
; GCN-NEXT: ; %.entry
; GCN: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_wqm_b64 exec, exec
; DEMOTE: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc
; GCN: image_sample
; GCN: v_add_f32_e32
; DEMOTE: s_and_b64 exec, exec, [[LIVE]]
; KILL: s_and_b64 exec, exec, [[ORIG]]
; GCN: image_sample
define amdgpu_ps <4 x float> @wqm_kill_to_demote1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
.entry:
%z.cmp = fcmp olt float %z, 0.0
br i1 %z.cmp, label %.continue, label %.kill

.kill:
call void @llvm.amdgcn.kill(i1 false)
br label %.export

.continue:
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%tex1 = extractelement <4 x float> %tex, i32 0
%coord1 = fadd float %tex0, %tex1
%rtex.src = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
br label %.export

.export:
%rtex = phi <4 x float> [ undef, %.kill ], [ %rtex.src, %.continue ]
ret <4 x float> %rtex
}


; GCN-LABEL: {{^}}wqm_kill_to_demote2:
; GCN-NEXT: ; %.entry
; GCN: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_wqm_b64 exec, exec
; GCN: image_sample
; DEMOTE: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc
; GCN: v_add_f32_e32
; DEMOTE: s_and_b64 exec, exec, [[LIVE]]
; KILL: s_and_b64 exec, exec, [[ORIG]]
; GCN: image_sample
define amdgpu_ps <4 x float> @wqm_kill_to_demote2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
.entry:
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%tex1 = extractelement <4 x float> %tex, i32 0
%z.cmp = fcmp olt float %tex0, 0.0
br i1 %z.cmp, label %.continue, label %.kill

.kill:
call void @llvm.amdgcn.kill(i1 false)
br label %.continue

.continue:
%coord1 = fadd float %tex0, %tex1
%rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

ret <4 x float> %rtex
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
declare void @llvm.amdgcn.kill(i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
