Skip to content

Commit

Permalink
[AMDGPU] Add llvm.amdgcn.wqm.demote intrinsic
Browse files Browse the repository at this point in the history
Add an intrinsic which demotes all active lanes to helper lanes.
This is used to implement the demote-to-helper Vulkan extension.

In practice demoting a lane to helper simply means removing it
from the mask of live lanes used for WQM/WWM/Exact mode.
Where the shader does not use WQM, demotes become kills.

To support this, live lanes must now be tracked through the entire
shader.  This involves adding PHI nodes during the WQM pass,
which can expose exec mask change issues.  This is overcome
by splitting blocks on changes from WQM/WWM to Exact mode.
As a result the WQM pass no longer preserves the CFG, slot indexes
or live intervals, as there is no way of maintaining these when blocks
are split.

Change-Id: I086fa95a1ba0880aa2799c0430242ecd02eb0a11
  • Loading branch information
perlfu authored and Tim Renouf committed Oct 10, 2019
1 parent cc72be2 commit 8e41591
Show file tree
Hide file tree
Showing 7 changed files with 840 additions and 60 deletions.
8 changes: 4 additions & 4 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1276,10 +1276,7 @@ def int_amdgcn_interp_p2_f16 :

// Pixel shaders only: whether the current pixel is live (i.e. not a helper
// invocation for derivative computation).
def int_amdgcn_ps_live : Intrinsic <
[llvm_i1_ty],
[],
[IntrNoMem]>;
def int_amdgcn_ps_live : Intrinsic <[llvm_i1_ty], [], []>;

def int_amdgcn_mbcnt_lo :
GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
Expand Down Expand Up @@ -1493,6 +1490,9 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
// If false, set EXEC=0 for the current thread until the end of program.
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[], [llvm_i1_ty], []>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ void SIInsertSkips::kill(MachineInstr &MI) {
}
break;
}
case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
case AMDGPU::SI_KILL_I1_TERMINATOR: {
const MachineFunction *MF = MI.getParent()->getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
Expand Down Expand Up @@ -486,10 +487,12 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {

case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
case AMDGPU::SI_KILL_I1_TERMINATOR:
case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
MadeChange = true;
kill(MI);

if (ExecBranchStack.empty()) {
if (ExecBranchStack.empty() &&
MI.getOpcode() != AMDGPU::SI_DEMOTE_I1_TERMINATOR) {
if (NextBB != BE && skipIfDead(MI, *NextBB)) {
HaveSkipBlock = true;
NextBB = std::next(BI);
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_ANDN2_B32));
break;

case AMDGPU::S_AND_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_AND_B64));
break;

case AMDGPU::S_AND_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_AND_B32));
break;

case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Expand Down Expand Up @@ -1907,15 +1919,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
case AMDGPU::SI_KILL_I1_TERMINATOR:
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
// FIXME: It's messy that these need to be considered here at all.
return true;
default:
Expand Down Expand Up @@ -6129,6 +6144,7 @@ bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
case AMDGPU::SI_KILL_I1_TERMINATOR:
case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
return true;
default:
return false;
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,15 @@ let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
}

// 32-bit "_term" variants, defined only under the isWave32 predicate. Each
// record wraps the named scalar instruction through WrapTerminatorInst.
// NOTE(review): the C++ expansion code describes these forms as terminators
// only so that spill code is placed correctly during register allocation;
// they are rewritten back to the plain instruction afterwards.
let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
// 32-bit counterpart of S_AND_B64_term.
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
}

def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
Expand Down Expand Up @@ -323,12 +325,24 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
let isBranch = 1;
}

// Every record in this scope gets Uses = [EXEC], i.e. EXEC is listed as an
// implicit use of the instruction.
let Uses = [EXEC] in {

// Pseudo selected for the llvm.amdgcn.ps.live intrinsic; it produces an i1
// result in $dst and takes no explicit inputs. Flagged as a SALU instruction.
def SI_PS_LIVE : PseudoInstSI <
(outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_ps_live))]> {
let SALU = 1;
}

// The demote pseudos additionally list EXEC as an implicit def.
let Defs = [EXEC] in {
// Demote pseudo with no result: $src is the i1 condition and $killvalue is an
// immediate i1 flag.
// NOTE(review): $killvalue appears to encode whether the condition is used
// as-is (0) or negated (-1) -- confirm against the selection patterns.
def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
}
// Same operands as SI_DEMOTE_I1, but marked as a block terminator.
def SI_DEMOTE_I1_TERMINATOR : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
let isTerminator = 1;
}
} // End Defs = [EXEC]

} // End Uses = [EXEC]

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
[(int_amdgcn_unreachable)],
"; divergent unreachable"> {
Expand Down Expand Up @@ -652,6 +666,16 @@ def : Pat <
(SI_KILL_I1_PSEUDO $src, -1)
>;

// Select llvm.amdgcn.wqm.demote on a plain i1 condition to SI_DEMOTE_I1,
// passing 0 as the immediate second operand.
def : Pat <
(int_amdgcn_wqm_demote i1:$src),
(SI_DEMOTE_I1 $src, 0)
>;

// When the condition is a logical NOT, match the un-negated value directly
// and pass -1 as the immediate instead, so no separate NOT is selected.
def : Pat <
(int_amdgcn_wqm_demote (i1 (not i1:$src))),
(SI_DEMOTE_I1 $src, -1)
>;

def : Pat <
(AMDGPUkill i32:$src),
(SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
return true;
}
case AMDGPU::S_AND_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(TII.get(AMDGPU::S_AND_B64));
return true;
}
case AMDGPU::S_AND_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(TII.get(AMDGPU::S_AND_B32));
return true;
}
default:
return false;
}
Expand Down
Loading

0 comments on commit 8e41591

Please sign in to comment.