[AMDGPU] Add llvm.amdgcn.wqm.demote intrinsic

Add intrinsic which demotes all active lanes to helper lanes. This is used to implement demote to helper vulkan extension. In practice demoting a lane to helper simply means removing it from the mask of live lanes used for WQM/WWM/Exact mode. Where the shader does not use WQM demotes become kills. To support this live lanes must now be tracked through entire shader. This involves adding PHI nodes during WQM pass, which can expose exec mask change issues. This is overcome by split blocks on changes from WQM/WWM to Exact mode. As a result the WQM pass no longer preserves CFG, slot indexes or live intervals as these is no way of maintaining when blocks are split. Change-Id: I086fa95a1ba0880aa2799c0430242ecd02eb0a11
jaebaek · Oct 10, 2019 · 8e41591 · 8e41591
1 parent cc72be2
commit 8e41591
Show file tree

Hide file tree

Showing 7 changed files with 840 additions and 60 deletions.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1276,10 +1276,7 @@ def int_amdgcn_interp_p2_f16 :
 
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
-def int_amdgcn_ps_live : Intrinsic <
-  [llvm_i1_ty],
-  [],
-  [IntrNoMem]>;
+def int_amdgcn_ps_live : Intrinsic <[llvm_i1_ty], [], []>;
 
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
@@ -1493,6 +1490,9 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
 // If false, set EXEC=0 for the current thread until the end of program.
 def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
 
+// If false, mark all active lanes as helper lanes until the end of program.
+def int_amdgcn_wqm_demote : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels

diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -270,6 +270,7 @@ void SIInsertSkips::kill(MachineInstr &MI) {
     }
     break;
   }
+  case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
   case AMDGPU::SI_KILL_I1_TERMINATOR: {
     const MachineFunction *MF = MI.getParent()->getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -486,10 +487,12 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
 
       case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       case AMDGPU::SI_KILL_I1_TERMINATOR:
+      case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
         MadeChange = true;
         kill(MI);
 
-        if (ExecBranchStack.empty()) {
+        if (ExecBranchStack.empty() &&
+            MI.getOpcode() != AMDGPU::SI_DEMOTE_I1_TERMINATOR) {
           if (NextBB != BE && skipIfDead(MI, *NextBB)) {
             HaveSkipBlock = true;
             NextBB = std::next(BI);

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1426,6 +1426,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
     break;
 
+  case AMDGPU::S_AND_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B64));
+    break;
+
+  case AMDGPU::S_AND_B32_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B32));
+    break;
+
   case AMDGPU::V_MOV_B64_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1907,15 +1919,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     case AMDGPU::S_MOV_B64_term:
     case AMDGPU::S_XOR_B64_term:
     case AMDGPU::S_ANDN2_B64_term:
+    case AMDGPU::S_AND_B64_term:
     case AMDGPU::S_MOV_B32_term:
     case AMDGPU::S_XOR_B32_term:
     case AMDGPU::S_OR_B32_term:
     case AMDGPU::S_ANDN2_B32_term:
+    case AMDGPU::S_AND_B32_term:
       break;
     case AMDGPU::SI_IF:
     case AMDGPU::SI_ELSE:
     case AMDGPU::SI_KILL_I1_TERMINATOR:
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+    case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
       // FIXME: It's messy that these need to be considered here at all.
       return true;
     default:
@@ -6129,6 +6144,7 @@ bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
   switch (Opcode) {
   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
   case AMDGPU::SI_KILL_I1_TERMINATOR:
+  case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
     return true;
   default:
     return false;

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -195,13 +195,15 @@ let WaveSizePredicate = isWave64 in {
 def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
 }
 
 let WaveSizePredicate = isWave32 in {
 def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
 def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
 }
 
 def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
@@ -323,12 +325,24 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
   let isBranch = 1;
 }
 
+let Uses = [EXEC] in {
+
 def SI_PS_LIVE : PseudoInstSI <
   (outs SReg_1:$dst), (ins),
   [(set i1:$dst, (int_amdgcn_ps_live))]> {
   let SALU = 1;
 }
 
+let Defs = [EXEC] in {
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
+}
+def SI_DEMOTE_I1_TERMINATOR : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
+  let isTerminator = 1;
+}
+} // End Defs = [EXEC]
+
+} // End Uses = [EXEC]
+
 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   [(int_amdgcn_unreachable)],
   "; divergent unreachable"> {
@@ -652,6 +666,16 @@ def : Pat <
   (SI_KILL_I1_PSEUDO $src, -1)
 >;
 
+def : Pat <
+  (int_amdgcn_wqm_demote i1:$src),
+  (SI_DEMOTE_I1 $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+  (SI_DEMOTE_I1 $src, -1)
+>;
+
 def : Pat <
   (AMDGPUkill i32:$src),
   (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE

diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -214,6 +214,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
     MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
     return true;
   }
+  case AMDGPU::S_AND_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+    return true;
+  }
+  case AMDGPU::S_AND_B32_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+    return true;
+  }
   default:
     return false;
   }