diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index bc02d22c2f74f1..8b27ae1dfa700d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1276,10 +1276,7 @@ def int_amdgcn_interp_p2_f16 : // Pixel shaders only: whether the current pixel is live (i.e. not a helper // invocation for derivative computation). -def int_amdgcn_ps_live : Intrinsic < - [llvm_i1_ty], - [], - [IntrNoMem]>; +def int_amdgcn_ps_live : Intrinsic <[llvm_i1_ty], [], []>; def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, @@ -1493,6 +1490,9 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], // If false, set EXEC=0 for the current thread until the end of program. def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; +// If false, mark all active lanes as helper lanes until the end of program. +def int_amdgcn_wqm_demote : Intrinsic<[], [llvm_i1_ty], []>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. with all channels diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index 87e63fcc4a04fa..8c85309a3bd0c8 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -270,6 +270,7 @@ void SIInsertSkips::kill(MachineInstr &MI) { } break; } + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: { const MachineFunction *MF = MI.getParent()->getParent(); const GCNSubtarget &ST = MF->getSubtarget(); @@ -486,10 +487,12 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: MadeChange = true; kill(MI); - if (ExecBranchStack.empty()) { + if (ExecBranchStack.empty() && + MI.getOpcode() != AMDGPU::SI_DEMOTE_I1_TERMINATOR) { if (NextBB != BE && skipIfDead(MI, *NextBB)) { HaveSkipBlock = true; NextBB = std::next(BI); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a802c816847452..3ef6a13750cde0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1426,6 +1426,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -1907,15 +1919,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. return true; default: @@ -6129,6 +6144,7 @@ bool SIInstrInfo::isKillTerminator(unsigned Opcode) { switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: return true; default: return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 5f0cbc3310d672..16c8dd6cca1361 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -195,6 +195,7 @@ let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; +def S_AND_B64_term : WrapTerminatorInst; } let WaveSizePredicate = isWave32 in { @@ -202,6 +203,7 @@ def S_MOV_B32_term : WrapTerminatorInst; def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; +def S_AND_B32_term : WrapTerminatorInst; } def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), @@ -323,12 +325,24 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { let isBranch = 1; } +let Uses = [EXEC] in { + def SI_PS_LIVE : PseudoInstSI < (outs SReg_1:$dst), (ins), [(set i1:$dst, (int_amdgcn_ps_live))]> { let SALU = 1; } +let Defs = [EXEC] in { +def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { +} +def SI_DEMOTE_I1_TERMINATOR : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { + let isTerminator = 1; +} +} // End Defs = [EXEC] + +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -652,6 +666,16 @@ def : Pat < (SI_KILL_I1_PSEUDO $src, -1) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 $src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 $src, -1) +>; + def : Pat < (AMDGPUkill i32:$src), (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index cc9b46a755823b..be45c44fe5b785 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -214,6 +214,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index cb4cf68d709ad6..7b452ba447059f 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -132,6 +132,9 @@ struct BlockInfo { char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + unsigned LiveMaskIn = 0; // Initial live mask register + unsigned LiveMaskOut = 0; // Outgoing live mask register }; struct WorkItem { @@ -154,8 +157,15 @@ class SIWholeQuadMode : public MachineFunctionPass { DenseMap Instructions; DenseMap Blocks; - SmallVector LiveMaskQueries; + + // Tracks live mask output of instructions + DenseMap LiveMaskRegs; + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + + SmallVector LiveMaskQueries; SmallVector LowerToCopyInstrs; + SmallVector DemoteInstrs; void printInfo(); @@ -168,6 +178,10 @@ class SIWholeQuadMode : public MachineFunctionPass { void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); + void scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist); + void analyzeLiveLanes(MachineFunction &MF); + bool requiresCorrectState(const MachineInstr &MI) const; MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, @@ -175,7 +189,7 @@ class SIWholeQuadMode : public MachineFunctionPass { MachineBasicBlock::iterator prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, MachineBasicBlock::iterator Last, bool PreferLast, - bool SaveSCC); + bool SaveSCC, bool CheckPhys); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -183,11 +197,28 @@ class SIWholeQuadMode : public MachineFunctionPass { void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + unsigned SavedOrig, char NonWWMState); + + bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI); + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI); + void lowerBlock(MachineBasicBlock &MBB); + + unsigned findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before); + void processBlock(MachineBasicBlock &MBB, bool isEntry); - void lowerLiveMaskQueries(unsigned LiveMaskReg); + bool lowerLiveMaskQueries(unsigned LiveMaskReg); void lowerCopyInstrs(); + bool lowerDemoteInstrs(); + + void lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskReg, + bool isWQM); + bool lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM); public: static char ID; @@ -201,9 +232,6 @@ class SIWholeQuadMode : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addPreserved(); - AU.addPreserved(); - AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -382,6 +410,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } else { if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_DEMOTE_I1) { + DemoteInstrs.push_back(&MI); } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -523,6 +553,115 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { return GlobalFlags; } +// Trace live mask manipulate through block, creating new virtual registers. +// Additionally insert PHI nodes when block has multiple predecessors +// which manipulated the mask. +void SIWholeQuadMode::scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist) { + BlockInfo &BI = Blocks[&MBB]; + + if (BI.LiveMaskIn && BI.LiveMaskOut) + return; // Block has been fully traced already. + + if (!BI.LiveMaskIn) { + // Find the incoming live mask, or insert PHI if there are multiple. + unsigned LastPredReg = 0; + unsigned Count = 0; + bool Valid = true; + + // Find predecessor live masks while performing basic deduplication. + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) { + Valid = false; + break; + } + if (PredBI.LiveMaskOut != LastPredReg) { + LastPredReg = PredBI.LiveMaskOut; + Count++; + } + } + + if (Valid) { + // All predecessors have live mask outputs. + if (Count > 1) { + BI.LiveMaskIn = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstrBuilder PHI = BuildMI(MBB, MBB.begin(), DebugLoc(), + TII->get(TargetOpcode::PHI), + BI.LiveMaskIn); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + PHI.addReg(PredBI.LiveMaskOut); + PHI.addMBB(Pred); + } + LIS->InsertMachineInstrInMaps(*PHI); + } else { + BI.LiveMaskIn = LastPredReg; + } + } else { + // Not all predecessor blocks have live mask outputs, + // so this block will need to be revisited. + + if (!BI.LiveMaskOut) { + // Give this block a live mask output to ensure forward progress. + BI.LiveMaskOut = MRI->createVirtualRegister(TRI->getBoolRC()); + } + + // Queue this block to be revisited and visit predecessors. + Worklist.push_back(&MBB); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) + Worklist.push_back(Pred); + } + return; + } + } + + assert(BI.LiveMaskIn); + + // Now that the initial live mask register is known the block can + // be traced and intermediate live mask registers assigned for instructions + // which manipulate the mask. + unsigned CurrentLive = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + unsigned NewLive = MRI->createVirtualRegister(TRI->getBoolRC()); + LiveMaskRegs[&MI] = NewLive; + CurrentLive = NewLive; + } + II++; + } + + // If an output register was assigned to guarantee forward progress + // then it is possible the current live register will not become the output + // live mask register. This will be resolved during block lowering. + if (!BI.LiveMaskOut) { + BI.LiveMaskOut = CurrentLive; + } +} + +// Scan blocks for live mask manipulation operations in reverse post order +// to minimise rescans: a block will have to be rescanned if it's +// predecessors live mask output is not defined. +void SIWholeQuadMode::analyzeLiveLanes(MachineFunction &MF) { + std::vector Worklist; + + ReversePostOrderTraversal RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + scanLiveLanes(MBB, Worklist); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.back(); + Worklist.pop_back(); + scanLiveLanes(*MBB, Worklist); + } +} + /// Whether \p MI really requires the exec state computed during analysis. /// /// Scalar instructions must occasionally be marked WQM for correct propagation @@ -577,7 +716,8 @@ SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, // instructions we want to add necessarily clobber SCC. MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock &MBB, MachineBasicBlock::iterator First, - MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC, + bool CheckPhys) { if (!SaveSCC) return PreferLast ? Last : First; @@ -610,9 +750,24 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock::iterator MBBI; - if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) { MBBI = MI; - else { + if (CheckPhys) { + // Make sure insertion point is after any COPY instructions + // accessing physical live in registers. This is ensures that + // block splitting does not occur before all live ins have been copied. + while (MBBI != Last) { + if (MBBI->getOpcode() != AMDGPU::COPY) + break; + unsigned Src = MBBI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(Src) && MBB.isLiveIn(Src)) { + MBBI++; + } else { + break; + } + } + } + } else { assert(Idx == LIS->getMBBEndIdx(&MBB)); MBBI = MBB.end(); } @@ -643,6 +798,7 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, @@ -662,6 +818,7 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, @@ -673,11 +830,13 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig) { + unsigned SavedOrig, + char NonWWMState) { MachineInstr *MI; assert(SavedOrig); @@ -685,20 +844,272 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; +} + +void SIWholeQuadMode::lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskReg, + bool isWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Dest = MI.getOperand(0).getReg(); + MachineInstr *Copy = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dest) + .addReg(LiveMaskReg); + LIS->ReplaceMachineInstrInMaps(MI, *Copy); + MBB.remove(&MI); +} + +// Lower an instruction which demotes lanes to helpers by adding +// appropriate live mask manipulation. Note this is also applied to kills. +bool SIWholeQuadMode::lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM) { + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned AndN2 = + ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned And = + ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *NewMI = nullptr; + bool NeedSplit = false; + + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + if (Imm == KillVal) { + NewMI = BuildMI(MBB, MI, DL, + TII->get(AndN2), + LiveMaskOut) + .addReg(LiveMaskIn) + .addReg(Exec); + } + } else { + unsigned Opcode = KillVal ? AndN2 : And; + NewMI = BuildMI(MBB, MI, DL, + TII->get(Opcode), + LiveMaskOut) + .addReg(LiveMaskIn) + .add(Op); + } + + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + if (isWQM) { + // Inside WQM demotes are replaced with live mask manipulation + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + } else { + // Outside WQM demotes become kills terminating the block + NeedSplit = true; + } + } + + if (NewMI) { + LIS->InsertMachineInstrInMaps(*NewMI); + } + + return NeedSplit; +} + +bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB, + MachineInstr *MI) { + // Cannot split immediately before the epilog + // because there are values in physical registers + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + return false; + } + + // Do not split inside a waterfall intrinsic pair + MachineBasicBlock::iterator II = BB->getFirstNonPHI(); + MachineBasicBlock::iterator IE = BB->end(); + bool InWaterfall = false; + while (II != IE) { + if (&*II == MI) + return !InWaterfall; + + switch (II->getOpcode()) { + case AMDGPU::SI_WATERFALL_BEGIN_V1: + case AMDGPU::SI_WATERFALL_BEGIN_V2: + case AMDGPU::SI_WATERFALL_BEGIN_V4: + case AMDGPU::SI_WATERFALL_BEGIN_V8: + InWaterfall = true; + break; + case AMDGPU::SI_WATERFALL_END_V1: + case AMDGPU::SI_WATERFALL_END_V2: + case AMDGPU::SI_WATERFALL_END_V4: + case AMDGPU::SI_WATERFALL_END_V8: + InWaterfall = false; + break; + default: + break; + } + II++; + } + + return true; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + MachineBasicBlock::iterator SplitPoint(TermMI); + SplitPoint++; + + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) + << " @ " << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = nullptr; + + // Only split the block if the split point is not + // already the end of the block. + if (SplitPoint != BB->getFirstTerminator()) { + MachineFunction *MF = BB->getParent(); + SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + MachineFunction::iterator MBBI(BB); + ++MBBI; + MF->insert(MBBI, SplitBB); + + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + SplitBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(SplitBB); + } + + // Convert last instruction in to a terminator. + // Note: this only covers the expected patterns + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term)); + break; + case AMDGPU::S_AND_B64: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term)); + break; + case AMDGPU::SI_DEMOTE_I1: + TermMI->setDesc(TII->get(AMDGPU::SI_DEMOTE_I1_TERMINATOR)); + break; + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: + break; + default: + if (BB->getFirstTerminator() == BB->end()) { + assert(SplitBB != nullptr); + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + } + break; + } + + return SplitBB; +} + +// Replace (or supplement) instructions accessing live mask. +// This can only happen once all the live mask registers have been created +// and the execute state (WQM/WWM/Exact) of instructions is known. +void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) + << ":\n"); + const BlockInfo &BI = BII->second; + SmallVector SplitPoints; + unsigned LiveMaskReg = BI.LiveMaskIn; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) { + // Mark transitions to Exact mode as split points so they become + // block terminators. + if (State != StateTransition[&MI] && StateTransition[&MI] == StateExact) { + if (State != StateWWM && canSplitBlockAt(&MBB, &MI)) + SplitPoints.push_back(&MI); + } + State = StateTransition[&MI]; + } + + switch (MI.getOpcode()) { + case AMDGPU::SI_PS_LIVE: + lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM); + break; + case AMDGPU::SI_DEMOTE_I1: { + bool NeedSplit = lowerDemote(MBB, MI, LiveMaskReg, + LiveMaskRegs[&MI], + State == StateWQM); + if (NeedSplit) + SplitPoints.push_back(&MI); + break; + } + default: + break; + } + + if (LiveMaskRegs.count(&MI)) + LiveMaskReg = LiveMaskRegs[&MI]; + + II = Next; + } + + if (BI.LiveMaskOut != LiveMaskReg) { + // If the final live mask register does not match the expected + // register of successor blocks then insert a copy. + MachineBasicBlock::instr_iterator Terminator = + MBB.getFirstInstrTerminator(); + MachineInstr *MI = BuildMI(MBB, Terminator, DebugLoc(), + TII->get(AMDGPU::COPY), BI.LiveMaskOut) + .addReg(LiveMaskReg); + LIS->InsertMachineInstrInMaps(*MI); + } + + // Perform splitting after instruction scan to simplify iteration. + if (!SplitPoints.empty()) { + MachineBasicBlock *BB = &MBB; + for (MachineInstr *MI : SplitPoints) { + BB = splitBlock(BB, MI); + } + } +} + +unsigned SIWholeQuadMode::findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before) { + assert(BI.LiveMaskIn); + if (BI.LiveMaskIn == BI.LiveMaskOut) + return BI.LiveMaskIn; + + // FIXME: make this more efficient than scanning all instructions in a block + unsigned LiveMaskReg = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + + while ((II != IE) && (II != Before)) { + MachineInstr *I = &*II; + if (LiveMaskRegs.count(I)) + LiveMaskReg = LiveMaskRegs[I]; + II++; + } + + assert(LiveMaskReg); + return LiveMaskReg; +} + +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + BlockInfo &BI = BII->second; + // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; + } LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); @@ -723,6 +1134,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, // FirstWQM since if it's safe to switch to/from WWM, it must be safe to // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + + // Record initial state is block information. + BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -786,11 +1201,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, MachineBasicBlock::iterator Before = prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + Needs == StateExact || WQMFromExec, + Needs == StateExact && isEntry); if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); State = NonWWMState; } @@ -804,7 +1220,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(BoolRC); - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg, findLiveMaskReg(MBB, BI, Before)); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -833,11 +1249,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (II == IE) break; + II = Next; } } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +bool SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + bool Changed = false; for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -847,7 +1265,19 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +bool SIWholeQuadMode::lowerDemoteInstrs() { + bool Changed = false; + for (MachineInstr *MI : DemoteInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + splitBlock(MBB, MI); + Changed = true; } + return Changed; } void SIWholeQuadMode::lowerCopyInstrs() { @@ -878,6 +1308,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Blocks.clear(); LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); + DemoteInstrs.clear(); + LiveMaskRegs.clear(); + StateTransition.clear(); + CallingConv = MF.getFunction().getCallingConv(); ST = &MF.getSubtarget(); @@ -887,38 +1321,52 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); LIS = &getAnalysis(); - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = + !(DemoteInstrs.empty() && LiveMaskQueries.empty()); + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned LiveMaskReg = Exec; + + if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty()) { + // Shader only needs Exact mode + const bool LoweredQueries = lowerLiveMaskQueries(LiveMaskReg); + const bool LoweredDemotes = lowerDemoteInstrs(); + return LoweredQueries || LoweredDemotes; + } - lowerLiveMaskQueries(LiveMaskReg); + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); + + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + } + + if ((GlobalFlags == StateWQM) && DemoteInstrs.empty()) { + // Shader only needs WQM + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); + return true; + } - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + BlockInfo &BI = Blocks[&Entry]; + BI.LiveMaskIn = LiveMaskReg; + analyzeLiveLanes(MF); + } else { + for (auto BII : Blocks) { + BlockInfo &BI = Blocks[&*BII.first]; + BI.LiveMaskIn = LiveMaskReg; + BI.LiveMaskOut = LiveMaskReg; } } @@ -926,14 +1374,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + for (auto BII : Blocks) { + processBlock(*BII.first, BII.first == &Entry); + } - // Physical registers like SCC aren't tracked by default anyway, so just - // removing the ranges we computed is the simplest option for maintaining - // the analysis results. - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + // Lowering blocks causes block splitting. + // Hence live ranges and slot indexes cease to be valid here. + for (auto BII : Blocks) { + lowerBlock(*BII.first); + } + } else { + lowerLiveMaskQueries(LiveMaskReg); + lowerDemoteInstrs(); + } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 index 00000000000000..1a4bde2c9f31e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,271 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,GCN-64 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GCN-64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-64 %s + +; GCN-LABEL: {{^}}static_exact: +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}dynamic_exact: +; GCN-32: v_cmp_le_f32_e64 [[CND:s[0-9]+]], 0, v1 +; GCN-64: v_cmp_le_f32_e64 [[CND:s\[[0-9]+:[0-9]+\]]], 0, v1 +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[CND]] +; GCN-64: s_and_b64 exec, exec, [[CND]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}branch: +; GCN-32: s_and_saveexec_b32 s1, s0 +; GCN-64: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-32: s_xor_b32 s0, exec_lo, s1 +; GCN-64: s_xor_b64 s[0:1], exec, s[2:3] +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN-32: s_or_b32 exec_lo, exec_lo, s0 +; GCN-64: s_or_b64 exec, exec, s[0:1] +; GCN: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @branch(float %arg0, float %arg1) { +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +; GCN-LABEL: {{^}}wqm_demote_1: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue +; GCN: image_sample +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_2: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +; GCN-LABEL: {{^}}wqm_deriv: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue0 +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], 1.0, 0, [[LIVE]] +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32-NEXT: s_mov_b32 exec_lo, 0 +; GCN-64-NEXT: s_mov_b64 exec, 0 +; GCN: ; %.continue1 +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.ps.live() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +; GCN-LABEL: {{^}}wqm_deriv_loop: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue0.preheader +; GCN: ; %.continue0 +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:v[0-9]+]], 0, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32: s_andn2_b32 [[LIVE]], [[LIVE]], exec +; GCN-64: s_andn2_b64 [[LIVE]], [[LIVE]], exec +; GCN: ; %.return +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.ps.live() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.ps.live() #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone }