From 8e415910ec73dae58099d0dc9de0f4be143bc671 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Fri, 5 Jul 2019 17:56:45 +0100
Subject: [PATCH] [AMDGPU] Add llvm.amdgcn.wqm.demote intrinsic

Add an intrinsic which demotes all active lanes to helper lanes.
This is used to implement the Vulkan demote-to-helper extension.

In practice, demoting a lane to a helper simply means removing it from
the mask of live lanes used for WQM/WWM/Exact mode. Where the shader
does not use WQM, demotes become kills.

To support this, live lanes must now be tracked through the entire
shader. This involves adding PHI nodes during the WQM pass, which can
expose exec mask change issues. This is overcome by splitting blocks on
changes from WQM/WWM to Exact mode. As a result, the WQM pass no longer
preserves the CFG, slot indexes, or live intervals, as there is no way
of maintaining these when blocks are split.

Change-Id: I086fa95a1ba0880aa2799c0430242ecd02eb0a11
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   8 +-
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp      |   5 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  16 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  24 +
 .../Target/AMDGPU/SIOptimizeExecMasking.cpp   |  12 +
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    | 564 ++++++++++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll  | 271 +++++++++
 7 files changed, 840 insertions(+), 60 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index bc02d22c2f74f1..8b27ae1dfa700d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1276,10 +1276,7 @@ def int_amdgcn_interp_p2_f16 : // Pixel shaders only: whether the current pixel is live (i.e. not a helper // invocation for derivative computation). -def int_amdgcn_ps_live : Intrinsic < - [llvm_i1_ty], - [], - [IntrNoMem]>; +def int_amdgcn_ps_live : Intrinsic <[llvm_i1_ty], [], []>; def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, @@ -1493,6 +1490,9 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], // If false, set EXEC=0 for the current thread until the end of program. def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; +// If false, mark all active lanes as helper lanes until the end of program. +def int_amdgcn_wqm_demote : Intrinsic<[], [llvm_i1_ty], []>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e.
with all channels diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index 87e63fcc4a04fa..8c85309a3bd0c8 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -270,6 +270,7 @@ void SIInsertSkips::kill(MachineInstr &MI) { } break; } + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: { const MachineFunction *MF = MI.getParent()->getParent(); const GCNSubtarget &ST = MF->getSubtarget(); @@ -486,10 +487,12 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: MadeChange = true; kill(MI); - if (ExecBranchStack.empty()) { + if (ExecBranchStack.empty() && + MI.getOpcode() != AMDGPU::SI_DEMOTE_I1_TERMINATOR) { if (NextBB != BE && skipIfDead(MI, *NextBB)) { HaveSkipBlock = true; NextBB = std::next(BI); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a802c816847452..3ef6a13750cde0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1426,6 +1426,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -1907,15 +1919,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. 
return true; default: @@ -6129,6 +6144,7 @@ bool SIInstrInfo::isKillTerminator(unsigned Opcode) { switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: return true; default: return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 5f0cbc3310d672..16c8dd6cca1361 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -195,6 +195,7 @@ let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; +def S_AND_B64_term : WrapTerminatorInst; } let WaveSizePredicate = isWave32 in { @@ -202,6 +203,7 @@ def S_MOV_B32_term : WrapTerminatorInst; def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; +def S_AND_B32_term : WrapTerminatorInst; } def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), @@ -323,12 +325,24 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { let isBranch = 1; } +let Uses = [EXEC] in { + def SI_PS_LIVE : PseudoInstSI < (outs SReg_1:$dst), (ins), [(set i1:$dst, (int_amdgcn_ps_live))]> { let SALU = 1; } +let Defs = [EXEC] in { +def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { +} +def SI_DEMOTE_I1_TERMINATOR : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { + let isTerminator = 1; +} +} // End Defs = [EXEC] + +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -652,6 +666,16 @@ def : Pat < (SI_KILL_I1_PSEUDO $src, -1) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 $src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 $src, -1) +>; + def : Pat < (AMDGPUkill i32:$src), (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index cc9b46a755823b..be45c44fe5b785 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -214,6 +214,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. 
+ MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index cb4cf68d709ad6..7b452ba447059f 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -132,6 +132,9 @@ struct BlockInfo { char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + unsigned LiveMaskIn = 0; // Initial live mask register + unsigned LiveMaskOut = 0; // Outgoing live mask register }; struct WorkItem { @@ -154,8 +157,15 @@ class SIWholeQuadMode : public MachineFunctionPass { DenseMap Instructions; DenseMap Blocks; - SmallVector LiveMaskQueries; + + // Tracks live mask output of instructions + DenseMap LiveMaskRegs; + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + + SmallVector LiveMaskQueries; SmallVector LowerToCopyInstrs; + SmallVector DemoteInstrs; void printInfo(); @@ -168,6 +178,10 @@ class SIWholeQuadMode : public MachineFunctionPass { void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); + void scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist); + void analyzeLiveLanes(MachineFunction &MF); + bool requiresCorrectState(const MachineInstr &MI) const; MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, @@ -175,7 +189,7 @@ class SIWholeQuadMode : public MachineFunctionPass { MachineBasicBlock::iterator prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, MachineBasicBlock::iterator Last, bool PreferLast, - bool SaveSCC); + bool SaveSCC, bool CheckPhys); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -183,11 +197,28 @@ class SIWholeQuadMode : public MachineFunctionPass { void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + unsigned SavedOrig, char NonWWMState); + + bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI); + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI); + void lowerBlock(MachineBasicBlock &MBB); + + unsigned findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before); + void processBlock(MachineBasicBlock &MBB, bool isEntry); - void lowerLiveMaskQueries(unsigned LiveMaskReg); + bool lowerLiveMaskQueries(unsigned LiveMaskReg); void lowerCopyInstrs(); + bool lowerDemoteInstrs(); + + void lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskReg, + bool isWQM); + bool lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM); public: static char ID; @@ -201,9 +232,6 @@ class SIWholeQuadMode : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addPreserved(); - AU.addPreserved(); - AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -382,6 +410,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } else { if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_DEMOTE_I1) { + DemoteInstrs.push_back(&MI); } else if (WQMOutputs) 
{ // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -523,6 +553,115 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { return GlobalFlags; } +// Trace live mask manipulate through block, creating new virtual registers. +// Additionally insert PHI nodes when block has multiple predecessors +// which manipulated the mask. +void SIWholeQuadMode::scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist) { + BlockInfo &BI = Blocks[&MBB]; + + if (BI.LiveMaskIn && BI.LiveMaskOut) + return; // Block has been fully traced already. + + if (!BI.LiveMaskIn) { + // Find the incoming live mask, or insert PHI if there are multiple. + unsigned LastPredReg = 0; + unsigned Count = 0; + bool Valid = true; + + // Find predecessor live masks while performing basic deduplication. + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) { + Valid = false; + break; + } + if (PredBI.LiveMaskOut != LastPredReg) { + LastPredReg = PredBI.LiveMaskOut; + Count++; + } + } + + if (Valid) { + // All predecessors have live mask outputs. + if (Count > 1) { + BI.LiveMaskIn = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstrBuilder PHI = BuildMI(MBB, MBB.begin(), DebugLoc(), + TII->get(TargetOpcode::PHI), + BI.LiveMaskIn); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + PHI.addReg(PredBI.LiveMaskOut); + PHI.addMBB(Pred); + } + LIS->InsertMachineInstrInMaps(*PHI); + } else { + BI.LiveMaskIn = LastPredReg; + } + } else { + // Not all predecessor blocks have live mask outputs, + // so this block will need to be revisited. + + if (!BI.LiveMaskOut) { + // Give this block a live mask output to ensure forward progress. + BI.LiveMaskOut = MRI->createVirtualRegister(TRI->getBoolRC()); + } + + // Queue this block to be revisited and visit predecessors. + Worklist.push_back(&MBB); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) + Worklist.push_back(Pred); + } + return; + } + } + + assert(BI.LiveMaskIn); + + // Now that the initial live mask register is known the block can + // be traced and intermediate live mask registers assigned for instructions + // which manipulate the mask. + unsigned CurrentLive = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + unsigned NewLive = MRI->createVirtualRegister(TRI->getBoolRC()); + LiveMaskRegs[&MI] = NewLive; + CurrentLive = NewLive; + } + II++; + } + + // If an output register was assigned to guarantee forward progress + // then it is possible the current live register will not become the output + // live mask register. This will be resolved during block lowering. + if (!BI.LiveMaskOut) { + BI.LiveMaskOut = CurrentLive; + } +} + +// Scan blocks for live mask manipulation operations in reverse post order +// to minimise rescans: a block will have to be rescanned if it's +// predecessors live mask output is not defined. 
+void SIWholeQuadMode::analyzeLiveLanes(MachineFunction &MF) { + std::vector Worklist; + + ReversePostOrderTraversal RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + scanLiveLanes(MBB, Worklist); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.back(); + Worklist.pop_back(); + scanLiveLanes(*MBB, Worklist); + } +} + /// Whether \p MI really requires the exec state computed during analysis. /// /// Scalar instructions must occasionally be marked WQM for correct propagation @@ -577,7 +716,8 @@ SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, // instructions we want to add necessarily clobber SCC. MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock &MBB, MachineBasicBlock::iterator First, - MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC, + bool CheckPhys) { if (!SaveSCC) return PreferLast ? Last : First; @@ -610,9 +750,24 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock::iterator MBBI; - if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) { MBBI = MI; - else { + if (CheckPhys) { + // Make sure insertion point is after any COPY instructions + // accessing physical live in registers. This is ensures that + // block splitting does not occur before all live ins have been copied. + while (MBBI != Last) { + if (MBBI->getOpcode() != AMDGPU::COPY) + break; + unsigned Src = MBBI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(Src) && MBB.isLiveIn(Src)) { + MBBI++; + } else { + break; + } + } + } + } else { assert(Idx == LIS->getMBBEndIdx(&MBB)); MBBI = MBB.end(); } @@ -643,6 +798,7 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, @@ -662,6 +818,7 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, @@ -673,11 +830,13 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig) { + unsigned SavedOrig, + char NonWWMState) { MachineInstr *MI; assert(SavedOrig); @@ -685,20 +844,272 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; +} + +void SIWholeQuadMode::lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskReg, + bool isWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Dest = MI.getOperand(0).getReg(); + MachineInstr *Copy = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dest) + .addReg(LiveMaskReg); + LIS->ReplaceMachineInstrInMaps(MI, *Copy); + MBB.remove(&MI); +} + +// Lower an instruction which demotes lanes to helpers by adding +// appropriate live mask manipulation. Note this is also applied to kills. +bool SIWholeQuadMode::lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM) { + const unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned AndN2 = + ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned And = + ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *NewMI = nullptr; + bool NeedSplit = false; + + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + if (Imm == KillVal) { + NewMI = BuildMI(MBB, MI, DL, + TII->get(AndN2), + LiveMaskOut) + .addReg(LiveMaskIn) + .addReg(Exec); + } + } else { + unsigned Opcode = KillVal ? AndN2 : And; + NewMI = BuildMI(MBB, MI, DL, + TII->get(Opcode), + LiveMaskOut) + .addReg(LiveMaskIn) + .add(Op); + } + + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + if (isWQM) { + // Inside WQM demotes are replaced with live mask manipulation + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + } else { + // Outside WQM demotes become kills terminating the block + NeedSplit = true; + } + } + + if (NewMI) { + LIS->InsertMachineInstrInMaps(*NewMI); + } + + return NeedSplit; +} + +bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB, + MachineInstr *MI) { + // Cannot split immediately before the epilog + // because there are values in physical registers + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + return false; + } + + // Do not split inside a waterfall intrinsic pair + MachineBasicBlock::iterator II = BB->getFirstNonPHI(); + MachineBasicBlock::iterator IE = BB->end(); + bool InWaterfall = false; + while (II != IE) { + if (&*II == MI) + return !InWaterfall; + + switch (II->getOpcode()) { + case AMDGPU::SI_WATERFALL_BEGIN_V1: + case AMDGPU::SI_WATERFALL_BEGIN_V2: + case AMDGPU::SI_WATERFALL_BEGIN_V4: + case AMDGPU::SI_WATERFALL_BEGIN_V8: + InWaterfall = true; + break; + case AMDGPU::SI_WATERFALL_END_V1: + case AMDGPU::SI_WATERFALL_END_V2: + case AMDGPU::SI_WATERFALL_END_V4: + case AMDGPU::SI_WATERFALL_END_V8: + InWaterfall = false; + break; + default: + break; + } + II++; + } + + return true; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + MachineBasicBlock::iterator SplitPoint(TermMI); + SplitPoint++; + + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) + << " @ " << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = nullptr; + + // Only split the block if the split point is not + // already the end of the block. + if (SplitPoint != BB->getFirstTerminator()) { + MachineFunction *MF = BB->getParent(); + SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + MachineFunction::iterator MBBI(BB); + ++MBBI; + MF->insert(MBBI, SplitBB); + + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + SplitBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(SplitBB); + } + + // Convert last instruction in to a terminator. 
+ // Note: this only covers the expected patterns + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term)); + break; + case AMDGPU::S_AND_B64: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term)); + break; + case AMDGPU::SI_DEMOTE_I1: + TermMI->setDesc(TII->get(AMDGPU::SI_DEMOTE_I1_TERMINATOR)); + break; + case AMDGPU::SI_DEMOTE_I1_TERMINATOR: + break; + default: + if (BB->getFirstTerminator() == BB->end()) { + assert(SplitBB != nullptr); + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + } + break; + } + + return SplitBB; +} + +// Replace (or supplement) instructions accessing live mask. +// This can only happen once all the live mask registers have been created +// and the execute state (WQM/WWM/Exact) of instructions is known. +void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) + << ":\n"); + const BlockInfo &BI = BII->second; + SmallVector SplitPoints; + unsigned LiveMaskReg = BI.LiveMaskIn; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) { + // Mark transitions to Exact mode as split points so they become + // block terminators. + if (State != StateTransition[&MI] && StateTransition[&MI] == StateExact) { + if (State != StateWWM && canSplitBlockAt(&MBB, &MI)) + SplitPoints.push_back(&MI); + } + State = StateTransition[&MI]; + } + + switch (MI.getOpcode()) { + case AMDGPU::SI_PS_LIVE: + lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM); + break; + case AMDGPU::SI_DEMOTE_I1: { + bool NeedSplit = lowerDemote(MBB, MI, LiveMaskReg, + LiveMaskRegs[&MI], + State == StateWQM); + if (NeedSplit) + SplitPoints.push_back(&MI); + break; + } + default: + break; + } + + if (LiveMaskRegs.count(&MI)) + LiveMaskReg = LiveMaskRegs[&MI]; + + II = Next; + } + + if (BI.LiveMaskOut != LiveMaskReg) { + // If the final live mask register does not match the expected + // register of successor blocks then insert a copy. + MachineBasicBlock::instr_iterator Terminator = + MBB.getFirstInstrTerminator(); + MachineInstr *MI = BuildMI(MBB, Terminator, DebugLoc(), + TII->get(AMDGPU::COPY), BI.LiveMaskOut) + .addReg(LiveMaskReg); + LIS->InsertMachineInstrInMaps(*MI); + } + + // Perform splitting after instruction scan to simplify iteration. + if (!SplitPoints.empty()) { + MachineBasicBlock *BB = &MBB; + for (MachineInstr *MI : SplitPoints) { + BB = splitBlock(BB, MI); + } + } +} + +unsigned SIWholeQuadMode::findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before) { + assert(BI.LiveMaskIn); + if (BI.LiveMaskIn == BI.LiveMaskOut) + return BI.LiveMaskIn; + + // FIXME: make this more efficient than scanning all instructions in a block + unsigned LiveMaskReg = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + + while ((II != IE) && (II != Before)) { + MachineInstr *I = &*II; + if (LiveMaskRegs.count(I)) + LiveMaskReg = LiveMaskRegs[I]; + II++; + } + + assert(LiveMaskReg); + return LiveMaskReg; +} + +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + BlockInfo &BI = BII->second; + // This is a non-entry block that is WQM throughout, so no need to do // anything. 
- if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; + } LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); @@ -723,6 +1134,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, // FirstWQM since if it's safe to switch to/from WWM, it must be safe to // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + + // Record initial state is block information. + BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -786,11 +1201,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, MachineBasicBlock::iterator Before = prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + Needs == StateExact || WQMFromExec, + Needs == StateExact && isEntry); if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); State = NonWWMState; } @@ -804,7 +1220,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(BoolRC); - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg, findLiveMaskReg(MBB, BI, Before)); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -833,11 +1249,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (II == IE) break; + II = Next; } } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +bool SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + bool Changed = false; for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -847,7 +1265,19 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +bool SIWholeQuadMode::lowerDemoteInstrs() { + bool Changed = false; + for (MachineInstr *MI : DemoteInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + splitBlock(MBB, MI); + Changed = true; } + return Changed; } void SIWholeQuadMode::lowerCopyInstrs() { @@ -878,6 +1308,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Blocks.clear(); LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); + DemoteInstrs.clear(); + LiveMaskRegs.clear(); + StateTransition.clear(); + CallingConv = MF.getFunction().getCallingConv(); ST = &MF.getSubtarget(); @@ -887,38 +1321,52 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); LIS = &getAnalysis(); - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = + !(DemoteInstrs.empty() && LiveMaskQueries.empty()); + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned LiveMaskReg = Exec; + + if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty()) { + // Shader only needs Exact mode + const bool LoweredQueries = lowerLiveMaskQueries(LiveMaskReg); + const bool LoweredDemotes = lowerDemoteInstrs(); + return LoweredQueries || LoweredDemotes; + } - lowerLiveMaskQueries(LiveMaskReg); + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); + + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + } + + if ((GlobalFlags == StateWQM) && DemoteInstrs.empty()) { + // Shader only needs WQM + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); + return true; + } - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + BlockInfo &BI = Blocks[&Entry]; + BI.LiveMaskIn = LiveMaskReg; + analyzeLiveLanes(MF); + } else { + for (auto BII : Blocks) { + BlockInfo &BI = Blocks[&*BII.first]; + BI.LiveMaskIn = LiveMaskReg; + BI.LiveMaskOut = LiveMaskReg; } } @@ -926,14 +1374,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + for (auto BII : Blocks) { + processBlock(*BII.first, BII.first == &Entry); + } - // Physical registers like SCC aren't tracked by default anyway, so just - // removing the ranges we computed is the simplest option for maintaining - // the analysis results. - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + // Lowering blocks causes block splitting. + // Hence live ranges and slot indexes cease to be valid here. 
+ for (auto BII : Blocks) { + lowerBlock(*BII.first); + } + } else { + lowerLiveMaskQueries(LiveMaskReg); + lowerDemoteInstrs(); + } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 index 00000000000000..1a4bde2c9f31e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,271 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,GCN-64 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GCN-64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-64 %s + +; GCN-LABEL: {{^}}static_exact: +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}dynamic_exact: +; GCN-32: v_cmp_le_f32_e64 [[CND:s[0-9]+]], 0, v1 +; GCN-64: v_cmp_le_f32_e64 [[CND:s\[[0-9]+:[0-9]+\]]], 0, v1 +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[CND]] +; GCN-64: s_and_b64 exec, exec, [[CND]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}branch: +; GCN-32: s_and_saveexec_b32 s1, s0 +; GCN-64: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-32: s_xor_b32 s0, exec_lo, s1 +; GCN-64: s_xor_b64 s[0:1], exec, s[2:3] +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN-32: s_or_b32 exec_lo, exec_lo, s0 +; GCN-64: s_or_b64 exec, exec, s[0:1] +; GCN: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @branch(float %arg0, float %arg1) { +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +; GCN-LABEL: {{^}}wqm_demote_1: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 
[[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue +; GCN: image_sample +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_2: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +; GCN-LABEL: {{^}}wqm_deriv: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue0 +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], 1.0, 0, [[LIVE]] +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32-NEXT: s_mov_b32 exec_lo, 0 +; GCN-64-NEXT: s_mov_b64 exec, 0 +; GCN: ; %.continue1 +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 
%index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.ps.live() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +; GCN-LABEL: {{^}}wqm_deriv_loop: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN: ; %.continue0.preheader +; GCN: ; %.continue0 +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:v[0-9]+]], 0, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32: s_andn2_b32 [[LIVE]], [[LIVE]], exec +; GCN-64: s_andn2_b64 [[LIVE]], [[LIVE]], exec +; GCN: ; %.return +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.ps.live() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 
immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.ps.live() #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone }
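
A minimal usage sketch (not part of the patch), showing how a frontend might emit the new intrinsic: lanes failing a test are demoted rather than killed, so they stop contributing results but remain available as helpers for the derivatives of a later image sample. The function name @demote_then_sample and its exact argument layout are illustrative assumptions; the intrinsic signatures mirror the test file above.

; Lanes where %keep is false are demoted to helper lanes but keep executing,
; so the image_sample below still sees a full quad for derivative computation.
; Function name and arguments are hypothetical, for illustration only.
define amdgpu_ps void @demote_then_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %alpha, float %coord) {
.entry:
  %keep = fcmp oge float %alpha, 5.000000e-01
  call void @llvm.amdgcn.wqm.demote(i1 %keep)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
  %r = extractelement <4 x float> %tex, i32 0
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.wqm.demote(i1)
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)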