diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bc02d22c2f74f1..8b27ae1dfa700d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1276,10 +1276,7 @@ def int_amdgcn_interp_p2_f16 :
 
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
-def int_amdgcn_ps_live : Intrinsic <
-  [llvm_i1_ty],
-  [],
-  [IntrNoMem]>;
+def int_amdgcn_ps_live : Intrinsic <[llvm_i1_ty], [], []>;
 
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
@@ -1493,6 +1490,9 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
 // If false, set EXEC=0 for the current thread until the end of program.
 def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
 
+// If false, mark all active lanes as helper lanes until the end of program.
+def int_amdgcn_wqm_demote : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 87e63fcc4a04fa..8c85309a3bd0c8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -270,6 +270,7 @@ void SIInsertSkips::kill(MachineInstr &MI) {
     }
     break;
   }
+  case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
   case AMDGPU::SI_KILL_I1_TERMINATOR: {
     const MachineFunction *MF = MI.getParent()->getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -486,10 +487,12 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
 
       case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       case AMDGPU::SI_KILL_I1_TERMINATOR:
+      case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
         MadeChange = true;
         kill(MI);
 
-        if (ExecBranchStack.empty()) {
+        if (ExecBranchStack.empty() &&
+            MI.getOpcode() != AMDGPU::SI_DEMOTE_I1_TERMINATOR) {
           if (NextBB != BE && skipIfDead(MI, *NextBB)) {
             HaveSkipBlock = true;
             NextBB = std::next(BI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a802c816847452..3ef6a13750cde0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1426,6 +1426,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
     break;
 
+  case AMDGPU::S_AND_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B64));
+    break;
+
+  case AMDGPU::S_AND_B32_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B32));
+    break;
+
   case AMDGPU::V_MOV_B64_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1907,15 +1919,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     case AMDGPU::S_MOV_B64_term:
     case AMDGPU::S_XOR_B64_term:
     case AMDGPU::S_ANDN2_B64_term:
+    case AMDGPU::S_AND_B64_term:
     case AMDGPU::S_MOV_B32_term:
     case AMDGPU::S_XOR_B32_term:
     case AMDGPU::S_OR_B32_term:
     case AMDGPU::S_ANDN2_B32_term:
+    case AMDGPU::S_AND_B32_term:
       break;
     case AMDGPU::SI_IF:
     case AMDGPU::SI_ELSE:
     case AMDGPU::SI_KILL_I1_TERMINATOR:
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+    case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
       // FIXME: It's messy that these need to be considered here at all.
       return true;
     default:
@@ -6129,6 +6144,7 @@ bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
   switch (Opcode) {
   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
   case AMDGPU::SI_KILL_I1_TERMINATOR:
+  case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5f0cbc3310d672..16c8dd6cca1361 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -195,6 +195,7 @@ let WaveSizePredicate = isWave64 in {
 def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
 }
 
 let WaveSizePredicate = isWave32 in {
@@ -202,6 +203,7 @@ def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
 def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
 }
 
 def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
@@ -323,12 +325,24 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
   let isBranch = 1;
 }
 
+let Uses = [EXEC] in {
+
 def SI_PS_LIVE : PseudoInstSI <
   (outs SReg_1:$dst), (ins),
   [(set i1:$dst, (int_amdgcn_ps_live))]> {
   let SALU = 1;
 }
 
+let Defs = [EXEC] in {
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
+}
+def SI_DEMOTE_I1_TERMINATOR : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
+  let isTerminator = 1;
+}
+} // End Defs = [EXEC]
+
+} // End Uses = [EXEC]
+
 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   [(int_amdgcn_unreachable)],
   "; divergent unreachable"> {
@@ -652,6 +666,16 @@ def : Pat <
   (SI_KILL_I1_PSEUDO $src, -1)
 >;
 
+def : Pat <
+  (int_amdgcn_wqm_demote i1:$src),
+  (SI_DEMOTE_I1 $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+  (SI_DEMOTE_I1 $src, -1)
+>;
+
 def : Pat <
   (AMDGPUkill i32:$src),
   (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index cc9b46a755823b..be45c44fe5b785 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -214,6 +214,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
     MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
     return true;
   }
+  case AMDGPU::S_AND_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+    return true;
+  }
+  case AMDGPU::S_AND_B32_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+    return true;
+  }
   default:
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index cb4cf68d709ad6..7b452ba447059f 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -132,6 +132,9 @@ struct BlockInfo {
   char Needs = 0;
   char InNeeds = 0;
   char OutNeeds = 0;
+  char InitialState = 0;
+  unsigned LiveMaskIn = 0; // Initial live mask register
+  unsigned LiveMaskOut = 0; // Outgoing live mask register
 };
 
 struct WorkItem {
@@ -154,8 +157,15 @@ class SIWholeQuadMode : public MachineFunctionPass {
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+  // Tracks live mask output of instructions
+  DenseMap<const MachineInstr *, unsigned> LiveMaskRegs;
+  // Tracks state (WQM/WWM/Exact) after a given instruction
+  DenseMap<const MachineInstr *, char> StateTransition;
+
+  SmallVector<MachineInstr *, 2> LiveMaskQueries;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+  SmallVector<MachineInstr *, 4> DemoteInstrs;
 
   void printInfo();
 
@@ -168,6 +178,10 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
 
+  void scanLiveLanes(MachineBasicBlock &MBB,
+                     std::vector<MachineBasicBlock *> &Worklist);
+  void analyzeLiveLanes(MachineFunction &MF);
+
   bool requiresCorrectState(const MachineInstr &MI) const;
 
   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
@@ -175,7 +189,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   MachineBasicBlock::iterator
   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                    MachineBasicBlock::iterator Last, bool PreferLast,
-                   bool SaveSCC);
+                   bool SaveSCC, bool CheckPhys);
   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                unsigned SaveWQM, unsigned LiveMaskReg);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
@@ -183,11 +197,28 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              unsigned SaveOrig);
   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-               unsigned SavedOrig);
-  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+               unsigned SavedOrig, char NonWWMState);
+
+  bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI);
+  MachineBasicBlock *splitBlock(MachineBasicBlock *BB,
+                                MachineInstr *TermMI);
+  void lowerBlock(MachineBasicBlock &MBB);
+
+  unsigned findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI,
+                           MachineBasicBlock::iterator &Before);
+  void processBlock(MachineBasicBlock &MBB, bool isEntry);
 
-  void lowerLiveMaskQueries(unsigned LiveMaskReg);
+  bool lowerLiveMaskQueries(unsigned LiveMaskReg);
   void lowerCopyInstrs();
+  bool lowerDemoteInstrs();
+
+  void lowerLiveMaskQuery(MachineBasicBlock &MBB,
+                          MachineInstr &MI,
+                          unsigned LiveMaskReg,
+                          bool isWQM);
+  bool lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI,
+                   unsigned LiveMaskIn, unsigned LiveMaskOut,
+                   bool isWQM);
 
 public:
   static char ID;
@@ -201,9 +232,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -382,6 +410,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
+        } else if (Opcode == AMDGPU::SI_DEMOTE_I1) {
+          DemoteInstrs.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
           // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -523,6 +553,115 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   return GlobalFlags;
 }
 
+// Trace live mask manipulation through a block, creating new virtual
+// registers.  Additionally insert a PHI node when the block has multiple
+// predecessors with differing live mask outputs.
+void SIWholeQuadMode::scanLiveLanes(MachineBasicBlock &MBB,
+                                    std::vector<MachineBasicBlock *> &Worklist) {
+  BlockInfo &BI = Blocks[&MBB];
+
+  if (BI.LiveMaskIn && BI.LiveMaskOut)
+    return; // Block has been fully traced already.
+
+  if (!BI.LiveMaskIn) {
+    // Find the incoming live mask, or insert PHI if there are multiple.
+    unsigned LastPredReg = 0;
+    unsigned Count = 0;
+    bool Valid = true;
+
+    // Collect predecessor live masks; only consecutive duplicates are merged.
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      BlockInfo &PredBI = Blocks[Pred];
+      if (!PredBI.LiveMaskOut) {
+        Valid = false; // This predecessor has no live mask output yet.
+        break;
+      }
+      if (PredBI.LiveMaskOut != LastPredReg) {
+        LastPredReg = PredBI.LiveMaskOut;
+        Count++;
+      }
+    }
+
+    if (Valid) {
+      // All predecessors have live mask outputs.
+      if (Count > 1) {
+        BI.LiveMaskIn = MRI->createVirtualRegister(TRI->getBoolRC());
+        MachineInstrBuilder PHI = BuildMI(MBB, MBB.begin(), DebugLoc(),
+                                    TII->get(TargetOpcode::PHI),
+                                    BI.LiveMaskIn);
+        for (MachineBasicBlock *Pred : MBB.predecessors()) {
+          BlockInfo &PredBI = Blocks[Pred];
+          PHI.addReg(PredBI.LiveMaskOut);
+          PHI.addMBB(Pred);
+        }
+        LIS->InsertMachineInstrInMaps(*PHI);
+      } else {
+        BI.LiveMaskIn = LastPredReg;
+      }
+    } else {
+      // Not all predecessor blocks have live mask outputs,
+      // so this block will need to be revisited.
+
+      if (!BI.LiveMaskOut) {
+        // Give this block a live mask output to ensure forward progress.
+        BI.LiveMaskOut = MRI->createVirtualRegister(TRI->getBoolRC());
+      }
+
+      // Queue this block to be revisited and visit predecessors.
+      Worklist.push_back(&MBB);
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        BlockInfo &PredBI = Blocks[Pred];
+        if (!PredBI.LiveMaskOut)
+          Worklist.push_back(Pred);
+      }
+      return;
+    }
+  }
+
+  assert(BI.LiveMaskIn); // Zero here means no incoming mask was found.
+
+  // Now that the initial live mask register is known the block can
+  // be traced and intermediate live mask registers assigned for instructions
+  // which manipulate the mask.
+  unsigned CurrentLive = BI.LiveMaskIn;
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  while (II != IE) {
+    MachineInstr &MI = *II;
+    if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+      unsigned NewLive = MRI->createVirtualRegister(TRI->getBoolRC());
+      LiveMaskRegs[&MI] = NewLive;
+      CurrentLive = NewLive;
+    }
+    II++;
+  }
+
+  // If an output register was assigned to guarantee forward progress
+  // then it is possible the current live register will not become the output
+  // live mask register.  This will be resolved during block lowering.
+  if (!BI.LiveMaskOut) {
+    BI.LiveMaskOut = CurrentLive;
+  }
+}
+
+// Assign live mask registers to every block.  Blocks are first visited in
+// reverse post order to minimise rescans: a block has to be rescanned if a
+// predecessor's live mask output is not yet defined when it is visited.
+void SIWholeQuadMode::analyzeLiveLanes(MachineFunction &MF) {
+  std::vector<MachineBasicBlock *> Pending;
+
+  // Initial sweep over the function in reverse post order.
+  ReversePostOrderTraversal<MachineFunction *> Order(&MF);
+  for (MachineBasicBlock *Block : Order)
+    scanLiveLanes(*Block, Pending);
+
+  // Retry deferred blocks (most recently queued first) until none remain.
+  while (!Pending.empty()) {
+    MachineBasicBlock *Block = Pending.back();
+    Pending.pop_back();
+    scanLiveLanes(*Block, Pending);
+  }
+}
+
 /// Whether \p MI really requires the exec state computed during analysis.
 ///
 /// Scalar instructions must occasionally be marked WQM for correct propagation
@@ -577,7 +716,8 @@ SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
 // instructions we want to add necessarily clobber SCC.
 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
-    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
+    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC,
+    bool CheckPhys) {
   if (!SaveSCC)
     return PreferLast ? Last : First;
 
@@ -610,9 +750,24 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
 
   MachineBasicBlock::iterator MBBI;
 
-  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
+  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) {
     MBBI = MI;
-  else {
+    if (CheckPhys) {
+      // Make sure insertion point is after any COPY instructions
+      // accessing physical live in registers.  This ensures that
+      // block splitting does not occur before all live ins have been copied.
+      while (MBBI != Last) {
+        if (MBBI->getOpcode() != AMDGPU::COPY)
+          break;
+        unsigned Src = MBBI->getOperand(1).getReg();
+        if (!Register::isVirtualRegister(Src) && MBB.isLiveIn(Src)) {
+          MBBI++;
+        } else {
+          break;
+        }
+      }
+    }
+  } else {
     assert(Idx == LIS->getMBBEndIdx(&MBB));
     MBBI = MBB.end();
   }
@@ -643,6 +798,7 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
   }
 
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = StateExact;
 }
 
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
@@ -662,6 +818,7 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
   }
 
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = StateWQM;
 }
 
 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
@@ -673,11 +830,13 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
            .addImm(-1);
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = StateWWM;
 }
 
 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
-                              unsigned SavedOrig) {
+                              unsigned SavedOrig,
+                              char NonWWMState) {
   MachineInstr *MI;
 
   assert(SavedOrig);
@@ -685,20 +844,272 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
            .addReg(SavedOrig);
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = NonWWMState;
+}
+
+// Replace SI_PS_LIVE with a COPY of \p LiveMaskReg; \p isWQM is unused.
+void SIWholeQuadMode::lowerLiveMaskQuery(MachineBasicBlock &MBB,
+                                         MachineInstr &MI,
+                                         unsigned LiveMaskReg, bool isWQM) {
+  const DebugLoc &DL = MI.getDebugLoc();
+  Register Dest = MI.getOperand(0).getReg();
+  MachineInstr *Copy = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dest)
+                           .addReg(LiveMaskReg);
+  LIS->ReplaceMachineInstrInMaps(MI, *Copy);
+  // Erase (not merely unlink) the query, matching lowerLiveMaskQueries().
+  MI.eraseFromParent();
+}
+
+// Lower an instruction which demotes lanes to helpers by adding
+// appropriate live mask manipulation.  Note this is also applied to kills.
+// LiveMaskOut is always defined, even for a static no-op demote, because
+// later live mask users read it.  Returns true if MBB must be split at MI.
+bool SIWholeQuadMode::lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI,
+                                  unsigned LiveMaskIn, unsigned LiveMaskOut,
+                                  bool isWQM) {
+  const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  const unsigned AndN2 =
+    ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+  const unsigned And =
+    ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstr *NewMI = nullptr;
+  bool NeedSplit = false;
+
+  const MachineOperand &Op = MI.getOperand(0);
+  int64_t KillVal = MI.getOperand(1).getImm();
+  if (Op.isImm()) {
+    if (Op.getImm() == KillVal) {
+      // Static: every currently active lane is removed from the live mask.
+      NewMI = BuildMI(MBB, MI, DL, TII->get(AndN2), LiveMaskOut)
+        .addReg(LiveMaskIn)
+        .addReg(Exec);
+    } else {
+      // Static: no lane is demoted, so the live mask is carried over as is.
+      NewMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), LiveMaskOut)
+        .addReg(LiveMaskIn);
+    }
+  } else {
+    // Dynamic: lanes whose condition bit equals KillVal leave the live mask.
+    const unsigned Opcode = KillVal ? AndN2 : And;
+    NewMI = BuildMI(MBB, MI, DL, TII->get(Opcode), LiveMaskOut)
+      .addReg(LiveMaskIn)
+      .add(Op);
+  }
+
+  if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+    if (isWQM) {
+      // Inside WQM demotes are replaced with live mask manipulation
+      LIS->RemoveMachineInstrFromMaps(MI);
+      MBB.remove(&MI);
+    } else {
+      // Outside WQM demotes become kills terminating the block
+      NeedSplit = true;
+    }
+  }
+
+  LIS->InsertMachineInstrInMaps(*NewMI);
+  return NeedSplit;
+}
+
+bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB,
+                                      MachineInstr *MI) {
+  // Values are held in physical registers right before the epilog,
+  // so a split there is never allowed.
+  if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+    return false;
+
+  // Walk the block up to MI, tracking whether we are between a waterfall
+  // begin/end pair; splitting inside such a pair is not allowed.
+  bool InsidePair = false;
+  for (MachineBasicBlock::iterator Cur = BB->getFirstNonPHI(),
+                                   End = BB->end();
+       Cur != End; ++Cur) {
+    if (&*Cur == MI)
+      return !InsidePair;
+
+    switch (Cur->getOpcode()) {
+    case AMDGPU::SI_WATERFALL_BEGIN_V1:
+    case AMDGPU::SI_WATERFALL_BEGIN_V2:
+    case AMDGPU::SI_WATERFALL_BEGIN_V4:
+    case AMDGPU::SI_WATERFALL_BEGIN_V8:
+      InsidePair = true;
+      break;
+    case AMDGPU::SI_WATERFALL_END_V1:
+    case AMDGPU::SI_WATERFALL_END_V2:
+    case AMDGPU::SI_WATERFALL_END_V4:
+    case AMDGPU::SI_WATERFALL_END_V8:
+      InsidePair = false;
+      break;
+    default:
+      break;
+    }
+  }
+
+  // MI was not found after the PHIs of BB; nothing forbids the split.
+  return true;
+}
 
-void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
-                                   bool isEntry) {
+MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
+                                               MachineInstr *TermMI) {
+  MachineBasicBlock::iterator SplitPoint(TermMI);
+  SplitPoint++; // Instructions after TermMI move to the new block.
+
+  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB)
+                    << " @ " << *TermMI << "\n");
+
+  MachineBasicBlock *SplitBB = nullptr; // Remains null if BB is not split.
+
+  // Only split the block if the split point is not already at the
+  // first terminator (or the end) of the block.
+  if (SplitPoint != BB->getFirstTerminator()) {
+    MachineFunction *MF = BB->getParent();
+    SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+    MachineFunction::iterator MBBI(BB);
+    ++MBBI;
+    MF->insert(MBBI, SplitBB);
+
+    SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+    SplitBB->transferSuccessorsAndUpdatePHIs(BB);
+    BB->addSuccessor(SplitBB);
+  }
+
+  // Convert the last instruction into its terminator form.
+  // Note: only the expected patterns are covered; see the default case.
+  switch (TermMI->getOpcode()) {
+  case AMDGPU::S_AND_B32:
+    TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term));
+    break;
+  case AMDGPU::S_AND_B64:
+    TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term));
+    break;
+  case AMDGPU::SI_DEMOTE_I1:
+    TermMI->setDesc(TII->get(AMDGPU::SI_DEMOTE_I1_TERMINATOR));
+    break;
+  case AMDGPU::SI_DEMOTE_I1_TERMINATOR:
+    break;
+  default:
+    if (BB->getFirstTerminator() == BB->end()) { // BB has no terminator.
+      assert(SplitBB != nullptr);
+      BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
+        .addMBB(SplitBB);
+    }
+    break;
+  }
+
+  return SplitBB; // nullptr when no new block was created.
+}
+
+// Replace (or supplement) instructions accessing live mask.
+// This can only happen once all the live mask registers have been created
+// and the execute state (WQM/WWM/Exact) of instructions is known.
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   auto BII = Blocks.find(&MBB);
   if (BII == Blocks.end())
     return;
 
+  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB)
+                    << ":\n");
+
   const BlockInfo &BI = BII->second;
 
+  SmallVector<MachineInstr *, 4> SplitPoints;
+  unsigned LiveMaskReg = BI.LiveMaskIn;
+  char State = BI.InitialState;
+
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  while (II != IE) {
+    auto Next = std::next(II);
+    MachineInstr &MI = *II;
+
+    if (StateTransition.count(&MI)) {
+      // Mark transitions to Exact mode as split points so they become
+      // block terminators.
+      if (State != StateTransition[&MI] && StateTransition[&MI] == StateExact) {
+        if (State != StateWWM && canSplitBlockAt(&MBB, &MI))
+          SplitPoints.push_back(&MI);
+      }
+      State = StateTransition[&MI];
+    }
+
+    switch (MI.getOpcode()) {
+    case AMDGPU::SI_PS_LIVE:
+      lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM);
+      break;
+    case AMDGPU::SI_DEMOTE_I1: {
+      bool NeedSplit = lowerDemote(MBB, MI, LiveMaskReg,
+                                   LiveMaskRegs[&MI],
+                                   State == StateWQM);
+      if (NeedSplit)
+        SplitPoints.push_back(&MI);
+      break;
+    }
+    default:
+      break;
+    }
+
+    if (LiveMaskRegs.count(&MI))
+      LiveMaskReg = LiveMaskRegs[&MI];
+
+    II = Next;
+  }
+
+  if (BI.LiveMaskOut != LiveMaskReg) {
+    // If the final live mask register does not match the expected
+    // register of successor blocks then insert a copy.
+    MachineBasicBlock::instr_iterator Terminator =
+      MBB.getFirstInstrTerminator();
+    MachineInstr *MI = BuildMI(MBB, Terminator, DebugLoc(),
+                               TII->get(AMDGPU::COPY), BI.LiveMaskOut)
+      .addReg(LiveMaskReg);
+    LIS->InsertMachineInstrInMaps(*MI);
+  }
+
+  // Perform splitting after the instruction scan to simplify iteration.
+  // splitBlock() returns nullptr when no new block is created, so look up
+  // each split point's current parent instead of chaining that result
+  // into the next call (which would dereference a null block).
+  for (MachineInstr *MI : SplitPoints) {
+    splitBlock(MI->getParent(), MI);
+  }
+}
+
+unsigned SIWholeQuadMode::findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI,
+                                          MachineBasicBlock::iterator &Before) {
+  assert(BI.LiveMaskIn);
+  // Same register on entry and exit: return it without scanning the block.
+  if (BI.LiveMaskOut == BI.LiveMaskIn)
+    return BI.LiveMaskIn;
+
+  // FIXME: make this more efficient than scanning all instructions in a block
+  // Replay the per-instruction live mask registers that precede Before.
+  unsigned LiveMaskReg = BI.LiveMaskIn;
+  for (auto Cur = MBB.getFirstNonPHI(), End = MBB.end();
+       Cur != End && Cur != Before; ++Cur) {
+    auto Found = LiveMaskRegs.find(&*Cur);
+    if (Found != LiveMaskRegs.end())
+      LiveMaskReg = Found->second;
+  }
+
+  assert(LiveMaskReg);
+  return LiveMaskReg;
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) {
+  auto BII = Blocks.find(&MBB);
+  if (BII == Blocks.end())
+    return;
+
+  BlockInfo &BI = BII->second;
+
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
+  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
+    BI.InitialState = StateWQM;
     return;
+  }
 
   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                     << ":\n");
@@ -723,6 +1134,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
   // switch to/from WQM as well.
   MachineBasicBlock::iterator FirstWWM = IE;
+
+  // Record the initial state in the block information.
+  BI.InitialState = State;
+
   for (;;) {
     MachineBasicBlock::iterator Next = II;
     char Needs = StateExact | StateWQM; // WWM is disabled by default
@@ -786,11 +1201,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
 
       MachineBasicBlock::iterator Before =
           prepareInsertion(MBB, First, II, Needs == StateWQM,
-                           Needs == StateExact || WQMFromExec);
+                           Needs == StateExact || WQMFromExec,
+                           Needs == StateExact && isEntry);
 
       if (State == StateWWM) {
         assert(SavedNonWWMReg);
-        fromWWM(MBB, Before, SavedNonWWMReg);
+        fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState);
         State = NonWWMState;
       }
 
@@ -804,7 +1220,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
           if (!WQMFromExec && (OutNeeds & StateWQM))
             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
 
-          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+          toExact(MBB, Before, SavedWQMReg, findLiveMaskReg(MBB, BI, Before));
           State = StateExact;
         } else if (State == StateExact && (Needs & StateWQM) &&
                    !(Needs & StateExact)) {
@@ -833,11 +1249,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
 
     if (II == IE)
       break;
+
     II = Next;
   }
 }
 
-void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+bool SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+  bool Changed = false;
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     Register Dest = MI->getOperand(0).getReg();
@@ -847,7 +1265,19 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
 
     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
     MI->eraseFromParent();
+    Changed = true;
+  }
+  return Changed;
+}
+
+bool SIWholeQuadMode::lowerDemoteInstrs() {
+  bool Changed = false; // Becomes true once any demote has been processed.
+  for (MachineInstr *MI : DemoteInstrs) {
+    MachineBasicBlock *MBB = MI->getParent();
+    splitBlock(MBB, MI); // Also turns MI into SI_DEMOTE_I1_TERMINATOR.
+    Changed = true;
   }
+  return Changed;
 }
 
 void SIWholeQuadMode::lowerCopyInstrs() {
@@ -878,6 +1308,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
+  DemoteInstrs.clear();
+  LiveMaskRegs.clear();
+  StateTransition.clear();
+
   CallingConv = MF.getFunction().getCallingConv();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -887,38 +1321,52 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervals>();
 
-  char GlobalFlags = analyzeFunction(MF);
-  unsigned LiveMaskReg = 0;
-  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  if (!(GlobalFlags & StateWQM)) {
-    lowerLiveMaskQueries(Exec);
-    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
-      return !LiveMaskQueries.empty();
-  } else {
-    // Store a copy of the original live mask when required
-    MachineBasicBlock &Entry = MF.front();
-    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
-
-    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
-      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
-      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
-                                 TII->get(AMDGPU::COPY), LiveMaskReg)
-                             .addReg(Exec);
-      LIS->InsertMachineInstrInMaps(*MI);
-    }
+  const char GlobalFlags = analyzeFunction(MF);
+  const bool NeedsLiveMask =
+    !(DemoteInstrs.empty() && LiveMaskQueries.empty());
+  const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  unsigned LiveMaskReg = Exec;
+
+  if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty()) {
+    // Shader only needs Exact mode
+    const bool LoweredQueries = lowerLiveMaskQueries(LiveMaskReg);
+    const bool LoweredDemotes = lowerDemoteInstrs();
+    return LoweredQueries || LoweredDemotes;
+  }
 
-    lowerLiveMaskQueries(LiveMaskReg);
+  MachineBasicBlock &Entry = MF.front();
+  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+  // Store a copy of the original live mask when required
+  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
+    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
+    MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                               TII->get(AMDGPU::COPY), LiveMaskReg)
+                           .addReg(Exec);
+    LIS->InsertMachineInstrInMaps(*MI);
+  }
+
+  if ((GlobalFlags == StateWQM) && DemoteInstrs.empty()) {
+    // Shader only needs WQM
+    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
+              AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
+            Exec)
+        .addReg(Exec);
 
-    if (GlobalFlags == StateWQM) {
-      // For a shader that needs only WQM, we can just set it once.
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
-                AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
-              Exec)
-          .addReg(Exec);
+    lowerLiveMaskQueries(LiveMaskReg);
+    lowerCopyInstrs();
+    return true;
+  }
 
-      lowerCopyInstrs();
-      // EntryMI may become invalid here
-      return true;
+  if (NeedsLiveMask && (GlobalFlags & StateWQM)) {
+    BlockInfo &BI = Blocks[&Entry];
+    BI.LiveMaskIn = LiveMaskReg;
+    analyzeLiveLanes(MF);
+  } else {
+    for (auto BII : Blocks) {
+      BlockInfo &BI = Blocks[&*BII.first];
+      BI.LiveMaskIn = LiveMaskReg;
+      BI.LiveMaskOut = LiveMaskReg;
     }
   }
 
@@ -926,14 +1374,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 
   lowerCopyInstrs();
 
-  // Handle the general case
-  for (auto BII : Blocks)
-    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  for (auto BII : Blocks) {
+    processBlock(*BII.first, BII.first == &Entry);
+  }
 
-  // Physical registers like SCC aren't tracked by default anyway, so just
-  // removing the ranges we computed is the simplest option for maintaining
-  // the analysis results.
-  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+  if (NeedsLiveMask && (GlobalFlags & StateWQM)) {
+    // Lowering blocks causes block splitting.
+    // Hence live ranges and slot indexes cease to be valid here.
+    for (auto BII : Blocks) {
+      lowerBlock(*BII.first);
+    }
+  } else {
+    lowerLiveMaskQueries(LiveMaskReg);
+    lowerDemoteInstrs();
+  }
 
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
new file mode 100644
index 00000000000000..1a4bde2c9f31e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -0,0 +1,271 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,GCN-64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GCN-64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-64 %s
+
+; GCN-LABEL: {{^}}static_exact:
+; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0
+; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0
+; GCN-32: s_mov_b32 exec_lo, 0
+; GCN-64: s_mov_b64 exec, 0
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+; GCN: exp mrt1 v0, v0, v0, v0 done vm
+define amdgpu_ps void @static_exact(float %arg0, float %arg1) { ; demote with a constant-false condition
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0 ; result is not used in this function
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; always-false condition, the checks above expect exec to be zeroed
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_exact:
+; GCN-32: v_cmp_le_f32_e64 [[CND:s[0-9]+]], 0, v1
+; GCN-64: v_cmp_le_f32_e64 [[CND:s\[[0-9]+:[0-9]+\]]], 0, v1
+; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0
+; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0
+; GCN-32: s_and_b32 exec_lo, exec_lo, [[CND]]
+; GCN-64: s_and_b64 exec, exec, [[CND]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+; GCN: exp mrt1 v0, v0, v0, v0 done vm
+define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { ; demote with a per-lane condition
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0 ; demote condition, computed from %arg1
+  call void @llvm.amdgcn.wqm.demote(i1 %c1) ; the checks above expect exec to be ANDed with the compare result
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}branch:
+; GCN-32: s_and_saveexec_b32 s1, s0
+; GCN-64: s_and_saveexec_b64 s[2:3], s[0:1]
+; GCN-32: s_xor_b32 s0, exec_lo, s1
+; GCN-64: s_xor_b64 s[0:1], exec, s[2:3]
+; GCN-32: s_mov_b32 exec_lo, 0
+; GCN-64: s_mov_b64 exec, 0
+; GCN-32: s_or_b32 exec_lo, exec_lo, s0
+; GCN-64: s_or_b64 exec, exec, s[0:1]
+; GCN: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GCN: exp mrt1 v0, v0, v0, v0 done vm
+define amdgpu_ps void @branch(float %arg0, float %arg1) { ; constant-false demote placed inside a conditional block
+.entry:
+  %i0 = fptosi float %arg0 to i32
+  %i1 = fptosi float %arg1 to i32
+  %c0 = or i32 %i0, %i1
+  %c1 = and i32 %c0, 1
+  %c2 = icmp eq i32 %c1, 0 ; true when the low bit of (%i0 | %i1) is clear
+  br i1 %c2, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; reached only when %c2 is false
+  br label %.continue
+
+.continue:
+  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}wqm_demote_1:
+; GCN-NEXT: ; %.entry
+; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
+; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; GCN-32: s_wqm_b32 exec_lo, exec_lo
+; GCN-64: s_wqm_b64 exec, exec
+; GCN: ; %.demote
+; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo
+; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec
+; GCN: ; %.continue
+; GCN: image_sample
+; GCN: v_add_f32_e32
+; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]]
+; GCN-64: s_and_b64 exec, exec, [[LIVE]]
+; GCN: image_sample
+define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { ; demote in a branch ahead of both image samples
+.entry:
+  %z.cmp = fcmp olt float %z, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; reached only when %z.cmp is false
+  br label %.continue
+
+.continue:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %coord1 = fadd float %tex0, %tex1 ; second sample coordinate depends on the first sample result
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+; GCN-LABEL: {{^}}wqm_demote_2:
+; GCN-NEXT: ; %.entry
+; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
+; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; GCN-32: s_wqm_b32 exec_lo, exec_lo
+; GCN-64: s_wqm_b64 exec, exec
+; GCN: image_sample
+; GCN: ; %.demote
+; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo
+; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec
+; GCN: ; %.continue
+; GCN: v_add_f32_e32
+; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]]
+; GCN-64: s_and_b64 exec, exec, [[LIVE]]
+; GCN: image_sample
+define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { ; demote between the first and second image sample
+.entry:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %z.cmp = fcmp olt float %tex0, 0.0 ; branch condition comes from the first sample result
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; reached only when %z.cmp is false
+  br label %.continue
+
+.continue:
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+
+; GCN-LABEL: {{^}}wqm_deriv:
+; GCN-NEXT: ; %.entry
+; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
+; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; GCN-32: s_wqm_b32 exec_lo, exec_lo
+; GCN-64: s_wqm_b64 exec, exec
+; GCN: ; %.demote0
+; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo
+; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec
+; GCN: ; %.continue0
+; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], 1.0, 0, [[LIVE]]
+; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]]
+; GCN-64: s_and_b64 exec, exec, [[LIVE]]
+; GCN: ; %.demote1
+; GCN-32-NEXT: s_mov_b32 exec_lo, 0
+; GCN-64-NEXT: s_mov_b64 exec, 0
+; GCN: ; %.continue1
+; GCN: exp mrt0
+define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; two demotes with a live query and a WQM value between them
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; first demote, reached only when %cond0 is false
+  br label %.continue0
+
+.continue0:
+  %live = call i1 @llvm.amdgcn.ps.live() ; queried after the first demote, the checks above expect it to read the updated live mask
+  %live.cond = select i1 %live, i32 0, i32 1065353216
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; second demote, the checks above expect exec to be zeroed here
+  br label %.continue1
+
+.continue1:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+; GCN-LABEL: {{^}}wqm_deriv_loop:
+; GCN-NEXT: ; %.entry
+; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
+; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; GCN-32: s_wqm_b32 exec_lo, exec_lo
+; GCN-64: s_wqm_b64 exec, exec
+; GCN: ; %.demote0
+; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo
+; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec
+; GCN: ; %.continue0.preheader
+; GCN: ; %.continue0
+; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:v[0-9]+]], 0, [[LIVE]]
+; GCN: ; %.demote1
+; GCN-32: s_andn2_b32 [[LIVE]], [[LIVE]], exec_lo
+; GCN-64: s_andn2_b64 [[LIVE]], [[LIVE]], exec
+; GCN: ; %.return
+; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]]
+; GCN-64: s_and_b64 exec, exec, [[LIVE]]
+; GCN: exp mrt0
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { ; same shape as wqm_deriv, with the second demote inside a loop
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; demote before the loop, reached only when %cond0 is false
+  br label %.continue0
+
+.continue0:
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
+  %live = call i1 @llvm.amdgcn.ps.live() ; queried on every iteration, after any earlier demote
+  %live.cond = select i1 %live, i32 0, i32 %count
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; demote inside the loop, the checks above expect the live mask register to be updated in place
+  br label %.continue1
+
+.continue1:
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit
+  br i1 %loop.cond, label %.continue0, label %.return
+
+.return:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+declare void @llvm.amdgcn.wqm.demote(i1) #0
+declare i1 @llvm.amdgcn.ps.live() #0 ; not readnone, the intrinsic definition no longer carries IntrNoMem
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare float @llvm.amdgcn.wqm.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind }
+attributes #4 = { convergent nounwind readnone }