Skip to content

Commit

Permalink
Overwritten with: 92b4e09 [AMDGPU] Use waterfall for readlane with no…
Browse files Browse the repository at this point in the history
…n-uniform index

Based on upstream llvm 371856: b4160cb [ADT] Remove a workaround for old versions of clang

Local (non-upstream) changes since 0866736:
92b4e09 [AMDGPU] Use waterfall for readlane with non-uniform index
b9cf832 [AMDGPU] Allow FP inline constant in v_madak_f16 and v_fmaak_f16

Added AMD modification notices and removed non-LLVM directories and some GPL files.
  • Loading branch information
Tim Renouf committed Oct 10, 2019
2 parents 0866736 + 92b4e09 commit 9d4e892
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 19 deletions.
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,9 @@ class SIFixSGPRCopies : public MachineFunctionPass {

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
// FIXME: Temporarily disable these flags as they do not currently hold
//AU.addPreserved<MachineDominatorTree>();
//AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3839,7 +3839,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
if (NeedClampOperand)
I.addImm(0); // clamp bit for e64 encoding

TII->legalizeOperands(*I);
SmallSetVector<MachineInstr *, 32> Worklist;
TII->legalizeOperands(*I, Worklist);

MI.eraseFromParent();
return BB;
Expand Down
143 changes: 129 additions & 14 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2460,7 +2460,8 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// It might happen that UseMI was commuted
// and we now have an SGPR as SRC1. If so, two inlined
// constants and an SGPR are illegal.
legalizeOperands(UseMI);
SmallSetVector<MachineInstr *, 32> Worklist;
legalizeOperands(UseMI, Worklist);

bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
Expand Down Expand Up @@ -3955,9 +3956,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}

void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
MachineInstr &MI,
SetVectorType &Worklist) const {
unsigned Opc = MI.getOpcode();
const MCInstrDesc &InstrDesc = get(Opc);
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();

int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
MachineOperand &Src0 = MI.getOperand(Src0Idx);
Expand Down Expand Up @@ -4007,15 +4011,125 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;

// Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
// lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
// select is uniform.
// lane select.
// Previous implementations assumed that a non-SGPR operand meant that the
// value was uniform across all lanes; this behaviour has been modified to
// use a waterfall operation to process all indices. The worst case is 64
// iterations, but an index that is uniform across all lanes takes only a
// single iteration, so the extra cost is low.
if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
RI.isVGPR(MRI, Src1.getReg())) {
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// Waterfall to read all the values across all lanes
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
MachineOperand &Src0 = MI.getOperand(Src0Idx);

MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
Src1.ChangeToRegister(Reg, false);

unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

// Initialize the register we accumulate the result into
BuildMI(MBB, I, DL, get(AMDGPU::V_MOV_B32_e32), InitReg)
.addImm(0x0);

unsigned DstReg = MI.getOperand(0).getReg();
unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

BuildMI(MBB, I, DL, get(TargetOpcode::IMPLICIT_DEF), TmpExec);

// Save the EXEC mask
BuildMI(MBB, I, DL, get(AMDGPU::S_MOV_B64), SaveExec)
.addReg(AMDGPU::EXEC);

MachineBasicBlock &LoopBB = *MF.CreateMachineBasicBlock();
MachineBasicBlock &RemainderBB = *MF.CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;

MF.insert(MBBI, &LoopBB);
MF.insert(MBBI, &RemainderBB);

LoopBB.addSuccessor(&LoopBB);
LoopBB.addSuccessor(&RemainderBB);

RemainderBB.transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB.splice(RemainderBB.begin(), &MBB, I, MBB.end());

MBB.addSuccessor(&LoopBB);

MachineBasicBlock::iterator J = LoopBB.begin();

unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned CurrentValue = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

BuildMI(LoopBB, J, DL, get(TargetOpcode::PHI), PhiReg)
.addReg(InitReg)
.addMBB(&MBB)
.addReg(NewDst)
.addMBB(&LoopBB);

BuildMI(LoopBB, J, DL, get(TargetOpcode::PHI), PhiExec)
.addReg(TmpExec)
.addMBB(&MBB)
.addReg(NewExec)
.addMBB(&LoopBB);

// Read the next variant <- also loop target.
BuildMI(LoopBB, J, DL, get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
.addReg(Src1.getReg(), getUndefRegState(Src1.isUndef()));

// Compare the just read value to all possible Idx values.
BuildMI(LoopBB, J, DL, get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
.addReg(CurrentIdxReg)
.addReg(Src1.getReg(), 0, Src1.getSubReg());

// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, J, DL, get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
.addReg(CondReg, RegState::Kill);

// TODO: Conditional branch here to loop header as a potential optimization?

// Use readlane to get the value for all lanes with the current index
BuildMI(LoopBB, J, DL, get(AMDGPU::V_READLANE_B32), CurrentValue)
.addReg(Src0.getReg())
.addReg(CurrentIdxReg);

// Merge the value just read into the destination using an OR
// TODO: In theory a mov would do here - but this is tricky to get to work
// correctly as it seems to confuse the register allocator and other passes
BuildMI(LoopBB, J, DL, get(AMDGPU::V_OR_B32_e64), NewDst)
.addReg(PhiReg)
.addReg(CurrentValue);

MRI.setSimpleHint(NewExec, CondReg);

// Update EXEC, switch all done bits to 0 and all todo bits to 1.
BuildMI(LoopBB, J, DL, get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(NewExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?

// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
BuildMI(LoopBB, J, DL, get(AMDGPU::S_CBRANCH_EXECNZ))
.addMBB(&LoopBB);

MachineBasicBlock::iterator First = RemainderBB.begin();
BuildMI(RemainderBB, First, DL, get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addReg(SaveExec);

MRI.replaceRegWith(DstReg, NewDst);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

MI.eraseFromParent();

return;
}

Expand Down Expand Up @@ -4454,13 +4568,14 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
}

void SIInstrInfo::legalizeOperands(MachineInstr &MI,
SetVectorType &Worklist,
MachineDominatorTree *MDT) const {
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();

// Legalize VOP2
if (isVOP2(MI) || isVOPC(MI)) {
legalizeOperandsVOP2(MRI, MI);
legalizeOperandsVOP2(MRI, MI, Worklist);
return;
}

Expand Down Expand Up @@ -4931,7 +5046,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
legalizeOperands(Inst, MDT);
legalizeOperands(Inst, Worklist, MDT);
continue;
}

Expand Down Expand Up @@ -5023,7 +5138,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
}

// Legalize the operands
legalizeOperands(Inst, MDT);
legalizeOperands(Inst, Worklist, MDT);

if (HasDst)
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
Expand Down Expand Up @@ -5057,7 +5172,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
legalizeOperands(Inst, MDT);
legalizeOperands(Inst, Worklist, MDT);

addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return true;
Expand Down Expand Up @@ -5335,8 +5450,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,

// Try to legalize the operands in case we need to swap the order to keep it
// valid.
legalizeOperands(*LoHalf, MDT);
legalizeOperands(*HiHalf, MDT);
legalizeOperands(*LoHalf, Worklist, MDT);
legalizeOperands(*HiHalf, Worklist, MDT);

// Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
// Notified per clause 4(b) of the license.
//
//===----------------------------------------------------------------------===//
//
Expand Down Expand Up @@ -844,7 +846,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
void legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI,
SetVectorType &Worklist) const;

/// Fix operands in \p MI to satisfy constant bus requirements.
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
Expand All @@ -868,6 +872,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// instructions and control-flow around \p MI. If present, \p MDT is
/// updated.
void legalizeOperands(MachineInstr &MI,
SetVectorType &Worklist,
MachineDominatorTree *MDT = nullptr) const;

/// Replace this instruction's opcode with the equivalent VALU
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
// Notified per clause 4(b) of the license.
//
//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -251,7 +253,9 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {

class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
field dag Ins32 = !if(!eq(vt.Size, 32),
(ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
(ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
field bit HasExt = 0;

// Hack to stop printing _e64
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Notified per clause 4(b) of the license.
# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s

# GCN-LABEL: name: test_fmamk_reg_imm_f32
Expand Down Expand Up @@ -181,3 +183,23 @@ body: |
%1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec
...

# GCN-LABEL: name: test_fmaak_inline_literal_f16
# GCN: %2:vgpr_32 = V_FMAAK_F16 16384, killed %0, 49664, implicit $exec

# Check that the two-address-instruction pass converts a V_FMAC_F16_e32 whose
# src0 is the immediate 16384 (0x4000) and whose accumulator was materialized
# from the literal 49664 (0xC200) into a single V_FMAAK_F16, i.e. the FP
# immediate in src0 is accepted as an inline constant rather than blocking the
# fold into the K-form instruction.
# NOTE(review): 0x4000/0xC200 presumably encode f16 2.0 and -3.0 — confirm.
---
name: test_fmaak_inline_literal_f16
liveins:
- { reg: '$vgpr0', virtual-reg: '%3' }
# Input: copy of the live-in VGPR, a V_MOV of the literal, and the FMAC that
# the pass is expected to rewrite into V_FMAAK_F16 (see GCN check above).
body: |
bb.0:
liveins: $vgpr0
%3:vgpr_32 = COPY killed $vgpr0
%26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
%28:vgpr_32 = V_FMAC_F16_e32 16384, killed %3, %26, implicit $exec
S_ENDPGM 0
...

22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Notified per clause 4(b) of the license.
# RUN: llc -march=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s

# GCN-LABEL: name: test_madmk_reg_imm_f32
Expand Down Expand Up @@ -188,3 +190,23 @@ body: |
%1 = V_MAC_F32_e32 %stack.0, %0, %2, implicit $exec
...

# GCN-LABEL: name: test_madak_inline_literal_f16
# GCN: %2:vgpr_32 = V_MADAK_F16 16384, killed %0, 49664, implicit $exec

# Check that the two-address-instruction pass converts a V_MAC_F16_e32 whose
# src0 is the immediate 16384 (0x4000) and whose accumulator was materialized
# from the literal 49664 (0xC200) into a single V_MADAK_F16, i.e. the FP
# immediate in src0 is accepted as an inline constant rather than blocking the
# fold into the K-form instruction.
# NOTE(review): 0x4000/0xC200 presumably encode f16 2.0 and -3.0 — confirm.
---
name: test_madak_inline_literal_f16
liveins:
- { reg: '$vgpr0', virtual-reg: '%3' }
# Input: copy of the live-in VGPR, a V_MOV of the literal, and the MAC that
# the pass is expected to rewrite into V_MADAK_F16 (see GCN check above).
body: |
bb.0:
liveins: $vgpr0
%3:vgpr_32 = COPY killed $vgpr0
%26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
%28:vgpr_32 = V_MAC_F16_e32 16384, killed %3, %26, implicit $exec
S_ENDPGM 0
...

0 comments on commit 9d4e892

Please sign in to comment.