Skip to content

Commit

Permalink
Overwritten with: 92b4e09 [AMDGPU] Use waterfall for readlane with no…
Browse files Browse the repository at this point in the history
…n-uniform index

Based on upstream llvm 371856: b4160cb [ADT] Remove a workaround for old versions of clang

Local (non-upstream) changes since 0866736:
92b4e09 [AMDGPU] Use waterfall for readlane with non-uniform index
b9cf832 [AMDGPU] Allow FP inline constant in v_madak_f16 and v_fmaak_f16

Added AMD modification notices and removed non-LLVM directories and some GPL files.
  • Loading branch information
Tim Renouf committed Oct 10, 2019
2 parents 0866736 + 92b4e09 commit 9d4e892
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 19 deletions.
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,9 @@ class SIFixSGPRCopies : public MachineFunctionPass {

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
// FIXME: Temporarily disable these flags as they do not currently hold
//AU.addPreserved<MachineDominatorTree>();
//AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3839,7 +3839,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
if (NeedClampOperand)
I.addImm(0); // clamp bit for e64 encoding

TII->legalizeOperands(*I);
SmallSetVector<MachineInstr *, 32> Worklist;
TII->legalizeOperands(*I, Worklist);

MI.eraseFromParent();
return BB;
Expand Down
143 changes: 129 additions & 14 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2460,7 +2460,8 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// It might happen that UseMI was commuted
// and we now have an SGPR as SRC1. If so, two inlined
// constants and an SGPR are illegal.
legalizeOperands(UseMI);
SmallSetVector<MachineInstr *, 32> Worklist;
legalizeOperands(UseMI, Worklist);

bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
Expand Down Expand Up @@ -3955,9 +3956,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}

void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
MachineInstr &MI,
SetVectorType &Worklist) const {
unsigned Opc = MI.getOpcode();
const MCInstrDesc &InstrDesc = get(Opc);
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();

int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
MachineOperand &Src0 = MI.getOperand(Src0Idx);
Expand Down Expand Up @@ -4007,15 +4011,125 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;

// Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
// lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
// select is uniform.
// lane select.
// Previous implementations assumed that a non-SGPR operand meant that the
// value was uniform across all lanes; this behaviour has been modified to
// use a waterfall operation to process all indices. The worst case is 64
// iterations, but an index that is uniform across all lanes takes only a
// single iteration, so the extra cost is low.
if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
RI.isVGPR(MRI, Src1.getReg())) {
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// Waterfall to read all the values across all lanes
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
MachineOperand &Src0 = MI.getOperand(Src0Idx);

MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
Src1.ChangeToRegister(Reg, false);

unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

// Initialize the register we accumulate the result into
BuildMI(MBB, I, DL, get(AMDGPU::V_MOV_B32_e32), InitReg)
.addImm(0x0);

unsigned DstReg = MI.getOperand(0).getReg();
unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

BuildMI(MBB, I, DL, get(TargetOpcode::IMPLICIT_DEF), TmpExec);

// Save the EXEC mask
BuildMI(MBB, I, DL, get(AMDGPU::S_MOV_B64), SaveExec)
.addReg(AMDGPU::EXEC);

MachineBasicBlock &LoopBB = *MF.CreateMachineBasicBlock();
MachineBasicBlock &RemainderBB = *MF.CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;

MF.insert(MBBI, &LoopBB);
MF.insert(MBBI, &RemainderBB);

LoopBB.addSuccessor(&LoopBB);
LoopBB.addSuccessor(&RemainderBB);

RemainderBB.transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB.splice(RemainderBB.begin(), &MBB, I, MBB.end());

MBB.addSuccessor(&LoopBB);

MachineBasicBlock::iterator J = LoopBB.begin();

unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned CurrentValue = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

BuildMI(LoopBB, J, DL, get(TargetOpcode::PHI), PhiReg)
.addReg(InitReg)
.addMBB(&MBB)
.addReg(NewDst)
.addMBB(&LoopBB);

BuildMI(LoopBB, J, DL, get(TargetOpcode::PHI), PhiExec)
.addReg(TmpExec)
.addMBB(&MBB)
.addReg(NewExec)
.addMBB(&LoopBB);

// Read the next variant <- also loop target.
BuildMI(LoopBB, J, DL, get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
.addReg(Src1.getReg(), getUndefRegState(Src1.isUndef()));

// Compare the just read value to all possible Idx values.
BuildMI(LoopBB, J, DL, get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
.addReg(CurrentIdxReg)
.addReg(Src1.getReg(), 0, Src1.getSubReg());

// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, J, DL, get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
.addReg(CondReg, RegState::Kill);

// TODO: Conditional branch here to loop header as a potential optimization?

// Use readlane to get the value for all lanes with the current index
BuildMI(LoopBB, J, DL, get(AMDGPU::V_READLANE_B32), CurrentValue)
.addReg(Src0.getReg())
.addReg(CurrentIdxReg);

// Merge the value just read into the destination using an OR
// TODO: In theory a mov would do here - but this is tricky to get to work
// correctly as it seems to confuse the register allocator and other passes
BuildMI(LoopBB, J, DL, get(AMDGPU::V_OR_B32_e64), NewDst)
.addReg(PhiReg)
.addReg(CurrentValue);

MRI.setSimpleHint(NewExec, CondReg);

// Update EXEC, switch all done bits to 0 and all todo bits to 1.
BuildMI(LoopBB, J, DL, get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(NewExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?

// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
BuildMI(LoopBB, J, DL, get(AMDGPU::S_CBRANCH_EXECNZ))
.addMBB(&LoopBB);

MachineBasicBlock::iterator First = RemainderBB.begin();
BuildMI(RemainderBB, First, DL, get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addReg(SaveExec);

MRI.replaceRegWith(DstReg, NewDst);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

MI.eraseFromParent();

return;
}

Expand Down Expand Up @@ -4454,13 +4568,14 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
}

void SIInstrInfo::legalizeOperands(MachineInstr &MI,
SetVectorType &Worklist,
MachineDominatorTree *MDT) const {
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();

// Legalize VOP2
if (isVOP2(MI) || isVOPC(MI)) {
legalizeOperandsVOP2(MRI, MI);
legalizeOperandsVOP2(MRI, MI, Worklist);
return;
}

Expand Down Expand Up @@ -4931,7 +5046,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
legalizeOperands(Inst, MDT);
legalizeOperands(Inst, Worklist, MDT);
continue;
}

Expand Down Expand Up @@ -5023,7 +5138,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
}

// Legalize the operands
legalizeOperands(Inst, MDT);
legalizeOperands(Inst, Worklist, MDT);

if (HasDst)
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
Expand Down Expand Up @@ -5057,7 +5172,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
legalizeOperands(Inst, MDT);
legalizeOperands(Inst, Worklist, MDT);

addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return true;
Expand Down Expand Up @@ -5335,8 +5450,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,

// Try to legalize the operands in case we need to swap the order to keep it
// valid.
legalizeOperands(*LoHalf, MDT);
legalizeOperands(*HiHalf, MDT);
legalizeOperands(*LoHalf, Worklist, MDT);
legalizeOperands(*HiHalf, Worklist, MDT);

// Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
// Notified per clause 4(b) of the license.
//
//===----------------------------------------------------------------------===//
//
Expand Down Expand Up @@ -844,7 +846,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
void legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI,
SetVectorType &Worklist) const;

/// Fix operands in \p MI to satisfy constant bus requirements.
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
Expand All @@ -868,6 +872,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// instructions and control-flow around \p MI. If present, \p MDT is
/// updated.
void legalizeOperands(MachineInstr &MI,
SetVectorType &Worklist,
MachineDominatorTree *MDT = nullptr) const;

/// Replace this instruction's opcode with the equivalent VALU
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
// Notified per clause 4(b) of the license.
//
//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -251,7 +253,9 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {

class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
field dag Ins32 = !if(!eq(vt.Size, 32),
(ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
(ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
field bit HasExt = 0;

// Hack to stop printing _e64
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Notified per clause 4(b) of the license.
# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s

# GCN-LABEL: name: test_fmamk_reg_imm_f32
Expand Down Expand Up @@ -181,3 +183,23 @@ body: |
%1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec
...

# GCN-LABEL: name: test_fmaak_inline_literal_f16
# GCN: %2:vgpr_32 = V_FMAAK_F16 16384, killed %0, 49664, implicit $exec

# Check that the two-address-instruction pass converts a V_FMAC_F16_e32 whose
# src0 is the immediate 16384 (0x4000) and whose accumulator was materialized
# from the literal 49664 (0xC200) into a single V_FMAAK_F16, i.e. the FP
# immediate in src0 is accepted as an inline constant rather than blocking the
# fold into the K-form instruction.
# NOTE(review): 0x4000/0xC200 presumably encode f16 2.0 and -3.0 — confirm.
---
name: test_fmaak_inline_literal_f16
liveins:
- { reg: '$vgpr0', virtual-reg: '%3' }
# Input: copy of the live-in VGPR, a V_MOV of the literal, and the FMAC that
# the pass is expected to rewrite into V_FMAAK_F16 (see GCN check above).
body: |
bb.0:
liveins: $vgpr0
%3:vgpr_32 = COPY killed $vgpr0
%26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
%28:vgpr_32 = V_FMAC_F16_e32 16384, killed %3, %26, implicit $exec
S_ENDPGM 0
...

22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Notified per clause 4(b) of the license.
# RUN: llc -march=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s

# GCN-LABEL: name: test_madmk_reg_imm_f32
Expand Down Expand Up @@ -188,3 +190,23 @@ body: |
%1 = V_MAC_F32_e32 %stack.0, %0, %2, implicit $exec
...

# GCN-LABEL: name: test_madak_inline_literal_f16
# GCN: %2:vgpr_32 = V_MADAK_F16 16384, killed %0, 49664, implicit $exec

# Check that the two-address-instruction pass converts a V_MAC_F16_e32 whose
# src0 is the immediate 16384 (0x4000) and whose accumulator was materialized
# from the literal 49664 (0xC200) into a single V_MADAK_F16, i.e. the FP
# immediate in src0 is accepted as an inline constant rather than blocking the
# fold into the K-form instruction.
# NOTE(review): 0x4000/0xC200 presumably encode f16 2.0 and -3.0 — confirm.
---
name: test_madak_inline_literal_f16
liveins:
- { reg: '$vgpr0', virtual-reg: '%3' }
# Input: copy of the live-in VGPR, a V_MOV of the literal, and the MAC that
# the pass is expected to rewrite into V_MADAK_F16 (see GCN check above).
body: |
bb.0:
liveins: $vgpr0
%3:vgpr_32 = COPY killed $vgpr0
%26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
%28:vgpr_32 = V_MAC_F16_e32 16384, killed %3, %26, implicit $exec
S_ENDPGM 0
...

0 comments on commit 9d4e892

Please sign in to comment.