Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Correctly insert s_nops for dst forwarding hazard #100276

Merged
merged 17 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 112 additions & 22 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -875,13 +875,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
return DataIdx >= 0 &&
TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
};

int WaitStatesNeededForDef =
VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into correct bit position of the dest register. This
/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analayzes the \p
/// MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
if (!SIInstrInfo::isVALU(MI))
jrbyrnes marked this conversation as resolved.
Show resolved Hide resolved
return nullptr;

const SIInstrInfo *TII = ST.getInstrInfo();

unsigned Opcode = MI.getOpcode();

// There are three different types of instructions
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
// CVT_SR_BF8_F32 with op_sel[3:2]
jrbyrnes marked this conversation as resolved.
Show resolved Hide resolved
// != 0
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
(AMDGPU::isFP8DstSelInst(Opcode) &&
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
SISrcMods::OP_SEL_0))))
return nullptr;
}

return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
const MachineOperand *Dst,
const SIRegisterInfo *TRI) {
// We must consider implicit reads of the VALU. SDWA with dst_sel and
// UNUSED_PRESERVE will implicitly read the result from forwarded dest,
// and we must account for that hazard.
// We also must account for WAW hazards. In particular, WAW with dest
// preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
// !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
// check for ECC. Without accounting for this hazard, the ECC will be
// wrong.
// TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
// complete zeroesHigh16BitsOfDest)
for (auto &Operand : VALU->operands()) {
if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
return true;
}
}
return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
int WaitStatesNeeded = 0;

Expand Down Expand Up @@ -912,27 +977,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
if (ST.hasDstSelForwardingHazard()) {
const int Shift16DefWaitstates = 1;

auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
if (!SIInstrInfo::isVALU(MI))
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
if (SIInstrInfo::isSDWA(MI)) {
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return false;
} else {
if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
->getImm() &
SISrcMods::DST_OP_SEL))
return false;
}
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
Register Def = Dst->getReg();
const MachineOperand *ForwardedDst =
getDstSelForwardingOperand(ProducerMI, ST);
if (ForwardedDst) {
return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
}

for (const MachineOperand &Use : VALU->explicit_uses()) {
if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
if (ProducerMI.isInlineAsm()) {
// Assume inline asm has dst forwarding hazard
for (auto &Def : ProducerMI.all_defs()) {
if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
return true;
}
}
Expand Down Expand Up @@ -1029,7 +1085,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
// problematic thus far.

// see checkVALUHazards()
if (!ST.has12DWordStoreHazard())
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
return 0;

const MachineRegisterInfo &MRI = MF.getRegInfo();
Expand All @@ -1038,11 +1094,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
for (const MachineOperand &Op :
llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
if (Op.isReg() && Op.isDef()) {
WaitStatesNeeded =
std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
if (!TRI.isVectorRegister(MRI, Op.getReg()))
continue;

if (ST.has12DWordStoreHazard()) {
WaitStatesNeeded =
std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
}
}
}

if (ST.hasDstSelForwardingHazard()) {
const int Shift16DefWaitstates = 1;

auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
// Assume inline asm reads the dst
if (Dst)
return IA->modifiesRegister(Dst->getReg(), &TRI) ||
IA->readsRegister(Dst->getReg(), &TRI);

if (ProducerMI.isInlineAsm()) {
// If MI is inline asm, assume it has dst forwarding hazard
for (auto &Def : ProducerMI.all_defs()) {
if (IA->modifiesRegister(Def.getReg(), &TRI) ||
IA->readsRegister(Def.getReg(), &TRI)) {
return true;
}
}
}

return false;
};

int WaitStatesNeededForDef =
Shift16DefWaitstates -
getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}

return WaitStatesNeeded;
}

Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2325,6 +2325,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {

field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
field bit HasFP8DstByteSel = 0;
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);

field bit HasDst = !ne(DstVT.Value, untyped.Value);
Expand Down Expand Up @@ -2904,6 +2905,15 @@ def getVCMPXOpFromVCMP : InstrMapping {
let ValueCols = [["1"]];
}

def FP8DstByteSelTable : GenericTable {
let FilterClass = "VOP3_Pseudo";
jrbyrnes marked this conversation as resolved.
Show resolved Hide resolved
let CppTypeName = "FP8DstByteSelInfo";
let Fields = ["Opcode", "HasFP8DstByteSel"];

let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getFP8DstByteSelHelper";
}

def VOPDComponentTable : GenericTable {
let FilterClass = "VOPD_Component";
let CppTypeName = "VOPDComponentInfo";
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,13 @@ struct SingleUseExceptionInfo {
bool IsInvalidSingleUseProducer;
};

struct FP8DstByteSelInfo {
uint16_t Opcode;
bool HasFP8DstByteSel;
};

#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
Expand Down Expand Up @@ -629,6 +636,11 @@ bool isInvalidSingleUseProducerInst(unsigned Opc) {
return Info && Info->IsInvalidSingleUseProducer;
}

bool isFP8DstSelInst(unsigned Opc) {
const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
return Info ? Info->HasFP8DstByteSel : false;
}

unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,9 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo);
LLVM_READONLY
bool isTrue16Inst(unsigned Opc);

LLVM_READONLY
bool isFP8DstSelInst(unsigned Opc);

LLVM_READONLY
bool isInvalidSingleUseConsumerInst(unsigned Opc);

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
let HasSrc2Mods = 1;
let HasExtVOP3DPP = 1;
let HasOpSel = 1;
let HasFP8DstByteSel = 1;
let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
getAsmVOP3OpSel<3, HasClamp, HasOMod,
HasSrc0FloatMods, HasSrc1FloatMods,
Expand All @@ -587,6 +588,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
let IsFP8DstByteSel = 1;
let HasFP8DstByteSel = 1;
let HasClamp = 0;
defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsWMMA = P.IsWMMA;
let IsSWMMAC = P.IsSWMMAC;

bit HasFP8DstByteSel = P.HasFP8DstByteSel;

let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
!if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
Expand Down
Loading