From 92f9ef7f238ed8e2d06f148fe6e7607fdd573794 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 23 Jul 2024 15:14:14 -0700 Subject: [PATCH 01/17] [AMDGPU] Correctly insert s_nops for implicit read of SDWA Change-Id: I4e22bb3764705f328827eb64704720a0d6aa1a9b --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 24 +++++++++ .../AMDGPU/sdwa-dst-preserve-hazard.mir | 54 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a402fc6d7e6110..570fcf63587fd6 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -935,6 +935,30 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) return true; } + + // If the non-impacted bits of a sdwa dst are preserved, then we + // read them and must resolve the hazard. SDWA dst preserve is modelled + // as a tied-def implicit use + if (SIInstrInfo::isSDWA(*VALU)) { + if (auto *DstSel = + TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) { + if (DstSel->getImm() == AMDGPU::SDWA::DWORD) + return false; + if (auto *ThisDst = + TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + if (!TRI->regsOverlap(Def, ThisDst->getReg())) + return false; + for (const MachineOperand &ImplicitOp : + VALU->implicit_operands()) { + if (ImplicitOp.isDef()) + continue; + if (ImplicitOp.isReg() && + TRI->regsOverlap(Def, ImplicitOp.getReg())) + return true; + } + } + } + } } return false; diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir new file mode 100644 index 00000000000000..ec3b61c4776eef --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s + +--- +name: hazard_vcmpx_sdwa_permlane16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6 + ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6 + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec + ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec + ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec + ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3 + ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6 + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec + ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3 + ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1 + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr7 = COPY $vgpr6, implicit $exec + renamable $vgpr6 = COPY $vgpr5, implicit $exec + renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec + KILL killed renamable $vgpr2, renamable $vgpr3 + KILL killed renamable $vgpr0, renamable $vgpr1 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec + renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0) + GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec + S_ENDPGM 0 +... From c6b2c2c0bbcf8d3ae9f1b633db37d7b0fa51ee1a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 24 Jul 2024 12:36:40 -0700 Subject: [PATCH 02/17] Handle opsel case + address review comments Change-Id: I131dbb24762a53f31af0aa685c48b6c14233bf3d --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 36 +++--- llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 103 ++++++++++++++++++ 2 files changed, 116 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 570fcf63587fd6..66349e9527725f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -875,6 +875,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, return DataIdx >= 0 && TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg); }; + int WaitStatesNeededForDef = VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); @@ -931,33 +932,22 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { Register Def = Dst->getReg(); - for (const MachineOperand &Use : VALU->explicit_uses()) { + for (const MachineOperand &Use : VALU->all_uses()) { if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) return true; } - // If the non-impacted bits of a sdwa dst are preserved, then we - // read them and must resolve the hazard. SDWA dst preserve is modelled - // as a tied-def implicit use - if (SIInstrInfo::isSDWA(*VALU)) { - if (auto *DstSel = - TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) { - if (DstSel->getImm() == AMDGPU::SDWA::DWORD) - return false; - if (auto *ThisDst = - TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { - if (!TRI->regsOverlap(Def, ThisDst->getReg())) - return false; - for (const MachineOperand &ImplicitOp : - VALU->implicit_operands()) { - if (ImplicitOp.isDef()) - continue; - if (ImplicitOp.isReg() && - TRI->regsOverlap(Def, ImplicitOp.getReg())) - return true; - } - } - } + // We also read the dst for sub 32 writes to the same register for ECC + if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) { + Register ThisDef = ThisDst->getReg(); + if (!TRI->regsOverlap(Def, ThisDef)) + return false; + if (AMDGPU::hasNamedOperand(VALU->getOpcode(), + AMDGPU::OpName::op_sel) && + TII->getNamedOperand(*VALU, AMDGPU::OpName::src0_modifiers) + ->getImm() & + SISrcMods::DST_OP_SEL) + return true; } } diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir new file mode 100644 index 00000000000000..a8a26b49a57e98 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s + +--- +name: sdwa_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: opsel_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + +--- +name: opsel_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: sdwa_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... From 86c8316e99a9e5ccdca8d32e2d92890062abafd2 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 24 Jul 2024 13:59:08 -0700 Subject: [PATCH 03/17] Add minimal support for INLINEASM correctness Change-Id: Icbe306650f5513787db77acd49eedbb08d8f2149 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 70 ++++-- llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 221 ++++++++++++++++++ 2 files changed, 275 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 66349e9527725f..c87cdd043b8899 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -883,6 +883,39 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, return WaitStatesNeeded; } +static const MachineOperand * +getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { + const SIInstrInfo *TII = ST.getInstrInfo(); + if (SIInstrInfo::isVALU(MI) && SIInstrInfo::isSDWA(MI)) { + if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) + if (DstSel->getImm() == AMDGPU::SDWA::DWORD) + return nullptr; + } else if (SIInstrInfo::isVALU(MI)) { + if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || + !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & + SISrcMods::DST_OP_SEL)) + return nullptr; + } + + const MachineOperand *Dst = nullptr; + + if (SIInstrInfo::isVALU(MI)) + Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + // Assume inline asm has dst forwarding hazard + else if (MI.isInlineAsm()) { + for (auto &Op : + llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { + if (Op.isReg() && Op.isDef()) { + Dst = &Op; + break; + } + } + } + + return Dst; +} + int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { int WaitStatesNeeded = 0; @@ -914,22 +947,10 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { const int Shift16DefWaitstates = 1; auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { - if (!SIInstrInfo::isVALU(MI)) - return false; const SIInstrInfo *TII = ST.getInstrInfo(); - if (SIInstrInfo::isSDWA(MI)) { - if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) - if (DstSel->getImm() == AMDGPU::SDWA::DWORD) - return false; - } else { - if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || - !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) - ->getImm() & - SISrcMods::DST_OP_SEL)) - return false; - } const SIRegisterInfo *TRI = ST.getRegisterInfo(); - if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); + if (Dst) { Register Def = Dst->getReg(); for (const MachineOperand &Use : VALU->all_uses()) { @@ -950,7 +971,6 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { return true; } } - return false; }; @@ -1043,7 +1063,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { // problematic thus far. // see checkVALUHazards() - if (!ST.has12DWordStoreHazard()) + if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard()) return 0; const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1054,6 +1074,24 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { if (Op.isReg() && Op.isDef()) { WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); + + if (!TRI.isVectorRegister(MRI, Op.getReg())) + continue; + + if (ST.hasDstSelForwardingHazard()) { + const int Shift16DefWaitstates = 1; + + auto IsShift16BitDefFn = [this](const MachineInstr &MI) { + const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); + // Assume inline asm reads the dst + return Dst ? true : false; + }; + + int WaitStatesNeededForDef = + Shift16DefWaitstates - + getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index a8a26b49a57e98..55dfa8cb640091 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -27,6 +27,30 @@ body: | S_ENDPGM 0 ... +--- +name: sdwa_no_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_no_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_no_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + --- name: opsel_sdwa_hazard body: | @@ -52,6 +76,54 @@ body: | S_ENDPGM 0 ... +--- +name: opsel_no_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_no_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + S_ENDPGM 0 +... + +--- +name: no_opsel_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: no_opsel_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + --- name: opsel_opsel_hazard body: | @@ -77,6 +149,54 @@ body: | S_ENDPGM 0 ... +--- +name: opsel_no_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_no_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_no_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: no_opsel_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: no_opsel_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: no_opsel_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... + --- name: sdwa_sdwa_hazard body: | @@ -101,3 +221,104 @@ body: | renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... + +--- +name: sdwa_nosdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_nosdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_nosdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + S_ENDPGM 0 +... + +--- +name: inline_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: inline_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: inline_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + +--- +name: sdwa_inline_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_inline_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_inline_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + S_ENDPGM 0 +... + + +--- +name: inline_inline_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: inline_inline_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: inline_inline_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: S_ENDPGM 0 + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + S_ENDPGM 0 +... + From b292ded9d932d9724fe63541d303e40f95fe66a4 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 24 Jul 2024 15:03:11 -0700 Subject: [PATCH 04/17] Review comments Change-Id: Ib799d777e86306ce2ad4d74c421123e4420668e4 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 37 ++++++------- .../AMDGPU/sdwa-dst-preserve-hazard.mir | 54 ------------------- 2 files changed, 17 insertions(+), 74 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c87cdd043b8899..e20b1b54d5b017 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -885,35 +885,32 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { + // Assume inline asm has dst forwarding hazard + if (MI.isInlineAsm()) { + for (auto &Op : + llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { + if (Op.isReg() && Op.isDef()) { + return &Op; + } + } + } + const SIInstrInfo *TII = ST.getInstrInfo(); - if (SIInstrInfo::isVALU(MI) && SIInstrInfo::isSDWA(MI)) { + if (!SIInstrInfo::isVALU(MI)) + return nullptr; + + if (SIInstrInfo::isSDWA(MI)) { if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) if (DstSel->getImm() == AMDGPU::SDWA::DWORD) return nullptr; - } else if (SIInstrInfo::isVALU(MI)) { + } else { if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & SISrcMods::DST_OP_SEL)) return nullptr; } - const MachineOperand *Dst = nullptr; - - if (SIInstrInfo::isVALU(MI)) - Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - - // Assume inline asm has dst forwarding hazard - else if (MI.isInlineAsm()) { - for (auto &Op : - llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { - if (Op.isReg() && Op.isDef()) { - Dst = &Op; - break; - } - } - } - - return Dst; + return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); } int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { @@ -1084,7 +1081,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { auto IsShift16BitDefFn = [this](const MachineInstr &MI) { const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); // Assume inline asm reads the dst - return Dst ? true : false; + return (bool)Dst; }; int WaitStatesNeededForDef = diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir deleted file mode 100644 index ec3b61c4776eef..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir +++ /dev/null @@ -1,54 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s - ---- -name: hazard_vcmpx_sdwa_permlane16 -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6 - ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16 - ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6 - ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec - ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec - ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec - ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec - ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3 - ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1 - ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec - ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec - ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec - ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec - ; HAZARD-NEXT: S_ENDPGM 0 - ; - ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16 - ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6 - ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec - ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3 - ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1 - ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec - ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr7 = COPY $vgpr6, implicit $exec - renamable $vgpr6 = COPY $vgpr5, implicit $exec - renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec - renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec - KILL killed renamable $vgpr2, renamable $vgpr3 - KILL killed renamable $vgpr0, renamable $vgpr1 - renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec - renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec - renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec - renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0) - GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec - S_ENDPGM 0 -... From 63a0cf28f3aa214ab6837efbb8d733b4c56f31f4 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 24 Jul 2024 15:08:54 -0700 Subject: [PATCH 05/17] Only init TII when needed Change-Id: I4f461219cd534229f21a36785808e785a5cb06ce --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index e20b1b54d5b017..1542a1056ec1d3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -895,10 +895,10 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { } } - const SIInstrInfo *TII = ST.getInstrInfo(); if (!SIInstrInfo::isVALU(MI)) return nullptr; + const SIInstrInfo *TII = ST.getInstrInfo(); if (SIInstrInfo::isSDWA(MI)) { if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) if (DstSel->getImm() == AMDGPU::SDWA::DWORD) From 7a6b34574b1b3f315bff0263a6e94a9c7b7e29e2 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 25 Jul 2024 12:12:48 -0700 Subject: [PATCH 06/17] Add snop for WAW with lo bits of dst Change-Id: I29938c50257b6ebf302aafcd5b23d7484d06d89a --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 1542a1056ec1d3..ad7f694dec4a9f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -960,11 +960,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { Register ThisDef = ThisDst->getReg(); if (!TRI->regsOverlap(Def, ThisDef)) return false; - if (AMDGPU::hasNamedOperand(VALU->getOpcode(), - AMDGPU::OpName::op_sel) && - TII->getNamedOperand(*VALU, AMDGPU::OpName::src0_modifiers) - ->getImm() & - SISrcMods::DST_OP_SEL) + if (TII->isVOP3(*VALU) && !TII->isVOP3P(*VALU)) return true; } } From 930d15ce58e5aa7ceabd96df93d200793f46c596 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 25 Jul 2024 14:18:22 -0700 Subject: [PATCH 07/17] Review comments Change-Id: I9882238a6227c7e6522e00fa98e1a3f18f7b4ec2 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 42 ++++++--- llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 92 ++++++++----------- 2 files changed, 65 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index ad7f694dec4a9f..8a6494d7599b01 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -885,16 +885,6 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { - // Assume inline asm has dst forwarding hazard - if (MI.isInlineAsm()) { - for (auto &Op : - llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { - if (Op.isReg() && Op.isDef()) { - return &Op; - } - } - } - if (!SIInstrInfo::isVALU(MI)) return nullptr; @@ -946,8 +936,21 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); - if (Dst) { + SmallVector Dsts; + const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST); + if (ForwardedDst) { + Dsts.push_back(ForwardedDst); + } else if (MI.isInlineAsm()) { + // Assume inline asm has dst forwarding hazard + for (auto &Op : + drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { + if (Op.isReg() && Op.isDef()) { + Dsts.push_back(&Op); + } + } + } + + for (auto Dst : Dsts) { Register Def = Dst->getReg(); for (const MachineOperand &Use : VALU->all_uses()) { @@ -1077,7 +1080,20 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { auto IsShift16BitDefFn = [this](const MachineInstr &MI) { const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); // Assume inline asm reads the dst - return (bool)Dst; + if (Dst) + return true; + + if (MI.isInlineAsm()) { + // Assume other inline asm has dst forwarding hazard + for (auto &Op : + drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { + if (Op.isReg() && Op.isDef()) { + return true; + } + } + } + + return false; }; int WaitStatesNeededForDef = diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index 55dfa8cb640091..cda7f2a679fe06 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -13,41 +13,42 @@ body: | ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: sdwa_opsel_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec S_ENDPGM 0 ... --- -name: sdwa_no_opsel_hazard +name: sdwa_lo_opsel_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; HAZARD-LABEL: name: sdwa_no_opsel_hazard + ; HAZARD-LABEL: name: sdwa_lo_opsel_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; - ; NOHAZARD-LABEL: name: sdwa_no_opsel_hazard + ; NOHAZARD-LABEL: name: sdwa_lo_opsel_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -60,7 +61,7 @@ body: | ; HAZARD-LABEL: name: opsel_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 @@ -68,10 +69,10 @@ body: | ; NOHAZARD-LABEL: name: opsel_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... @@ -85,17 +86,17 @@ body: | ; HAZARD-LABEL: name: opsel_no_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec S_ENDPGM 0 ... @@ -109,17 +110,17 @@ body: | ; HAZARD-LABEL: name: no_opsel_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... @@ -133,67 +134,46 @@ body: | ; HAZARD-LABEL: name: opsel_opsel_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: opsel_opsel_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec S_ENDPGM 0 ... ---- -name: opsel_no_opsel_hazard -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - - ; HAZARD-LABEL: name: opsel_no_opsel_hazard - ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; HAZARD-NEXT: S_ENDPGM 0 - ; - ; NOHAZARD-LABEL: name: opsel_no_opsel_hazard - ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - S_ENDPGM 0 -... +# TODO -- there is no reason for s_nop --- -name: no_opsel_opsel_hazard +name: opsel_opsel_no_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; HAZARD-LABEL: name: no_opsel_opsel_hazard + ; HAZARD-LABEL: name: opsel_opsel_no_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; - ; NOHAZARD-LABEL: name: no_opsel_opsel_hazard + ; NOHAZARD-LABEL: name: opsel_opsel_no_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; NOHAZARD-NEXT: S_ENDPGM 0 - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec S_ENDPGM 0 ... From 6bc5b9742269243dfceaf6508eda5aa003ff7c5e Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 30 Jul 2024 15:18:55 -0700 Subject: [PATCH 08/17] Add V_CVT_SR_*8_F32 as having forwarded dest + Address comments Change-Id: If3662fe303c5ae4608a3fdb6bc2b231326cd4af0 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 40 +++-- llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 144 ++++++++++++++++-- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 2 + 3 files changed, 167 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 8a6494d7599b01..8dff228fe9de2c 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -889,14 +889,29 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { return nullptr; const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned Opcode = MI.getOpcode(); + + // There are three different types of instructions + // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 with + // op_sel[3] != 0, and 3. CVR_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2] + // != 0 + if (SIInstrInfo::isSDWA(MI)) { + // Type 1: SDWA with dst_sel != DWORD if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) if (DstSel->getImm() == AMDGPU::SDWA::DWORD) return nullptr; } else { - if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || + // Type 2 && Type 3: (VOP3 with op_sel[3] != 0) || (CVT_SR_FP8_F32 and + // CVT_SR_BF8_F32 with op_sel[3:2] != 0) + if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) || !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & - SISrcMods::DST_OP_SEL)) + SISrcMods::DST_OP_SEL || + ((Opcode == AMDGPU::V_CVT_SR_BF8_F32_e64 || + Opcode == AMDGPU::V_CVT_SR_FP8_F32_e64) && + (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & + SISrcMods::OP_SEL_0)))) return nullptr; } @@ -953,18 +968,23 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { for (auto Dst : Dsts) { Register Def = Dst->getReg(); + // We must consider implicit reads of the VALU. SDWA with dst_sel and + // UNUSED_PRESERVE will implicitly read the result from forwarded dest, + // and we must account for that hazard. for (const MachineOperand &Use : VALU->all_uses()) { if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) return true; } - // We also read the dst for sub 32 writes to the same register for ECC + if (!TII->isVOP3(*VALU)) + return false; + + // We also must account for WAW hazards. In particular, WAW hazards with + // dest op_sel and dest preserve semantics will read the forwarded dest + // for parity check for ECC. Without accounting for this hazard, the ECC + // will be wrong. if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) { - Register ThisDef = ThisDst->getReg(); - if (!TRI->regsOverlap(Def, ThisDef)) - return false; - if (TII->isVOP3(*VALU) && !TII->isVOP3P(*VALU)) - return true; + return TRI->regsOverlap(Def, ThisDst->getReg()); } } return false; @@ -1077,7 +1097,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { if (ST.hasDstSelForwardingHazard()) { const int Shift16DefWaitstates = 1; - auto IsShift16BitDefFn = [this](const MachineInstr &MI) { + auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) { const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); // Assume inline asm reads the dst if (Dst) @@ -1087,7 +1107,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { // Assume other inline asm has dst forwarding hazard for (auto &Op : drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { - if (Op.isReg() && Op.isDef()) { + if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) { return true; } } diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index cda7f2a679fe06..fd97f74d0c744d 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -77,20 +77,21 @@ body: | S_ENDPGM 0 ... + --- -name: opsel_no_sdwa_hazard +name: opsel_no_sdwa_no_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; HAZARD-LABEL: name: opsel_no_sdwa_hazard + ; HAZARD-LABEL: name: opsel_no_sdwa_no_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; - ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard + ; NOHAZARD-LABEL: name: opsel_no_sdwa_no_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec @@ -102,19 +103,19 @@ body: | ... --- -name: no_opsel_sdwa_hazard +name: no_opsel_sdwa_no_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; HAZARD-LABEL: name: no_opsel_sdwa_hazard + ; HAZARD-LABEL: name: no_opsel_sdwa_no_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 ; - ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard + ; NOHAZARD-LABEL: name: no_opsel_sdwa_no_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec @@ -177,6 +178,32 @@ body: | S_ENDPGM 0 ... +# DS_READ_U16_D16 has dest preserve semantics, but only VALU consumers have hazard + +--- +name: sdwa_loadsel_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_loadsel_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_loadsel_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec + renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec + S_ENDPGM 0 +... + --- name: sdwa_sdwa_hazard body: | @@ -203,19 +230,118 @@ body: | ... --- -name: sdwa_nosdwa_hazard +name: cvt_sdwa_hazard_1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_hazard_1 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_1 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + +--- +name: cvt_sdwa_hazard_2 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_hazard_2 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_2 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + +--- +name: cvt_sdwa_hazard_3 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_hazard_3 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_3 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + +--- +name: cvt_sdwa_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... + +--- +name: sdwa_nosdwa_no_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode - ; HAZARD-LABEL: name: sdwa_nosdwa_hazard + ; HAZARD-LABEL: name: sdwa_nosdwa_no_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; - ; NOHAZARD-LABEL: name: sdwa_nosdwa_hazard + ; NOHAZARD-LABEL: name: sdwa_nosdwa_no_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index d3fc96d7ff8012..8313f5b655efba 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -375,6 +375,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -469,6 +470,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; From 7744d2c6230f041c89458ec74eb77a820a131ed9 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 31 Jul 2024 10:43:43 -0700 Subject: [PATCH 09/17] Update comments Change-Id: I3011b7a6092cb056bf6095966dee9bc6be0586b7 --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 8dff228fe9de2c..b7066de1203696 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -883,6 +883,12 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, return WaitStatesNeeded; } +/// Dest sel forwarding issue occurs if additional logic is needed to swizzle / +/// pack the computed value into correct bit position of the dest register. This +/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with +/// dst_sel that is not aligned to the register. This function analayzes the \p +/// MI and \returns an operand with dst setl forwarding issue, or nullptr if +/// none exists. static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { if (!SIInstrInfo::isVALU(MI)) @@ -893,17 +899,17 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { unsigned Opcode = MI.getOpcode(); // There are three different types of instructions - // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 with - // op_sel[3] != 0, and 3. CVR_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2] + // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 + // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and + // CVT_SR_BF8_F32 with op_sel[3:2] // != 0 - if (SIInstrInfo::isSDWA(MI)) { // Type 1: SDWA with dst_sel != DWORD if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) if (DstSel->getImm() == AMDGPU::SDWA::DWORD) return nullptr; } else { - // Type 2 && Type 3: (VOP3 with op_sel[3] != 0) || (CVT_SR_FP8_F32 and + // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and // CVT_SR_BF8_F32 with op_sel[3:2] != 0) if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) || !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & From d1b308415bdc5e349b0e36eb1a13b9ad288a23d3 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 2 Aug 2024 09:24:52 -0700 Subject: [PATCH 10/17] Insert s_nop for all RAW and WAW Change-Id: I838ccf7fc3e7263b95a179d4c3364d6297a9ac41 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index b7066de1203696..13ed58e80f1d78 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -887,7 +887,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, /// pack the computed value into correct bit position of the dest register. This /// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with /// dst_sel that is not aligned to the register. This function analayzes the \p -/// MI and \returns an operand with dst setl forwarding issue, or nullptr if +/// MI and \returns an operand with dst forwarding issue, or nullptr if /// none exists. static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { @@ -972,25 +972,19 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { } for (auto Dst : Dsts) { - Register Def = Dst->getReg(); - // We must consider implicit reads of the VALU. SDWA with dst_sel and // UNUSED_PRESERVE will implicitly read the result from forwarded dest, // and we must account for that hazard. - for (const MachineOperand &Use : VALU->all_uses()) { - if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) - return true; - } - - if (!TII->isVOP3(*VALU)) - return false; - - // We also must account for WAW hazards. In particular, WAW hazards with - // dest op_sel and dest preserve semantics will read the forwarded dest + // We also must account for WAW hazards. In particular, WAW with dest preserve semantics + // (e.g. VOP3 with op_sel, VOP2 && !zeroesHigh16BitsOfDest) will read the forwarded dest // for parity check for ECC. Without accounting for this hazard, the ECC // will be wrong. - if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) { - return TRI->regsOverlap(Def, ThisDst->getReg()); + // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. complete zeroesHigh16BitsOfDest) + for (auto &Operand : VALU->operands()) { + if (Operand.isReg() && + TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { + return true; + } } } return false; From b02ce1694fb2a342e44c4bb4c5ee692711e0999e Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 2 Aug 2024 10:43:33 -0700 Subject: [PATCH 11/17] Use searchable table instead of hardcoding instructions Change-Id: Iecb574091f5389be115d5f853f3e06bcb396f7f9 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 28 ++++++++++--------- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 10 +++++++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12 ++++++++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 1 + llvm/lib/Target/AMDGPU/VOPInstructions.td | 2 ++ llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 6 ++++ 7 files changed, 49 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 13ed58e80f1d78..44c8020f512d3e 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -914,8 +914,7 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) || !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & SISrcMods::DST_OP_SEL || - ((Opcode == AMDGPU::V_CVT_SR_BF8_F32_e64 || - Opcode == AMDGPU::V_CVT_SR_FP8_F32_e64) && + (AMDGPU::isFP8DstSelInst(Opcode) && (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & SISrcMods::OP_SEL_0)))) return nullptr; @@ -955,7 +954,6 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { const int Shift16DefWaitstates = 1; auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { - const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); SmallVector Dsts; const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST); @@ -975,14 +973,16 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { // We must consider implicit reads of the VALU. SDWA with dst_sel and // UNUSED_PRESERVE will implicitly read the result from forwarded dest, // and we must account for that hazard. - // We also must account for WAW hazards. In particular, WAW with dest preserve semantics - // (e.g. VOP3 with op_sel, VOP2 && !zeroesHigh16BitsOfDest) will read the forwarded dest - // for parity check for ECC. Without accounting for this hazard, the ECC - // will be wrong. - // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. complete zeroesHigh16BitsOfDest) + // We also must account for WAW hazards. In particular, WAW with dest + // preserve semantics (e.g. VOP3 with op_sel, VOP2 && + // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity + // check for ECC. Without accounting for this hazard, the ECC will be + // wrong. + // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. + // complete zeroesHigh16BitsOfDest) for (auto &Operand : VALU->operands()) { if (Operand.isReg() && - TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { + TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { return true; } } @@ -1088,12 +1088,14 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { for (const MachineOperand &Op : llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) { if (Op.isReg() && Op.isDef()) { - WaitStatesNeeded = - std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); - if (!TRI.isVectorRegister(MRI, Op.getReg())) continue; + if (ST.has12DWordStoreHazard()) { + WaitStatesNeeded = + std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); + } + if (ST.hasDstSelForwardingHazard()) { const int Shift16DefWaitstates = 1; @@ -1104,7 +1106,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { return true; if (MI.isInlineAsm()) { - // Assume other inline asm has dst forwarding hazard + // If MI is inline asm, assume it has dst forwarding hazard for (auto &Op : drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index d2838349340d2e..d0cb15629ac1ee 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2325,6 +2325,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsFP8SrcByteSel = 0; field bit IsFP8DstByteSel = 0; + field bit HasFP8DstByteSel = 0; field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel); field bit HasDst = !ne(DstVT.Value, untyped.Value); @@ -2904,6 +2905,15 @@ def getVCMPXOpFromVCMP : InstrMapping { let ValueCols = [["1"]]; } +def FP8DstByteSelTable : GenericTable { + let FilterClass = "VOP3_Pseudo"; + let CppTypeName = "FP8DstByteSelInfo"; + let Fields = ["Opcode", "HasFP8DstByteSel"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getFP8DstByteSelHelper"; +} + def VOPDComponentTable : GenericTable { let FilterClass = "VOPD_Component"; let CppTypeName = "VOPDComponentInfo"; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 96d4863e94014c..c3a648b3c2e06d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -385,6 +385,13 @@ struct SingleUseExceptionInfo { bool IsInvalidSingleUseProducer; }; +struct FP8DstByteSelInfo { + uint16_t Opcode; + bool HasFP8DstByteSel; +}; + +#define GET_FP8DstByteSelTable_DECL +#define GET_FP8DstByteSelTable_IMPL #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -629,6 +636,11 @@ bool isInvalidSingleUseProducerInst(unsigned Opc) { return Info && Info->IsInvalidSingleUseProducer; } +bool isFP8DstSelInst(unsigned Opc) { + const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc); + return Info ? Info->HasFP8DstByteSel : false; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? Info->Opcode3Addr : ~0u; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 429c3ad335d213..cb31cd690101d0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -861,6 +861,9 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo); LLVM_READONLY bool isTrue16Inst(unsigned Opc); +LLVM_READONLY +bool isFP8DstSelInst(unsigned Opc); + LLVM_READONLY bool isInvalidSingleUseConsumerInst(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index efa8e9c74d4495..b3691ff48e64d7 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -568,6 +568,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, let HasSrc2Mods = 1; let HasExtVOP3DPP = 1; let HasOpSel = 1; + let HasFP8DstByteSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f2ed17ac305a1b..ae97a66a5827bd 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -113,6 +113,8 @@ class VOP3_Pseudo pattern = [], let IsWMMA = P.IsWMMA; let IsSWMMAC = P.IsSWMMAC; + bit HasFP8DstByteSel = !if(isVop3OpSel, P.HasFP8DstByteSel, 0); + let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64)); diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index fd97f74d0c744d..e24817078d8bc9 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -78,6 +78,8 @@ body: | ... +# TODO -- there is no reason for s_nop (V_ADD_U16 doesn't preserve the dest) + --- name: opsel_no_sdwa_no_hazard body: | @@ -88,6 +90,7 @@ body: | ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; @@ -328,6 +331,8 @@ body: | S_ENDPGM 0 ... +# TODO -- there is no reason for s_nop (V_ADD_U16 doesn't preserve the dest) + --- name: sdwa_nosdwa_no_hazard body: | @@ -338,6 +343,7 @@ body: | ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec ; HAZARD-NEXT: S_ENDPGM 0 ; From 7f43ff672c193dc68508a2370832a421aaea9890 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 5 Aug 2024 09:28:10 -0700 Subject: [PATCH 12/17] Consistently use HasFP8DstByteSel Change-Id: Iddb81c71b16d48242e26fcb598e4d824efc55a3c --- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 1 + llvm/lib/Target/AMDGPU/VOPInstructions.td | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index b3691ff48e64d7..01b1c63dd0d6e0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -588,6 +588,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, class VOP3_CVT_SR_F8_ByteSel_Profile : VOP3_Profile> { let IsFP8DstByteSel = 1; + let HasFP8DstByteSel = 1; let HasClamp = 0; defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel); let Ins64 = !con(getIns64 pattern = [], let IsWMMA = P.IsWMMA; let IsSWMMAC = P.IsSWMMAC; - bit HasFP8DstByteSel = !if(isVop3OpSel, P.HasFP8DstByteSel, 0); + bit HasFP8DstByteSel = P.HasFP8DstByteSel; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, From 23d28ff25bec611ab3fd64ffc2a11d276079c941 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 16 Aug 2024 12:03:04 -0700 Subject: [PATCH 13/17] Review comments Change-Id: I001760b83d26846912a24f0ed1c3f1b9be90915a --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 53 ++++++++++--------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 44c8020f512d3e..fd2f4810ec9c8c 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -923,6 +923,30 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); } +/// Checks whether the provided \p MI "consumes" the operand with a Dest sel +/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit +/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW) +static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, + const MachineOperand *Dst, + const SIRegisterInfo *TRI) { + // We must consider implicit reads of the VALU. SDWA with dst_sel and + // UNUSED_PRESERVE will implicitly read the result from forwarded dest, + // and we must account for that hazard. + // We also must account for WAW hazards. In particular, WAW with dest + // preserve semantics (e.g. VOP3 with op_sel, VOP2 && + // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity + // check for ECC. Without accounting for this hazard, the ECC will be + // wrong. + // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. + // complete zeroesHigh16BitsOfDest) + for (auto &Operand : VALU->operands()) { + if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { + return true; + } + } + return false; +} + int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { int WaitStatesNeeded = 0; @@ -955,38 +979,17 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); - SmallVector Dsts; const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST); if (ForwardedDst) { - Dsts.push_back(ForwardedDst); + return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); } else if (MI.isInlineAsm()) { // Assume inline asm has dst forwarding hazard - for (auto &Op : - drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { - if (Op.isReg() && Op.isDef()) { - Dsts.push_back(&Op); - } - } - } - - for (auto Dst : Dsts) { - // We must consider implicit reads of the VALU. SDWA with dst_sel and - // UNUSED_PRESERVE will implicitly read the result from forwarded dest, - // and we must account for that hazard. - // We also must account for WAW hazards. In particular, WAW with dest - // preserve semantics (e.g. VOP3 with op_sel, VOP2 && - // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity - // check for ECC. Without accounting for this hazard, the ECC will be - // wrong. - // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. - // complete zeroesHigh16BitsOfDest) - for (auto &Operand : VALU->operands()) { - if (Operand.isReg() && - TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { + for (auto &Def : MI.all_defs()) { + if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) return true; - } } } + return false; }; From a62e5c81f7760b133a1a8667366a9889250ebeed Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 19 Aug 2024 12:40:17 -0700 Subject: [PATCH 14/17] Consistent with downstream Change-Id: I56ec0c004b941678bc5538b6eca5f02db5e1712e --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 50 ++++++++++--------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index fd2f4810ec9c8c..309560b9322030 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -982,7 +982,9 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST); if (ForwardedDst) { return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); - } else if (MI.isInlineAsm()) { + } + + if (MI.isInlineAsm()) { // Assume inline asm has dst forwarding hazard for (auto &Def : MI.all_defs()) { if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) @@ -1098,35 +1100,35 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); } + } + } - if (ST.hasDstSelForwardingHazard()) { - const int Shift16DefWaitstates = 1; + if (ST.hasDstSelForwardingHazard()) { + const int Shift16DefWaitstates = 1; - auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) { - const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); - // Assume inline asm reads the dst - if (Dst) - return true; + auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) { + const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); + // Assume inline asm reads the dst + if (Dst) + return true; - if (MI.isInlineAsm()) { - // If MI is inline asm, assume it has dst forwarding hazard - for (auto &Op : - drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { - if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) { - return true; - } - } + if (MI.isInlineAsm()) { + // If MI is inline asm, assume it has dst forwarding hazard + for (auto &Op : + drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { + if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) { + return true; } + } + } - return false; - }; + return false; + }; - int WaitStatesNeededForDef = - Shift16DefWaitstates - - getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); - WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); - } - } + int WaitStatesNeededForDef = + Shift16DefWaitstates - + getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); } return WaitStatesNeeded; From 98388f59cc54d3fd00445556b335b170d1edffe7 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 19 Aug 2024 13:49:44 -0700 Subject: [PATCH 15/17] Don't process all inlinseAsm regs Change-Id: I4afe3d78f83da3ab0f37d98047e9f5182c7c0610 --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 309560b9322030..7d7e2516a51df8 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1110,13 +1110,12 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); // Assume inline asm reads the dst if (Dst) - return true; + return IA->modifiesRegister(Dst->getReg(), &TRI) || IA->readsRegister(Dst->getReg(), &TRI); if (MI.isInlineAsm()) { // If MI is inline asm, assume it has dst forwarding hazard - for (auto &Op : - drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) { - if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) { + for (auto &Def : MI.all_defs()) { + if (IA->modifiesRegister(Def.getReg(), &TRI) || IA->readsRegister(Def.getReg(), &TRI)) { return true; } } From 2f4b5eec2e0a8299817be6ab1435821827f2f099 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 19 Aug 2024 14:00:40 -0700 Subject: [PATCH 16/17] Formatting Change-Id: Ib101d4d205a09bf3c7a4c3d4a8e82a480d0c47e6 --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 7d7e2516a51df8..82adcab56784b6 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1110,12 +1110,14 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); // Assume inline asm reads the dst if (Dst) - return IA->modifiesRegister(Dst->getReg(), &TRI) || IA->readsRegister(Dst->getReg(), &TRI); + return IA->modifiesRegister(Dst->getReg(), &TRI) || + IA->readsRegister(Dst->getReg(), &TRI); if (MI.isInlineAsm()) { // If MI is inline asm, assume it has dst forwarding hazard for (auto &Def : MI.all_defs()) { - if (IA->modifiesRegister(Def.getReg(), &TRI) || IA->readsRegister(Def.getReg(), &TRI)) { + if (IA->modifiesRegister(Def.getReg(), &TRI) || + IA->readsRegister(Def.getReg(), &TRI)) { return true; } } From cd0c3c49b6be18f6a0ae1af167077cc5b7aa4500 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 21 Aug 2024 15:58:12 -0700 Subject: [PATCH 17/17] Review Comments --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 82adcab56784b6..e811024b73d31f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -977,16 +977,17 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { if (ST.hasDstSelForwardingHazard()) { const int Shift16DefWaitstates = 1; - auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { + auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST); + const MachineOperand *ForwardedDst = + getDstSelForwardingOperand(ProducerMI, ST); if (ForwardedDst) { return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); } - if (MI.isInlineAsm()) { + if (ProducerMI.isInlineAsm()) { // Assume inline asm has dst forwarding hazard - for (auto &Def : MI.all_defs()) { + for (auto &Def : ProducerMI.all_defs()) { if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) return true; } @@ -1106,16 +1107,16 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { if (ST.hasDstSelForwardingHazard()) { const int Shift16DefWaitstates = 1; - auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) { - const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST); + auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) { + const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST); // Assume inline asm reads the dst if (Dst) return IA->modifiesRegister(Dst->getReg(), &TRI) || IA->readsRegister(Dst->getReg(), &TRI); - if (MI.isInlineAsm()) { + if (ProducerMI.isInlineAsm()) { // If MI is inline asm, assume it has dst forwarding hazard - for (auto &Def : MI.all_defs()) { + for (auto &Def : ProducerMI.all_defs()) { if (IA->modifiesRegister(Def.getReg(), &TRI) || IA->readsRegister(Def.getReg(), &TRI)) { return true;