diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 40f4b74b70cf1b..02a0518ab5ab27 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -69,7 +69,6 @@ #include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "Utils/AMDGPUMCUtils.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -405,67 +404,6 @@ static bool isReachable(const MachineInstr *From, (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); } -// Writelane is special in that it can use SGPR and M0 (which would normally -// count as using the constant bus twice - but in this case it is allowed as the -// lane selector doesn't count as a use of the constant bus). -// However, it is still required to abide by the 1 SGPR rule -// Apply a fix here as we might have multiple SGPRs after legalizing VGPRs to -// SGPRs -static bool fixWriteLane(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - bool Changed = false; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - - if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32) { - int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - MachineOperand &Src1 = MI.getOperand(Src1Idx); - - // Check to see if the instruction violates the 1 SGPR rule - if ((Src0.isReg() && TRI->isSGPRReg(MRI, Src0.getReg()) && Src0.getReg() != AMDGPU::M0) && - (Src1.isReg() && 
TRI->isSGPRReg(MRI, Src1.getReg()) && Src1.getReg() != AMDGPU::M0)) { - - // Check for trivially easy constant prop into one of the operands - // If this is the case then perform the operation now to resolve SGPR - // issue - bool Resolved = false; - std::vector<MachineOperand *> MOs { &Src0, &Src1 }; - for (auto MO : MOs ) { - auto Imm = AMDGPU::foldToImm(*MO, &MRI, TII); - if (Imm && TII->isInlineConstant(APInt(64, *Imm, true))) { - MO->ChangeToImmediate(*Imm); - Changed = true; - Resolved = true; - break; - } - } - - if (!Resolved) { - // Haven't managed to resolve by replacing an SGPR with an immediate - // Move src1 to be in M0 - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .add(Src1); - Src1.ChangeToRegister(AMDGPU::M0, false); - Changed = true; - } - } - } - } - - return Changed; -} - // Return the first non-prologue instruction in the block. static MachineBasicBlock::iterator getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { @@ -750,8 +688,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } - fixWriteLane(MF); - if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 06b9ba45b88e30..9b3b2436475ce5 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -26,7 +26,6 @@ #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPUMCUtils.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -78,6 +77,8 @@ class SIPeepholeSDWA : public MachineFunctionPass { std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; SmallVector<MachineInstr *, 8> ConvertedInstructions; + Optional<int64_t> foldToImm(const MachineOperand &Op) const; + public: static char ID; @@ -518,6 +519,33 @@ bool 
SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, return SDWADstOperand::convertToSDWA(MI, TII); } +Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { + if (Op.isImm()) { + return Op.getImm(); + } + + // If this is not immediate then it can be copy of immediate value, e.g.: + // %1 = S_MOV_B32 255; + if (Op.isReg()) { + for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { + if (!isSameReg(Op, Def)) + continue; + + const MachineInstr *DefInst = Def.getParent(); + if (!TII->isFoldableCopy(*DefInst)) + return None; + + const MachineOperand &Copied = DefInst->getOperand(1); + if (!Copied.isImm()) + return None; + + return Copied.getImm(); + } + } + + return None; +} + std::unique_ptr<SDWAOperand> SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); @@ -537,7 +565,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { // from: v_lshlrev_b32_e32 v1, 16/24, v0 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - auto Imm = AMDGPU::foldToImm(*Src0, MRI, TII); + auto Imm = foldToImm(*Src0); if (!Imm) break; @@ -578,7 +606,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { // from: v_lshlrev_b16_e32 v1, 8, v0 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - auto Imm = AMDGPU::foldToImm(*Src0, MRI, TII); + auto Imm = foldToImm(*Src0); if (!Imm || *Imm != 8) break; @@ -618,12 +646,12 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { // 24 | 8 | BYTE_3 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - auto Offset = AMDGPU::foldToImm(*Src1, MRI, TII); + auto Offset = foldToImm(*Src1); if (!Offset) break; MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - auto Width = AMDGPU::foldToImm(*Src2, MRI, TII); + auto Width = foldToImm(*Src2); if (!Width) break; @@ -666,10 +694,10 @@ 
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); auto ValSrc = Src1; - auto Imm = AMDGPU::foldToImm(*Src0, MRI, TII); + auto Imm = foldToImm(*Src0); if (!Imm) { - Imm = AMDGPU::foldToImm(*Src1, MRI, TII); + Imm = foldToImm(*Src1); ValSrc = Src0; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMCUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMCUtils.cpp deleted file mode 100644 index 57a5e2b2927324..00000000000000 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMCUtils.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===- AMDGPUMCUtils.cpp - MachineIR utils for AMDGPU -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCUtils.h" - -namespace llvm { - -namespace AMDGPU { - -static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { - return LHS.isReg() && - RHS.isReg() && - LHS.getReg() == RHS.getReg() && - LHS.getSubReg() == RHS.getSubReg(); -} - -Optional<int64_t> foldToImm(const MachineOperand &Op, - const MachineRegisterInfo *MRI, const SIInstrInfo *TII) { - if (Op.isImm()) { - return Op.getImm(); - } - - // If this is not immediate then it can be copy of immediate value, e.g.: - // %1 = S_MOV_B32 255; - if (Op.isReg()) { - for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { - if (!isSameReg(Op, Def)) - continue; - - const MachineInstr *DefInst = Def.getParent(); - if (!TII->isFoldableCopy(*DefInst)) - return None; - - const MachineOperand &Copied = DefInst->getOperand(1); - if (!Copied.isImm()) - return None; - - return Copied.getImm(); - } - } - - return None; -} - -} // end namespace AMDGPU -} // end namespace llvm diff --git 
a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMCUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMCUtils.h deleted file mode 100644 index d9effcc86e12cd..00000000000000 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMCUtils.h +++ /dev/null @@ -1,25 +0,0 @@ -//===- AMDGPUMCUtils.h - MachineIR utils for AMDGPU -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMCUTILS_H -#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMCUTILS_H - -#include "AMDGPU.h" -#include "SIInstrInfo.h" - -namespace llvm { -namespace AMDGPU { - Optional<int64_t> foldToImm(const MachineOperand &Op, - const MachineRegisterInfo *MRI, const SIInstrInfo *TII); - -} // namespace AMDGPU -} // namespace llvm - -#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMCUTILS_H diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt index 5f2cfb8e29c8fd..893e1fb7223908 100644 --- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -3,5 +3,4 @@ add_llvm_library(LLVMAMDGPUUtils AMDKernelCodeTUtils.cpp AMDGPUAsmUtils.cpp AMDGPUPALMetadata.cpp - AMDGPUMCUtils.cpp ) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index b792cf676d68fa..2d0ebe8edb7289 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0 ; CHECK-LABEL: {{^}}test_writelane_sreg: -; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0 +; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %oldval = load i32, i32 addrspace(1)* 
%out %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x ; CHECK-LABEL: {{^}}test_writelane_m0_sreg: ; CHECK: s_mov_b32 m0, -1 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 -; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0 +; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}} define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %oldval = load i32, i32 addrspace(1)* %out %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) ; CHECK-LABEL: {{^}}test_writelane_sreg_oldval: ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}} -; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0 +; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval) store i32 %writelane, i32 addrspace(1)* %out, align 4 @@ -68,7 +68,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 add ; CHECK-LABEL: {{^}}test_writelane_imm_oldval: ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42 -; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0 +; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42) store i32 %writelane, i32 addrspace(1)* %out, align 4