Commit 3274df1

Overwritten with: f3e0a98 Revert "[AMDGPU] Make SGPR spills exec mask agnostic"

Based on upstream llvm: 1813451 [Test] Add test showing missing opportunity of folding ICmp(Phi(Consts...))

Local changes since 0c3b9cd:
f3e0a98 Revert "[AMDGPU] Make SGPR spills exec mask agnostic"

Added AMD modification notices and removed some GPL files.

Change-Id: I6a0fa2b92ab6ddd7e10ef1e03f3e7f46b8b2cd1f
trenouf committed Jun 10, 2020
2 parents 0c3b9cd + f3e0a98 commit 3274df1
Showing 8 changed files with 342 additions and 741 deletions.
305 changes: 84 additions & 221 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -871,145 +871,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
}

// Generate a VMEM access which loads or stores the VGPR containing an SGPR
// spill such that all the lanes set in VGPRLanes are loaded or stored.
// This generates exec mask manipulation and will use SGPRs available in MI
// or VGPR lanes in the VGPR to save and restore the exec mask.
void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
int Index, int Offset,
unsigned EltSize, Register VGPR,
int64_t VGPRLanes,
RegScavenger *RS,
bool IsLoad) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = ST.getInstrInfo();

Register SuperReg = MI->getOperand(0).getReg();
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
unsigned FirstPart = isWave32 ? Offset * 16 : Offset * 32;

bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();

const bool SuperRegIsExec =
SuperReg == AMDGPU::EXEC || SuperReg == AMDGPU::EXEC_LO;

// If exec mask is stored in the VGPR, make sure it is stored after
// any lanes used by the spill (16 lanes on Wave32, 32 lanes on Wave64).
const unsigned ExecLoLane = SuperRegIsExec ? 0 : (isWave32 ? 16 : 32);
const unsigned ExecHiLane = SuperRegIsExec ? 1 : (isWave32 ? 17 : 33);

// Try to use the src/dst SGPRs to hold a copy of the exec mask.
// Use VGPR lanes when this is not possible, i.e. when the src value
// must remain valid after the spill or the src is smaller than the exec mask.
bool StoreExecInVGPR = !IsLoad && (SuperRegIsExec || !IsKill);

// On Wave32 only handle EXEC_LO.
// On Wave64 only update EXEC_HI if there is sufficient space for a copy.
bool OnlyExecLo = isWave32 || NumSubRegs == 1;

unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Register SavedExecReg;

// Backup EXEC
if (SuperRegIsExec) {
// Do nothing; exec is already stored in VGPR or will be overwritten
} else if (StoreExecInVGPR) {
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
VGPR)
.addReg(AMDGPU::EXEC_LO)
.addImm(ExecLoLane)
.addReg(VGPR, getUndefRegState(IsLoad));

if (!isWave32) {
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
VGPR)
.addReg(AMDGPU::EXEC_HI)
.addImm(ExecHiLane)
.addReg(VGPR);
}
} else {
if (OnlyExecLo) {
SavedExecReg = NumSubRegs == 1
? SuperReg
: getSubReg(SuperReg, SplitParts[FirstPart]);
} else {
SavedExecReg =
getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart]),
AMDGPU::sub0, &AMDGPU::SGPR_64RegClass);
// If src/dst is an odd size it is possible subreg0 is not aligned.
if (!SavedExecReg && NumSubRegs > 2)
SavedExecReg =
getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart + 1]),
AMDGPU::sub0, &AMDGPU::SGPR_64RegClass);
}

assert(SavedExecReg);
BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
}

// Setup EXEC
BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);

// Load/store VGPR
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
? getBaseRegister()
: getFrameRegister(*MF);

Align Alignment = FrameInfo.getObjectAlign(Index);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(*MF, Index);
MachineMemOperand *MMO = MF->getMachineMemOperand(
PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
EltSize, Alignment);

if (IsLoad) {
buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
Index,
VGPR, false,
MFI->getScratchRSrcReg(), FrameReg,
Offset * EltSize, MMO,
RS);
} else {
buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
Index,
VGPR, !StoreExecInVGPR,
MFI->getScratchRSrcReg(), FrameReg,
Offset * EltSize, MMO,
RS);
// This only ever adds one VGPR spill
MFI->addToSpilledVGPRs(1);
}

// Restore EXEC
if (SuperRegIsExec && IsLoad) {
// Do nothing; exec will be overwritten
} else if (StoreExecInVGPR) {
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
AMDGPU::EXEC_LO)
.addReg(VGPR, getKillRegState(!IsLoad && isWave32))
.addImm(ExecLoLane);
if (!isWave32) {
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
AMDGPU::EXEC_HI)
.addReg(VGPR, getKillRegState(!IsLoad))
.addImm(ExecHiLane);
}
} else {
assert(SavedExecReg);
BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
.addReg(SavedExecReg, RegState::Kill);
}
}
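
The comment at the top of the deleted function describes packing SGPR subregisters into VGPR lanes and forcing the exec mask to exactly those lanes around a single buffer access. The VGPRLanes value it receives is computed by the spill/restore loops further down in this diff. As a rough illustration, here is a hypothetical standalone sketch of that lane-mask arithmetic in plain C++ (not LLVM API; the names merely mirror the diff):

// Hypothetical sketch of the lane-mask math behind the deleted
// exec-mask-agnostic SGPR spill path. Not part of the patch.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const bool IsWave32 = false;    // assume a wave64 target for this example
  const unsigned NumSubRegs = 16; // e.g. spilling an SGPR_512 (16 x 32-bit)

  // Spill lanes available per VGPR (the remaining lanes can hold an exec backup).
  unsigned PerVGPR = IsWave32 ? 16 : 32;
  unsigned NumVGPRs = (NumSubRegs + PerVGPR - 1) / PerVGPR; // ceil(NumSubRegs / PerVGPR)
  int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

  // 16 subregisters on wave64 fit in one VGPR; exec would be forced to 0xFFFF.
  std::printf("VGPRs needed: %u, lane mask: 0x%llx\n", NumVGPRs,
              (unsigned long long)VGPRLanes);
  return 0;
}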

bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
@@ -1031,6 +892,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();

MachineFrameInfo &FrameInfo = MF->getFrameInfo();

assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
SuperReg != MFI->getFrameOffsetReg()));

@@ -1042,10 +905,17 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

if (SpillToVGPR) {
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
// Scavenged temporary VGPR to use. It must be scavenged once for any number
// of spilled subregs.
Register TmpVGPR;

// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

// During SGPR spilling to VGPR, determine if the VGPR is defined. The
@@ -1067,52 +937,42 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
// it are fixed.
}
} else {
// Scavenged temporary VGPR to use. It must be scavenged once for any number
// of spilled subregs.
Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);

unsigned PerVGPR = isWave32 ? 16 : 32;
unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
unsigned TmpVGPRFlags = RegState::Undef;

// Write sub registers into the VGPR
for (unsigned i = Offset * PerVGPR,
e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

MachineInstrBuilder WriteLane =
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
TmpVGPR)
.addReg(SubReg, SubKillState)
.addImm(i % PerVGPR)
.addReg(TmpVGPR, TmpVGPRFlags);
TmpVGPRFlags = 0;

// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (NumSubRegs > 1) {
// The last implicit use of the SuperReg carries the "Kill" flag.
unsigned SuperKillState = 0;
if (i + 1 == NumSubRegs)
SuperKillState |= getKillRegState(IsKill);
WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
}
} else {
// XXX - Can the spill to VGPR fail for some subregisters but not others?
if (OnlyToVGPR)
return false;

// Spill SGPR to a frame index.
if (!TmpVGPR.isValid())
TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

MachineInstrBuilder Mov
= BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg, SubKillState);

// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (NumSubRegs > 1) {
// The last implicit use of the SuperReg carries the "Kill" flag.
unsigned SuperKillState = 0;
if (i + 1 == e)
SuperKillState |= getKillRegState(IsKill);
Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
}

// Write out VGPR
buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
RS, false);
Align Alignment = FrameInfo.getObjectAlign(Index);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
MachineMemOperand *MMO =
MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize,
commonAlignment(Alignment, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
.addReg(TmpVGPR, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
.addImm(i * 4) // offset
.addMemOperand(MMO);
}
}
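
In the restored else-branch above, each SGPR subregister is moved into a scavenged VGPR and written out one dword at a time, with the memory operand's offset and alignment derived from the element index (EltSize * i and commonAlignment(Alignment, EltSize * i)). A hedged sketch of that offset/alignment arithmetic, assuming commonAlignment clamps the base alignment to the largest power of two dividing the offset (as in LLVM's Alignment helpers):

// Hypothetical helper mimicking the per-subregister MMO layout; not LLVM code.
#include <algorithm>
#include <cstdio>

static unsigned commonAlign(unsigned BaseAlign, unsigned Offset) {
  // Offset 0 keeps the base alignment; otherwise take the lowest set bit of
  // the offset, capped by the base alignment.
  unsigned OffsetAlign = Offset ? (Offset & ~(Offset - 1u)) : BaseAlign;
  return std::min(BaseAlign, OffsetAlign);
}

int main() {
  const unsigned EltSize = 4, NumSubRegs = 4, SlotAlign = 16;
  for (unsigned i = 0; i < NumSubRegs; ++i)
    std::printf("subreg %u: offset %u, align %u\n", i, EltSize * i,
                commonAlign(SlotAlign, EltSize * i));
  // Expected output: offsets 0,4,8,12 with alignments 16,4,8,4.
  return 0;
}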

@@ -1135,6 +995,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (OnlyToVGPR && !SpillToVGPR)
return false;

MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();

@@ -1149,11 +1010,13 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

if (SpillToVGPR) {
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
Register TmpVGPR;

for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
auto MIB =
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
@@ -1163,36 +1026,36 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,

if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
}
} else {
Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

unsigned PerVGPR = isWave32 ? 16 : 32;
unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
// Load in VGPR data
buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
RS, true);

// Unpack lanes
for (unsigned i = Offset * PerVGPR,
e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
i < e; ++i) {
Register SubReg =
NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

bool LastSubReg = (i + 1 == e);
auto MIB =
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
.addReg(TmpVGPR, getKillRegState(LastSubReg))
.addImm(i);

if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
}
} else {
if (OnlyToVGPR)
return false;

// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
if (!TmpVGPR.isValid())
TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
Align Alignment = FrameInfo.getObjectAlign(Index);

MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

MachineMemOperand *MMO =
MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, EltSize,
commonAlignment(Alignment, EltSize * i));

BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
.addImm(i * 4) // offset
.addMemOperand(MMO);

auto MIB =
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpVGPR, RegState::Kill);

if (NumSubRegs > 1)
MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
}
}
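
Note the asymmetry between the two restore strategies visible in this hunk: the deleted path reloaded one VGPR covering many subregisters and unpacked it with V_READLANE_B32 at explicit lane indices, while the restored path reloads a full VGPR per subregister and pulls the value back with V_READFIRSTLANE_B32. A hypothetical scalar model of the two lane reads (illustration only, not compiler or hardware code):

// Rough model of v_readlane (explicit lane index) vs. v_readfirstlane
// (lowest lane enabled in exec). Assumes wave64 and the usual convention
// that an all-zero exec falls back to lane 0.
#include <array>
#include <cstdint>
#include <cstdio>

using VGPR = std::array<uint32_t, 64>; // one 32-bit value per lane

static uint32_t readLane(const VGPR &V, unsigned Lane) { return V[Lane]; }

static uint32_t readFirstLane(const VGPR &V, uint64_t Exec) {
  unsigned Lane = Exec ? __builtin_ctzll(Exec) : 0; // lowest active lane
  return V[Lane];
}

int main() {
  VGPR V{};
  for (unsigned i = 0; i < 64; ++i)
    V[i] = 100 + i;
  std::printf("v_readlane lane 5   -> %u\n", readLane(V, 5));          // 105
  std::printf("v_readfirstlane     -> %u\n", readFirstLane(V, ~0ull)); // 100
  return 0;
}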

5 changes: 0 additions & 5 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -105,11 +105,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;

void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
int Offset, unsigned EltSize, Register VGPR,
int64_t VGPRLanes, RegScavenger *RS,
bool IsLoad) const;

/// If \p OnlyToVGPR is true, this will only succeed if the spill can be
/// done entirely to VGPR lanes.
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,