[AMDGPU] Optimize SGPR spills #93668
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Vikash Gupta (vg0204)

Changes
Patch is 69.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93668.diff 12 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index f850767270a4f..172583bc76fa8 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -371,6 +371,8 @@ namespace llvm {
/// StackSlotColoring - This pass performs stack slot coloring.
extern char &StackSlotColoringID;
+ FunctionPass *
+ createStackSlotColoring(bool preserveRegAllocNeededAnalysis = false);
/// This pass lays out funclets contiguously.
extern char &FuncletLayoutID;
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 9fdc8a338b52a..ab58e3dd369cf 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -64,6 +65,10 @@ namespace {
MachineFrameInfo *MFI = nullptr;
const TargetInstrInfo *TII = nullptr;
const MachineBlockFrequencyInfo *MBFI = nullptr;
+ SlotIndexes *Indexes = nullptr;
+
+ // - preserves Analysis passes in case RA may be called afterwards.
+ bool preserveRegAllocNeededAnalysis = false;
// SSIntervals - Spill slot intervals.
std::vector<LiveInterval*> SSIntervals;
@@ -140,7 +145,9 @@ namespace {
public:
static char ID; // Pass identification
- StackSlotColoring() : MachineFunctionPass(ID) {
+ StackSlotColoring(bool preserveRegAllocNeededAnalysis_ = false)
+ : MachineFunctionPass(ID),
+ preserveRegAllocNeededAnalysis(preserveRegAllocNeededAnalysis_) {
initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
}
@@ -152,6 +159,16 @@ namespace {
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addPreservedID(MachineDominatorsID);
+
+  // In some targets' pipelines, register allocation (RA) might be split into
+  // multiple phases based on register class. So, this pass may be invoked
+  // multiple times, requiring it to preserve these analyses for use by RA
+  // later.
+ if (preserveRegAllocNeededAnalysis) {
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<LiveDebugVariables>();
+ }
+
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -496,8 +513,10 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
++I;
}
-  for (MachineInstr *MI : toErase)
+  for (MachineInstr *MI : toErase) {
+    Indexes->removeMachineInstrFromMaps(*MI);
     MI->eraseFromParent();
+  }
return changed;
}
@@ -515,6 +534,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
LS = &getAnalysis<LiveStacks>();
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ Indexes = &getAnalysis<SlotIndexes>();
bool Changed = false;
@@ -549,3 +569,8 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+
+FunctionPass *
+llvm::createStackSlotColoring(bool preserveRegAllocNeededAnalysis) {
+ return new StackSlotColoring(preserveRegAllocNeededAnalysis);
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..2a4b9c2f87e65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1406,6 +1406,12 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));
+  // By this point, SGPR RA is done and has introduced SGPR spills to the stack
+  // frame through the SGPR alloc pass. Invoking StackSlotColoring here may let
+  // these SGPR spills reuse stack slots before they are further lowered via
+  // SILowerSGPRSpills (i.e. the equivalent of PEI for SGPRs).
+ addPass(createStackSlotColoring(/*preserveRegAllocNeededAnalysis*/ true));
+
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(&SIPreAllocateWWMRegsID);
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index b6a0152f6fa83..fa3aa9d7091b7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -278,15 +278,16 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
for (auto Reg : MFI->getSGPRSpillVGPRs()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks) {
MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
+
DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
TII->get(AMDGPU::IMPLICIT_DEF), Reg);
MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
// Set SGPR_SPILL asm printer flag
MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
- if (LIS) {
+
+ if (LIS)
LIS->InsertMachineInstrInMaps(*MIB);
- }
}
}
@@ -300,6 +301,7 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
TII->get(TargetOpcode::KILL));
MIB.addReg(Reg);
+
if (LIS)
LIS->InsertMachineInstrInMaps(*MIB);
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4b5f9bdd82b8d..e5829e8193218 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1775,8 +1775,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
if (SpillToVGPR) {
- assert(SB.NumSubRegs == VGPRSpills.size() &&
- "Num of VGPR lanes should be equal to num of SGPRs spilled");
+ assert(SB.NumSubRegs <= VGPRSpills.size() &&
+ "Num of VGPR lanes should be greater or equal to num of SGPRs "
+ "spilled, as Stack Slot Coloring pass assigns different SGPR spills "
+ "into same stack slots");
for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
Register SubReg =
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfa..882eab9ba761f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -362,10 +362,12 @@
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: Virtual Register Rewriter
+; GCN-O1-NEXT: Stack Slot Coloring
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
+; GCN-O1-NEXT: Live Stack Slot Analysis
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: GCN NSA Reassign
@@ -665,10 +667,12 @@
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
+; GCN-O1-OPTS-NEXT: Stack Slot Coloring
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
+; GCN-O1-OPTS-NEXT: Live Stack Slot Analysis
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
@@ -974,10 +978,12 @@
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: Virtual Register Rewriter
+; GCN-O2-NEXT: Stack Slot Coloring
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
+; GCN-O2-NEXT: Live Stack Slot Analysis
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: GCN NSA Reassign
@@ -1295,10 +1301,12 @@
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: Virtual Register Rewriter
+; GCN-O3-NEXT: Stack Slot Coloring
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
+; GCN-O3-NEXT: Live Stack Slot Analysis
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index fbe34a3a3970b..25e9e09748c81 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -221,15 +221,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: ; def s29
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: v_writelane_b32 v40, s21, 24
-; GFX906-NEXT: v_writelane_b32 v40, s22, 25
-; GFX906-NEXT: v_writelane_b32 v40, s23, 26
-; GFX906-NEXT: v_writelane_b32 v40, s24, 27
-; GFX906-NEXT: v_writelane_b32 v40, s25, 28
-; GFX906-NEXT: v_writelane_b32 v40, s26, 29
-; GFX906-NEXT: v_writelane_b32 v40, s27, 30
-; GFX906-NEXT: v_writelane_b32 v40, s28, 31
-; GFX906-NEXT: v_writelane_b32 v40, s29, 32
+; GFX906-NEXT: v_writelane_b32 v40, s21, 12
+; GFX906-NEXT: v_writelane_b32 v40, s22, 13
+; GFX906-NEXT: v_writelane_b32 v40, s23, 14
+; GFX906-NEXT: v_writelane_b32 v40, s24, 15
+; GFX906-NEXT: v_writelane_b32 v40, s25, 16
+; GFX906-NEXT: v_writelane_b32 v40, s26, 17
+; GFX906-NEXT: v_writelane_b32 v40, s27, 18
+; GFX906-NEXT: v_writelane_b32 v40, s28, 19
+; GFX906-NEXT: v_writelane_b32 v40, s29, 20
; GFX906-NEXT: v_readlane_b32 s4, v40, 10
; GFX906-NEXT: v_readlane_b32 s6, v40, 0
; GFX906-NEXT: v_readlane_b32 s8, v40, 8
@@ -249,39 +249,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
-; GFX906-NEXT: v_readlane_b32 s21, v40, 24
+; GFX906-NEXT: v_readlane_b32 s21, v40, 12
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s21
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s22, v40, 25
+; GFX906-NEXT: v_readlane_b32 s22, v40, 13
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s22
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s23, v40, 26
+; GFX906-NEXT: v_readlane_b32 s23, v40, 14
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s23
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s24, v40, 27
+; GFX906-NEXT: v_readlane_b32 s24, v40, 15
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s24
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s25, v40, 28
+; GFX906-NEXT: v_readlane_b32 s25, v40, 16
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s25
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s26, v40, 29
+; GFX906-NEXT: v_readlane_b32 s26, v40, 17
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s26
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s27, v40, 30
+; GFX906-NEXT: v_readlane_b32 s27, v40, 18
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s27
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s28, v40, 31
+; GFX906-NEXT: v_readlane_b32 s28, v40, 19
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s28
; GFX906-NEXT: ;;#ASMEND
-; GFX906-NEXT: v_readlane_b32 s29, v40, 32
+; GFX906-NEXT: v_readlane_b32 s29, v40, 20
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s29
; GFX906-NEXT: ;;#ASMEND
@@ -602,15 +602,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: ; def s29
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT: v_writelane_b32 v40, s21, 24
-; GFX908-NEXT: v_writelane_b32 v40, s22, 25
-; GFX908-NEXT: v_writelane_b32 v40, s23, 26
-; GFX908-NEXT: v_writelane_b32 v40, s24, 27
-; GFX908-NEXT: v_writelane_b32 v40, s25, 28
-; GFX908-NEXT: v_writelane_b32 v40, s26, 29
-; GFX908-NEXT: v_writelane_b32 v40, s27, 30
-; GFX908-NEXT: v_writelane_b32 v40, s28, 31
-; GFX908-NEXT: v_writelane_b32 v40, s29, 32
+; GFX908-NEXT: v_writelane_b32 v40, s21, 12
+; GFX908-NEXT: v_writelane_b32 v40, s22, 13
+; GFX908-NEXT: v_writelane_b32 v40, s23, 14
+; GFX908-NEXT: v_writelane_b32 v40, s24, 15
+; GFX908-NEXT: v_writelane_b32 v40, s25, 16
+; GFX908-NEXT: v_writelane_b32 v40, s26, 17
+; GFX908-NEXT: v_writelane_b32 v40, s27, 18
+; GFX908-NEXT: v_writelane_b32 v40, s28, 19
+; GFX908-NEXT: v_writelane_b32 v40, s29, 20
; GFX908-NEXT: v_readlane_b32 s4, v40, 10
; GFX908-NEXT: v_readlane_b32 s6, v40, 0
; GFX908-NEXT: v_readlane_b32 s8, v40, 8
@@ -630,39 +630,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
-; GFX908-NEXT: v_readlane_b32 s21, v40, 24
+; GFX908-NEXT: v_readlane_b32 s21, v40, 12
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s21
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s22, v40, 25
+; GFX908-NEXT: v_readlane_b32 s22, v40, 13
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s22
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s23, v40, 26
+; GFX908-NEXT: v_readlane_b32 s23, v40, 14
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s23
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s24, v40, 27
+; GFX908-NEXT: v_readlane_b32 s24, v40, 15
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s24
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s25, v40, 28
+; GFX908-NEXT: v_readlane_b32 s25, v40, 16
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s25
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s26, v40, 29
+; GFX908-NEXT: v_readlane_b32 s26, v40, 17
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s26
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s27, v40, 30
+; GFX908-NEXT: v_readlane_b32 s27, v40, 18
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s27
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s28, v40, 31
+; GFX908-NEXT: v_readlane_b32 s28, v40, 19
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s28
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_readlane_b32 s29, v40, 32
+; GFX908-NEXT: v_readlane_b32 s29, v40, 20
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s29
; GFX908-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index 17a19116735e4..04a9f3cb2d332 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -17,10 +17,12 @@
; DEFAULT: Greedy Register Allocator
; DEFAULT-NEXT: Virtual Register Rewriter
+; DEFAULT-NEXT: Stack Slot Coloring
; DEFAULT-NEXT: SI lower SGPR spill instructions
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
+; DEFAULT-NEXT: Live Stack Slot Analysis
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: GCN NSA Reassign
@@ -50,10 +52,12 @@
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Basic Register Allocator
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
+; BASIC-DEFAULT-NEXT: Stack Slot Coloring
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
+; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
@@ -69,10 +73,12 @@
; DEFAULT-BASIC: Greedy Register Allocator
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
+; DEFAULT-BASIC-NEXT: Stack Slot Coloring
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
+; DEFAULT-BASIC-NEXT: Live Stack Slot Analysis
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
@@ -90,10 +96,12 @@
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: Virtual Register Rewriter
+; BASIC-BASIC-NEXT: Stack Slot Coloring
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
+; BASIC-BASIC-NEXT: Live Stack Slot Analysis
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index bea2e6d4b45a3..e1bd1523d78a4 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10098,7 +10098,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT: s_mov_b32 s2, 0x84400
+; GFX6-NEXT: s_mov_b32 s2, 0x86a00
; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10108,7 +10108,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT: s_mov_b32 s2, 0x84000
+; GFX6-NEXT: s_mov_b32 s2, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10117,7 +10117,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT: s_mov_b32 s2, 0x83c00
+; GFX6-NEXT: s_mov_b32 s2, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10126,7 +10126,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT: s_mov_b32 s2, 0x83800
+; GFX6-NEXT: s_mov_b32 s2, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt...
[truncated]
@@ -67,6 +67,9 @@ namespace {
   const MachineBlockFrequencyInfo *MBFI = nullptr;
   SlotIndexes *Indexes = nullptr;

   // - preserves Analysis passes in case RA may be called afterwards.
   bool preserveRegAllocNeededAnalysis = false;
There shouldn't be a reason to have a configuration flag for this. The pass can just unconditionally preserve the analyses.
This should also be a separate PR, not just a separate commit in this PR
Ok sure, I should put the generic pass changes in a separate PR.
But the same StackSlotColoring pass already exists in a generic form (after RA) in other targets' pipelines, where these analyses do not really need to be preserved. In this case, it is just additionally invoked in the middle of the AMDGPU target pipeline (where the additional analyses do need to be preserved). Therefore, it was suggested to introduce this flag to avoid any changes to other architectures' pipelines (which might otherwise be caused by the additional analysis preservation in StackSlotColoring).
Preserved does not mean required. Make them unconditionally preserved. Unless something later also requires them, they shouldn't remain live
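For illustration, a minimal sketch of what the suggested unconditional preservation might look like in StackSlotColoring::getAnalysisUsage (the elided calls stand in for whatever the pass already requests; this is a sketch, not the final patch):

```cpp
void StackSlotColoring::getAnalysisUsage(AnalysisUsage &AU) const {
  // ... the pass's existing addRequired<>/addPreserved<> calls stay as-is ...
  AU.addPreservedID(MachineDominatorsID);
  // Preserved does not mean required: keep these analyses up to date for a
  // later RA run without forcing them to be computed when nothing needs them.
  AU.addPreserved<LiveIntervals>();
  AU.addPreserved<LiveDebugVariables>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
```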
Force-pushed from 6da7a6e to 8e5fedb.
Yes, the generic changes should go in a separate patch.
@@ -0,0 +1,127 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-before=prologepilog -o - %s | FileCheck -check-prefix=SHARE %s
-mcpu is missing in all new tests.
Since the RUN lines are the same, combine all 3 tests into a single .mir file.
-mcpu is missing in all new tests. Since the RUN lines are the same, combine all 3 tests into a single .mir file.
Can you enlighten me on what CPU to mention, as I am not sure exactly how to decide on one.
Also, kindly comment on the configuration flag related changes.
Please keep subject lines concise. "Implemented a patch" just wastes words. If you remove that part then you can explain a little bit more about the optimization.
Point noted!
This patch needs a pre-commit PSDB test.
@@ -0,0 +1,353 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
Should precommit the test. But I'm not sure this test currently does anything significant for the slot optimization.
Add another RUN line to show the output after stack-slot-coloring as well, to see the effect of this pass.
Should a short description be added to each test explaining what it is meant for?
I don't see the slot indices being shared for the SGPR spills.
I believe I added RUN lines for both BEFORE and AFTER the stack slot coloring pass (the one invoked just after SGPR regalloc), and the reuse of stack indices can be seen.
@@ -548,4 +548,4 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   Assignments.clear();

   return Changed;
-}
+}
Something is wrong at the EOF. With this PR, there shouldn't be any change in StackSlotColoring.cpp.
Will make appropriate changes.
@@ -284,9 +284,8 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
       MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
       // Set SGPR_SPILL asm printer flag
       MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
-      if (LIS) {
+      if (LIS)
Revert this change. Make sure you don't introduce unwanted changes in any file. Preview once before committing the code changes.
-    assert(SB.NumSubRegs == VGPRSpills.size() &&
-           "Num of VGPR lanes should be equal to num of SGPRs spilled");
+    assert(SB.NumSubRegs <= VGPRSpills.size() &&
+           "Num of VGPR lanes should be greater or equal to num of SGPRs "
Keep this string as concise as possible. Any extra details you want to add should be included in a comment statement before the assert. The original string looks good. You might want to change "should be equal to" -> "should be less or equal to".
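One possible way to apply this suggestion, with the extra detail moved into a comment and the assert string kept short (the comment wording here is illustrative only):

```cpp
// Stack slot coloring may map SGPR spills of different sizes onto the same
// slot, so the slot can provide more VGPR lanes than this particular spill needs.
assert(SB.NumSubRegs <= VGPRSpills.size() &&
       "Num of VGPR lanes should be less or equal to num of SGPRs spilled");
```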
I think the number of VGPR lanes corresponds to the spill slot size (equal to the largest spill stack object residing in it), which should be equal to or greater than the number of SGPR spills created, which corresponds to this spill stack object's size.
The new test llvm/test/CodeGen/AMDGPU/stack-slot-color-after-sgpr-regalloc.mir needs to be written precisely to show the stack indices being shared. See my previous comments.
@@ -1775,10 +1775,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,

   if (SpillToVGPR) {

+    // Since stack slot coloring pass is trying to optimize SGPR spills,
+    // VGPR lanes (mapped from spill stack slot) may be shared for unequal SGPR
Change unequal SGPR spills into SGPR spills of different sizes.
"Num of VGPR lanes should be greater or equal to num of SGPRs " | ||
"spilled, as Stack Slot Coloring pass assigns different SGPR spills " | ||
"into same stack slots"); | ||
"Num of VGPR lanes mapped should be greater or equal to num of " |
I guess it is more appropriate to use 'shouldn't be less than' instead of 'should be greater or equal'
I guess it is more appropriate to use 'shouldn't be less than' instead of 'should be greater or equal'
Then should I reverse the assert statement, so that 'shouldn't be less than' can be used?
While going through it, I found a problem in the test case output; I need to reassess my changes in this PR and am currently working on it.
Everything is alright! No changes are needed.
@@ -0,0 +1,295 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefix=CHECK %s
Should use a different check-prefix instead of the default CHECK. Maybe STACK-SLOT-OPT?
There are now two instances of stack-slot-coloring in our pipeline. You're interested in the first invocation. Better specify it with the index too, like -stop-after=stack-slot-coloring,0. The index zero here is meant for stopping after the first instance.
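Putting these suggestions together, the RUN line might end up looking roughly like this (a sketch only: the gfx908 CPU, the STACK-SLOT-OPT prefix, and the `,0` instance index are all taken from the review comments above):

```
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stress-regalloc=3 \
# RUN:   -start-before=greedy -stop-after=stack-slot-coloring,0 -o - %s \
# RUN:   | FileCheck -check-prefix=STACK-SLOT-OPT %s
```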
@@ -0,0 +1,295 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
Test filename can be optimize-vgpr-lanes-for-sgpr-spills.mir?
But I think that filename is only apt once the patch is included, and here that is not done yet, right?
Force-pushed from 8ae045f to 42224f4.
Introduced the StackSlotColoring pass after SGPR regalloc and spill to optimize stack slot reuse.
Raised a separate PR to deal with the generic changes to the StackSlotColoring pass needed to resolve the current ticket. Merged the multiple MIR test cases into one to be concise.
@vg0204 Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested by our build bots. Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. If you don't get any reports, no action is required from you. Your changes are working as expected, well done!
Hi, I believe this turned one of our buildbots red: https://lab.llvm.org/buildbot/#/builders/123/builds/253
Yeah, that would be great!
I can confirm that this PR caused a regression in the Blender test of the buildbot:
Currently the buildbot has disabled all Blender test scenes so it appears green, but we would like to get this fixed ASAP since this commit may get to the amd-staging branch and cause other regressions. The debugger shows the memfault happens at the highlighted line; the real instruction causing the memfault is most likely the
kernel-hip-amdgcn-amd-amdhsa-gfx908.zip

To accurately reproduce the assembly file, use the following clang command:
Getting this while trying to reproduce.
This is due to mixing upstream and amd-staging with the different debug info.
Can you please elaborate?
See #56194
Let me try getting a bitcode without debug info.
res.new.zip
The generated assembly has debug info, so it is the same as the assembly shared previously, and not the one present in res.new.zip!
Also, I do not have any clue how to proceed with the bitcode & assembly files you provided, @yxsamliu, to help fix the Blender test regression caused by this commit. So if you could help me out with that, it would be great!
The idea is to locate the buffer_load_dword in integrator_intersect_shadow which has invalid s[0:3] in the bad case, and locate the counterpart in the good case. By comparing the ISA, find the def of s[0:3], or chase further to find the instruction that causes the invalid def of s[0:3]. Then find the invalid transformation that produces that instruction by focusing on how each pass transforms it. If the ISA of integrator_intersect_shadow is too different between the bad/good cases, we can force noinline for callees of integrator_intersect_shadow in the hope of narrowing the issue down to its callees. We may also consider other ways to minimize the difference between the bad/good cases, e.g. adding a counter or threshold in the newly added pass to control the number of changes it makes, to pinpoint the minimal change that causes pass/fail.
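For example (a hypothetical snippet; the callee name and signature are made up), forcing a callee out of line in the kernel source could look like:

```cpp
// Hypothetical callee of integrator_intersect_shadow: keeping it out of line
// keeps the caller's ISA comparable between the good and bad builds, helping
// narrow the miscompile down to a specific callee.
__attribute__((noinline))
bool some_shadow_callee(int arg);
```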
Can we temporarily revert this patch while we investigate the regression? Thanks.
Opened issue #96353 to better track this. Let's continue further discussion there.
Yes, revert it to unblock the Blender testing.
Thank you, Sam, for the heads up!
This reverts commit 4b9112e. A separate issue (llvm#96353) describing it has been opened to keep track of it.
This PR is dependent on llvm#93779. Currently, each SGPR spill is lowered into a distinct stack slot in the stack frame after the SGPR allocation phase. This patch therefore uses StackSlotColoring to share stack frame indices, where possible, for SGPR spills occurring in non-interfering regions. StackSlotColoring is introduced immediately after SGPR register allocation, so that any further lowering happens on the optimally allocated stack slots, with flags to preserve certain analysis results for later use by RA of the other register classes.
This reverts commit c2fc7f7, as the dependent patch for the split VGPR regalloc pipeline solved the issue (llvm#96353).
This PR is dependent on #93779.
Currently, each SGPR spill is lowered into a distinct stack slot in the stack frame after the SGPR allocation phase. This patch therefore uses the StackSlotColoring pass to share stack frame indices, where possible, for SGPR spills occurring in non-interfering regions.
StackSlotColoring is introduced immediately after SGPR register allocation, so that any further lowering happens on the optimally allocated stack slots, with flags indicating that certain analysis results should be preserved for later use by RA of the other register classes.
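For reference, the pipeline change amounts to roughly the following in GCNPassConfig::addRegAssignAndRewriteOptimized (paraphrased from the AMDGPUTargetMachine.cpp hunk in this PR; not a standalone listing):

```cpp
addPass(createVirtRegRewriter(false));
// SGPR RA has introduced SGPR spill stack objects by this point; stack slot
// coloring lets non-interfering SGPR spills share frame indices before they
// are lowered by SILowerSGPRSpills (the equivalent of PEI for SGPRs).
addPass(createStackSlotColoring(/*preserveRegAllocNeededAnalysis=*/true));
addPass(&SILowerSGPRSpillsID);
addPass(&SIPreAllocateWWMRegsID);
```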