[RISCV] Enable load clustering by default #73789
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: Alex Bradbury (asb)

Changes: [RISCV] Enable load clustering by default. Also tweaks the heuristic to cluster if operations are within a cache line of each other. Posting for comment and for people to test on their workloads, and for feedback on ideas for a tweaked heuristic etc. Stacks on top of #73778.

Patch is 662.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73789.diff

45 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 665b7449ddb820a..4ec46f9bde1adf2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1496,12 +1496,18 @@ class TargetInstrInfo : public MCInstrInfo {
/// to TargetPassConfig::createMachineScheduler() to have an effect.
///
/// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
+ /// \p Offset1 and \p Offset2 are the byte offsets for the memory
+ /// operations.
+ /// \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate if the offset is
+ /// scaled by a runtime quantity.
/// \p ClusterSize is the number of operations in the resulting load/store
/// cluster if this hook returns true.
/// \p NumBytes is the number of bytes that will be loaded from all the
/// clustered loads if this hook returns true.
virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const {
llvm_unreachable("target did not implement shouldClusterMemOps()");
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 4add33ba0996af0..cd5fe71ef0c1ad1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1698,11 +1698,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
unsigned Width;
+ bool OffsetIsScalable;
MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
- int64_t Offset, unsigned Width)
+ int64_t Offset, bool OffsetIsScalable, unsigned Width)
: SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
- Width(Width) {}
+ OffsetIsScalable(OffsetIsScalable), Width(Width) {}
static bool Compare(const MachineOperand *const &A,
const MachineOperand *const &B) {
@@ -1831,8 +1832,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
}
- if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
- CurrentClusterBytes))
+ if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpa.Offset,
+ MemOpa.OffsetIsScalable, MemOpb.BaseOps,
+ MemOpb.Offset, MemOpb.OffsetIsScalable,
+ ClusterLength, CurrentClusterBytes))
continue;
SUnit *SUa = MemOpa.SU;
@@ -1899,7 +1902,8 @@ void BaseMemOpClusterMutation::collectMemOpRecords(
unsigned Width;
if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
OffsetIsScalable, Width, TRI)) {
- MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+ MemOpRecords.push_back(
+ MemOpInfo(&SU, BaseOps, Offset, OffsetIsScalable, Width));
LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
<< Offset << ", OffsetIsScalable: " << OffsetIsScalable
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e97f17e3f49c587..6b49e17528ada74 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4230,8 +4230,9 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
- ArrayRef<const MachineOperand *> BaseOps1,
- ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+ ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+ bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
unsigned NumBytes) const {
assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
const MachineOperand &BaseOp1 = *BaseOps1.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index cc588cdad6b8e5a..65e5fb49536da24 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -179,7 +179,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
int64_t &MinOffset, int64_t &MaxOffset);
bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 50f8ad4433c6d5c..442ae4dd7b34fe1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -232,7 +232,10 @@ class SIInsertHardClauses : public MachineFunctionPass {
// scheduler it limits the size of the cluster to avoid increasing
// register pressure too much, but this pass runs after register
// allocation so there is no need for that kind of limit.
- !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+ // We also lie about the Offset and OffsetIsScalable parameters,
+ // as they aren't used in the SIInstrInfo implementation.
+ !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false,
+ 2, 2)))) {
// Finish the current clause.
Changed |= emitClause(CI, SII);
CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d6912544f..0a06fa88b6b1025 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -541,7 +541,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
}
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index eba817756e9c58e..6da4f74dfe5f3ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -234,7 +234,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const TargetRegisterInfo *TRI) const final;
bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 6784049348b1638..0de795d9d5e812e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2878,8 +2878,9 @@ static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
}
bool PPCInstrInfo::shouldClusterMemOps(
- ArrayRef<const MachineOperand *> BaseOps1,
- ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+ ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+ bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
unsigned NumBytes) const {
assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 31e9859a41739a1..0c9ad607418ecc7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -530,7 +530,9 @@ class PPCInstrInfo : public PPCGenInstrInfo {
/// Returns true if the two given memory operations should be scheduled
/// adjacent.
bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2918e5654db4f9f..cf399f37d255c29 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2266,8 +2266,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
}
bool RISCVInstrInfo::shouldClusterMemOps(
- ArrayRef<const MachineOperand *> BaseOps1,
- ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+ ArrayRef<const MachineOperand *> BaseOps1, int64_t Offset1,
+ bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
@@ -2281,9 +2282,9 @@ bool RISCVInstrInfo::shouldClusterMemOps(
return false;
}
- // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
- // indicate they likely share a cache line.
- return ClusterSize <= 4;
+ // A cache line is typically 64 bytes, so cluster if the memory ops are on
+ // the same or a neighbouring cache line.
+ return std::abs(Offset1 - Offset2) < 64;
}
// Set BaseReg (the base register operand), Offset (the byte offset being
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0954286a419bdd5..7e1d3f31180650d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -158,7 +158,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
const TargetRegisterInfo *TRI) const override;
bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
+ int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b6c194f03f54209..0954fbb8314c6fa 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -95,11 +95,6 @@ static cl::opt<bool>
cl::desc("Enable Split RegisterAlloc for RVV"),
cl::init(false));
-static cl::opt<bool> EnableMISchedLoadClustering(
- "riscv-misched-load-clustering", cl::Hidden,
- cl::desc("Enable load clustering in the machine scheduler"),
- cl::init(false));
-
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -350,15 +345,10 @@ class RISCVPassConfig : public TargetPassConfig {
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
- ScheduleDAGMILive *DAG = nullptr;
- if (EnableMISchedLoadClustering) {
- DAG = createGenericSchedLive(C);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- }
- if (ST.hasMacroFusion()) {
- DAG = DAG ? DAG : createGenericSchedLive(C);
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.hasMacroFusion())
DAG->addMutation(createRISCVMacroFusionDAGMutation());
- }
return DAG;
}
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 274f1cef49aa955..3695a8a7f60862f 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -200,25 +200,25 @@ define i128 @add_wide_operand(i128 %a) nounwind {
;
; RV32C-LABEL: add_wide_operand:
; RV32C: # %bb.0:
-; RV32C-NEXT: lw a6, 4(a1)
+; RV32C-NEXT: lw a6, 8(a1)
; RV32C-NEXT: c.lw a3, 12(a1)
-; RV32C-NEXT: c.lw a4, 0(a1)
-; RV32C-NEXT: c.lw a1, 8(a1)
+; RV32C-NEXT: c.lw a2, 4(a1)
+; RV32C-NEXT: c.lw a1, 0(a1)
; RV32C-NEXT: c.lui a5, 16
; RV32C-NEXT: c.add a3, a5
; RV32C-NEXT: c.slli a3, 3
-; RV32C-NEXT: srli a5, a1, 29
-; RV32C-NEXT: c.or a3, a5
-; RV32C-NEXT: srli a5, a4, 29
-; RV32C-NEXT: slli a2, a6, 3
-; RV32C-NEXT: c.or a2, a5
; RV32C-NEXT: srli a5, a6, 29
+; RV32C-NEXT: c.or a3, a5
+; RV32C-NEXT: srli a5, a1, 29
+; RV32C-NEXT: slli a4, a2, 3
+; RV32C-NEXT: c.or a4, a5
+; RV32C-NEXT: c.srli a2, 29
+; RV32C-NEXT: c.slli a6, 3
+; RV32C-NEXT: or a2, a6, a2
; RV32C-NEXT: c.slli a1, 3
-; RV32C-NEXT: c.or a1, a5
-; RV32C-NEXT: c.slli a4, 3
-; RV32C-NEXT: c.sw a4, 0(a0)
-; RV32C-NEXT: c.sw a1, 8(a0)
-; RV32C-NEXT: c.sw a2, 4(a0)
+; RV32C-NEXT: c.sw a1, 0(a0)
+; RV32C-NEXT: c.sw a2, 8(a0)
+; RV32C-NEXT: c.sw a4, 4(a0)
; RV32C-NEXT: c.sw a3, 12(a0)
; RV32C-NEXT: c.jr ra
;
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 7111316931f19b7..7aeaaab68a208cb 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -25,69 +25,69 @@ define void @callee() nounwind {
; ILP32: # %bb.0:
; ILP32-NEXT: lui a0, %hi(var)
; ILP32-NEXT: flw fa5, %lo(var)(a0)
-; ILP32-NEXT: flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT: flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT: flw fa2, %lo(var+12)(a0)
; ILP32-NEXT: addi a1, a0, %lo(var)
-; ILP32-NEXT: flw fa1, 16(a1)
-; ILP32-NEXT: flw fa0, 20(a1)
-; ILP32-NEXT: flw ft0, 24(a1)
-; ILP32-NEXT: flw ft1, 28(a1)
-; ILP32-NEXT: flw ft2, 32(a1)
-; ILP32-NEXT: flw ft3, 36(a1)
-; ILP32-NEXT: flw ft4, 40(a1)
-; ILP32-NEXT: flw ft5, 44(a1)
-; ILP32-NEXT: flw ft6, 48(a1)
-; ILP32-NEXT: flw ft7, 52(a1)
-; ILP32-NEXT: flw fa6, 56(a1)
-; ILP32-NEXT: flw fa7, 60(a1)
-; ILP32-NEXT: flw ft8, 64(a1)
-; ILP32-NEXT: flw ft9, 68(a1)
-; ILP32-NEXT: flw ft10, 72(a1)
-; ILP32-NEXT: flw ft11, 76(a1)
-; ILP32-NEXT: flw fs0, 80(a1)
-; ILP32-NEXT: flw fs1, 84(a1)
-; ILP32-NEXT: flw fs2, 88(a1)
-; ILP32-NEXT: flw fs3, 92(a1)
-; ILP32-NEXT: flw fs4, 96(a1)
-; ILP32-NEXT: flw fs5, 100(a1)
-; ILP32-NEXT: flw fs6, 104(a1)
-; ILP32-NEXT: flw fs7, 108(a1)
+; ILP32-NEXT: flw fa4, 16(a1)
+; ILP32-NEXT: flw fa3, 20(a1)
+; ILP32-NEXT: flw fa2, 24(a1)
+; ILP32-NEXT: flw fa1, 28(a1)
+; ILP32-NEXT: flw fa0, 32(a1)
+; ILP32-NEXT: flw ft0, 36(a1)
+; ILP32-NEXT: flw ft1, 40(a1)
+; ILP32-NEXT: flw ft2, 44(a1)
+; ILP32-NEXT: flw ft3, 48(a1)
+; ILP32-NEXT: flw ft4, 52(a1)
+; ILP32-NEXT: flw ft5, 56(a1)
+; ILP32-NEXT: flw ft6, 60(a1)
+; ILP32-NEXT: flw ft7, 64(a1)
+; ILP32-NEXT: flw fa6, 68(a1)
+; ILP32-NEXT: flw fa7, 72(a1)
+; ILP32-NEXT: flw ft8, 76(a1)
+; ILP32-NEXT: flw ft9, 80(a1)
+; ILP32-NEXT: flw ft10, 84(a1)
+; ILP32-NEXT: flw ft11, 88(a1)
+; ILP32-NEXT: flw fs0, 92(a1)
+; ILP32-NEXT: flw fs1, 96(a1)
+; ILP32-NEXT: flw fs2, 100(a1)
+; ILP32-NEXT: flw fs3, 104(a1)
+; ILP32-NEXT: flw fs4, 108(a1)
+; ILP32-NEXT: flw fs5, 112(a1)
+; ILP32-NEXT: flw fs6, 116(a1)
+; ILP32-NEXT: flw fs7, 120(a1)
; ILP32-NEXT: flw fs8, 124(a1)
-; ILP32-NEXT: flw fs9, 120(a1)
-; ILP32-NEXT: flw fs10, 116(a1)
-; ILP32-NEXT: flw fs11, 112(a1)
+; ILP32-NEXT: flw fs9, %lo(var+4)(a0)
+; ILP32-NEXT: flw fs10, %lo(var+8)(a0)
+; ILP32-NEXT: flw fs11, %lo(var+12)(a0)
; ILP32-NEXT: fsw fs8, 124(a1)
-; ILP32-NEXT: fsw fs9, 120(a1)
-; ILP32-NEXT: fsw fs10, 116(a1)
-; ILP32-NEXT: fsw fs11, 112(a1)
-; ILP32-NEXT: fsw fs7, 108(a1)
-; ILP32-NEXT: fsw fs6, 104(a1)
-; ILP32-NEXT: fsw fs5, 100(a1)
-; ILP32-NEXT: fsw fs4, 96(a1)
-; ILP32-NEXT: fsw fs3, 92(a1)
-; ILP32-NEXT: fsw fs2, 88(a1)
-; ILP32-NEXT: fsw fs1, 84(a1)
-; ILP32-NEXT: fsw fs0, 80(a1)
-; ILP32-NEXT: fsw ft11, 76(a1)
-; ILP32-NEXT: fsw ft10, 72(a1)
-; ILP32-NEXT: fsw ft9, 68(a1)
-; ILP32-NEXT: fsw ft8, 64(a1)
-; ILP32-NEXT: fsw fa7, 60(a1)
-; ILP32-NEXT: fsw fa6, 56(a1)
-; ILP32-NEXT: fsw ft7, 52(a1)
-; ILP32-NEXT: fsw ft6, 48(a1)
-; ILP32-NEXT: fsw ft5, 44(a1)
-; ILP32-NEXT: fsw ft4, 40(a1)
-; ILP32-NEXT: fsw ft3, 36(a1)
-; ILP32-NEXT: fsw ft2, 32(a1)
-; ILP32-NEXT: fsw ft1, 28(a1)
-; ILP32-NEXT: fsw ft0, 24(a1)
-; ILP32-NEXT: fsw fa0, 20(a1)
-; ILP32-NEXT: fsw fa1, 16(a1)
-; ILP32-NEXT: fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT: fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT: fsw fa4, %lo(var+4)(a0)
+; ILP32-NEXT: fsw fs7, 120(a1)
+; ILP32-NEXT: fsw fs6, 116(a1)
+; ILP32-NEXT: fsw fs5, 112(a1)
+; ILP32-NEXT: fsw fs4, 108(a1)
+; ILP32-NEXT: fsw fs3, 104(a1)
+; ILP32-NEXT: fsw fs2, 100(a1)
+; ILP32-NEXT: fsw fs1, 96(a1)
+; ILP32-NEXT: fsw fs0, 92(a1)
+; ILP32-NEXT: fsw ft11, 88(a1)
+; ILP32-NEXT: fsw ft10, 84(a1)
+; ILP32-NEXT: fsw ft9, 80(a1)
+; ILP32-NEXT: fsw ft8, 76(a1)
+; ILP32-NEXT: fsw fa7, 72(a1)
+; ILP32-NEXT: fsw fa6, 68(a1)
+; ILP32-NEXT: fsw ft7, 64(a1)
+; ILP32-NEXT: fsw ft6, 60(a1)
+; ILP32-NEXT: fsw ft5, 56(a1)
+; ILP32-NEXT: fsw ft4, 52(a1)
+; ILP32-NEXT: fsw ft3, 48(a1)
+; ILP32-NEXT: fsw ft2, 44(a1)
+; ILP32-NEXT: fsw ft1, 40(a1)
+; ILP32-NEXT: fsw ft0, 36(a1)
+; ILP32-NEXT: fsw fa0, 32(a1)
+; ILP32-NEXT: fsw fa1, 28(a1)
+; ILP32-NEXT: fsw fa2, 24(a1)
+; ILP32-NEXT: fsw fa3, 20(a1)
+; ILP32-NEXT: fsw fa4, 16(a1)
+; ILP32-NEXT: fsw fs11, %lo(var+12)(a0)
+; ILP32-NEXT: fsw fs10, %lo(var+8)(a0)
+; ILP32-NEXT: fsw fs9, %lo(var+4)(a0)
; ILP32-NEXT: fsw fa5, %lo(var)(a0)
; ILP32-NEXT: ret
;
@@ -95,69 +95,69 @@ define void @callee() nounwind {
; LP64: # %bb.0:
; LP64-NEXT: lui a0, %hi(var)
; LP64-NEXT: flw fa5, %lo(var)(a0)
-; LP64-NEXT: flw fa4, %lo(var+4)(a0)
-; LP64-NEXT: flw fa3, %lo(var+8)(a0)
-; LP64-NEXT: flw fa2, %lo(var+12)(a0)
; LP64-NEXT: addi a1, a0, %lo(var)
-; LP64-NEXT: flw fa1, 16(a1)
-; LP64-NEXT: flw fa0, 20(a1)
-; LP64-NEXT: flw ft0, 24(a1)
-; LP64-NEXT: flw ft1, 28(a1)
-; LP64-NEXT: flw ft2, 32(a1)
-; LP64-NEXT: flw ft3, 36(a1)
-; LP64-NEXT: flw ft4, 40(a1)
-; LP64-NEXT: flw ft5, 44(a1)
-; LP64-NEXT: flw ft6, 48(a1)
-; LP64-NEXT: flw ft7, 52(a1)
-; LP64-NEXT: flw fa6, 56(a1)
-; LP64-NEXT: flw fa7, 60(a1)
-; LP64-NEXT: flw ft8, 64(a1)
-; LP64-NEXT: flw ft9, 68(a1)
-; LP64-NEXT: flw ft10, 72(a1)
-; LP64-NEXT: flw ft11, 76(a1)
-; LP64-NEXT: flw fs0, 80(a1)
-; LP64-NEXT: flw fs1, 84(a1)
-; LP64-NEXT: flw fs2, 88(a1)
-; LP64-NEXT: flw fs3, 92(a1)
-; LP64-NEXT: flw fs4, 96(a1)
-; LP64-NEXT: flw fs5, 100(a1)
-; LP64-NEXT: ...
[truncated]
Force-pushed from 7a50880 to ceb5e02.
Looking at the test diffs, two observations:
- This appears to be increasing register pressure in some cases. I can't find any case where it causes a spill it didn't before, but we do have cases where more registers are live at the same time.
- Clustering doesn't appear to be sorting by offset, and instead appears to be a somewhat arbitrary reordering of already adjacent loads. (When they happen to already be adjacent.)
Does this work if we try doing this post RA? That would cut down on the potential register pressure concern. To be clear, this is just a question, I could easily be convinced that doing this pre-RA is the right answer.
Your heuristic looks reasonable to me. I don't have data one way or another to indicate a preference for it, or anything else.
// TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
// indicate they likely share a cache line.
return ClusterSize <= 4;
// A cache line is typically 64 bytes, so cluster if the memory ops are on
We can get the cache line size via `TargetTransformInfo::getCacheLineSize` or `MCSubtargetInfo::getCacheLineSize` (for the default implementation, `TargetTransformInfo::getCacheLineSize` just calls `MCSubtargetInfo::getCacheLineSize`). If the return value is not 0, we can use it as a more exact value. For RISC-V, the cache line size info can be specified when defining the processor.
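As a minimal illustration of that suggestion, the standalone helper below (not LLVM API) combines the patch's distance check with a queried cache line size, treating a return value of 0 as "unknown" and falling back to the 64-byte assumption:

```cpp
#include <cstdint>
#include <cstdlib>

// Standalone sketch, not LLVM API: CacheLineSize is assumed to come from a
// query such as MCSubtargetInfo::getCacheLineSize(), with 0 meaning the
// processor definition did not specify a value.
static bool offsetsLikelyShareCacheLine(int64_t Offset1, int64_t Offset2,
                                        unsigned CacheLineSize) {
  if (CacheLineSize == 0)
    CacheLineSize = 64; // Fall back to the 64-byte assumption in this patch.
  return std::abs(Offset1 - Offset2) < static_cast<int64_t>(CacheLineSize);
}
```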
…/ getMemOperandWithOffsetWidth (#73802) I noted AArch64 happily accepts a FrameIndex operand as well as a register. This doesn't cause any changes outside of my C++ unit test for the current state of in-tree, but this will cause additional test changes if #73789 is rebased on top of it. Note that the returned Offset doesn't seem at all as meaningful if you have a FrameIndex base, though the approach taken here follows AArch64 (see D54847). This change won't harm the approach taken in shouldClusterMemOps because memOpsHaveSameBasePtr will only return true if the FrameIndex operand is the same for both operations.
✅ With the latest revision this PR passed the C/C++ code formatter.
Digging into the ordering behaviour, it is indeed a bit confusing. BaseMemOpClusterMutation::apply collects the MemOpInfo records (calling getMemOperandsWithOffsetWidth for this), puts them into groups (commonly one big group, unless over a certain size threshold), and then sorts the array of MemOpInfo into ascending order based on base+offset. clusterNeighboringMemOps then performs clustering using that order, except that there is logic added in https://reviews.llvm.org/D72706 that avoids reordering, with the argument that reordering "leads to test case churn" and "goes against the machine scheduler's 'do no harm' philosophy". I'm not sure I follow how the reordering could be harmful, and it does seem unfortunate: the SelectionDAG scheduler does appear to do this reordering, so there's even more churn than you'd expect when trying to measure the incremental benefit of also enabling SelectionDAG load clustering. I've added a new parameter to the pass that controls this behaviour and enabled the reordering for RISC-V. If people are happy with this direction, I'd intend to split this out as a patch that this one depends on.

Trying to enable this for the post-RA scheduler, it seems to fail to make any changes (i.e. it doesn't work), though it's possible I made an error.

Thanks @wangpc-pp for the suggestion re querying the cache line size. I've not added that in this iteration, but will do so in the next revision.
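For context, a hypothetical sketch of how such an opt-in might be wired into the RISC-V scheduler hook shown in the diff above; the extra boolean parameter and its name are assumptions for illustration, not necessarily what the split-out patch adds:

```cpp
// Sketch only: the /*ReorderWhileClustering=*/ parameter name is hypothetical.
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Ask the load-clustering mutation to also reorder clustered memory ops
  // into ascending base+offset order, rather than only grouping them.
  DAG->addMutation(createLoadClusterDAGMutation(
      DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
  return DAG;
}
```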
Force-pushed from 06c5c58 to c9ca21b.
For reference, #74600 shows the additional diff if you then enable load clustering in the SelectionDAG scheduler as well. Lots of cases are just noise (e.g. a different register choice), but there are a few I want to look at more closely, as ideally we'd get by with only enabling load clustering in the MachineScheduler, I think.
/// \p ClusterSize is the number of operations in the resulting load/store
/// cluster if this hook returns true.
/// \p NumBytes is the number of bytes that will be loaded from all the
/// clustered loads if this hook returns true.
virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                 int64_t Offset1, bool OffsetIsScalable1,
Do we need a width operand too?
You could imagine trying to use that, but our current heuristic doesn't make use of that information so I don't think it's necessary to expand the parameter list further.
Split out from llvm#73789. Clusters if the operations are within a cache line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does something similar, but does `((Offset2 - Offset1) / 8 > 64)`. I'm not sure if that's intentionally set to 512 bytes or if the division is in error. Adopts the suggestion from @wangpc-pp to query the cache line size and use it if available.
Force-pushed from c9ca21b to a387218.
Force-pushed from a387218 to 1a46b84.
Split out from #73789, so as to leave that PR just for flipping load clustering to on by default. Clusters if the operations are within a cache line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does something similar, but does `((Offset2 - Offset1) / 8 > 64)`. I'm not sure if that's intentionally set to 512 bytes or if the division is in error. Adopts the suggestion from @wangpc-pp to query the cache line size and use it if available. We also cap the maximum cluster size to cap the potential register pressure impact (which may lead to additional spills).
LGTM w/comment addressed. The comment is basically a rework of the patch, but the resulting patch is near trivial so it felt reasonable to do a conditional LGTM here.
static cl::opt<bool> EnableMISchedLoadClustering(
    "riscv-misched-load-clustering", cl::Hidden,
    cl::desc("Enable load clustering in the machine scheduler"),
    cl::init(false));
In this patch, please just flip the bool default here. If you want to remove the option entirely, do that in a separate change in a couple weeks.
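Concretely, that would amount to flipping the cl::init value on the option quoted above, e.g. (a sketch of the requested change, not the final commit):

```cpp
static cl::opt<bool> EnableMISchedLoadClustering(
    "riscv-misched-load-clustering", cl::Hidden,
    cl::desc("Enable load clustering in the machine scheduler"),
    cl::init(true)); // previously cl::init(false)
```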
…ustering (#75338) Reordering based on the sort order of the MemOpInfo array was disabled in <https://reviews.llvm.org/D72706>. However, it's not clear this is desirable for all targets. It also makes it more difficult to compare the incremental benefit of enabling load clustering in the selectiondag scheduler as well as the machinescheduler, as the sdag scheduler does seem to allow this reordering. This patch adds a parameter that can control the behaviour on a per-target basis. Split out from #73789.
Reverse ping!
Thanks for the ping; I've been intending to return to this, so it's good to hear there's interest. I'll rebase in the coming days and try to move this forwards.
Force-pushed from 1a46b84 to 446abf8.
I've rebased and repushed (again, actually; I had done it before but forgot to push). I've adopted Philip's suggestion of just flipping the default for the option in this patch.
; RV32I-NEXT: mv s6, a0
; RV32I-NEXT: li a2, 1
; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: mv a1, a4
Extra move?
; RV32IM-NEXT: mv s6, a0
; RV32IM-NEXT: li a2, 1
; RV32IM-NEXT: mv a0, a3
; RV32IM-NEXT: mv a1, a4
And here.
; ZVFHMIN-ZFHIN-RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
; ZVFHMIN-ZFHIN-RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
; ZVFHMIN-ZFHIN-RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
; ZVFHMIN-ZFHIN-RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
Is this because the register pressure increases?
Thanks! It seems there are new conflicts (we are actively making changes).
Easy enough for me to regenerate and fix the conflicts if we're happy to merge. As you point out, there are some minor regressions in a few tests, but I think what I need from reviewers is an indication that they agree this is the right direction; if we find any regressions that are meaningful in practice, we can follow up with fixes in tree. I can do my own testing, but given your ping that you would like to see this enabled, I'm guessing you've observed it's at least no worse than the status quo on benchmarks, which would be supportive of moving ahead?
LGTM! Thanks a lot!
We have already enabled it internally, and we do see some improvements on LMBench.
Please land this.
This just flips the default for the option. A later patch may remove it altogether.
Force-pushed from 446abf8 to 639c1f2.
We believe this is neutral or slightly better in the majority of cases.
After llvm#73789 enabled load clustering, do the same for store clustering.
Builds on #73789, enabling store clustering by default using the same heuristic.
Stacks on top of #75338 and #75341
Posting for comment and for people to test on their workloads, and for feedback on ideas for a tweaked heuristic etc.