Skip to content

Commit

Permalink
JIT: Reorder stores to make them amenable to stp optimization (dotnet…
Browse files Browse the repository at this point in the history
…#102133)

This generalizes the indir reordering optimization (that currently only
triggers for loads) to kick in for GT_STOREIND nodes.

The main complication with doing this is the fact that the data node of
the second indirection needs its own reordering with the previous
indirection. The existing logic works by reordering all nodes between
the first and second indirection that are unrelated to the second
indirection's computation to happen after it. Once that is done we know
that there are no uses of the first indirection's result between it and
the second indirection, so after doing the necessary interference checks
we can safely move the previous indirection to happen after the data
node of the second indirection.

Example:
```csharp
class Body { public double x, y, z, vx, vy, vz, mass; }

static void Advance(double dt, Body[] bodies)
{
    foreach (Body b in bodies)
    {
        b.x += dt * b.vx;
        b.y += dt * b.vy;
        b.z += dt * b.vz;
    }
}
```

Diff:
```diff
@@ -1,18 +1,17 @@
-G_M55007_IG04:  ;; offset=0x001C
+G_M55007_IG04:  ;; offset=0x0020
             ldr     x3, [x0, w1, UXTW #3]
             ldp     d16, d17, [x3, #0x08]
             ldp     d18, d19, [x3, #0x20]
             fmul    d18, d0, d18
             fadd    d16, d16, d18
-            str     d16, [x3, #0x08]
-            fmul    d16, d0, d19
-            fadd    d16, d17, d16
-            str     d16, [x3, #0x10]
+            fmul    d18, d0, d19
+            fadd    d17, d17, d18
+            stp     d16, d17, [x3, #0x08]
             ldr     d16, [x3, #0x18]
             ldr     d17, [x3, #0x30]
             fmul    d17, d0, d17
             fadd    d16, d16, d17
             str     d16, [x3, #0x18]
             add     w1, w1, #1
             cmp     w2, w1
             bgt     G_M55007_IG04
```
  • Loading branch information
jakobbotsch authored May 15, 2024
1 parent f0ee416 commit a930ef5
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 41 deletions.
134 changes: 108 additions & 26 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
}

case GT_STOREIND:
LowerStoreIndirCommon(node->AsStoreInd());
break;
return LowerStoreIndirCommon(node->AsStoreInd());

case GT_ADD:
{
Expand Down Expand Up @@ -8895,7 +8894,10 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind)
// Arguments:
// ind - the store indirection node we are lowering.
//
void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
// Return Value:
// Next node to lower.
//
GenTree* Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
{
assert(ind->TypeGet() != TYP_STRUCT);

Expand All @@ -8910,28 +8912,30 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
#endif
TryCreateAddrMode(ind->Addr(), isContainable, ind);

if (!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
if (comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
{
return ind->gtNext;
}

#ifndef TARGET_XARCH
if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
{
const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
{
const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
{
// On platforms with weaker memory model we need to make sure we use a store with the release semantic
// when we publish a potentially mutable object
// See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
// https://github.com/dotnet/runtime/pull/76112#discussion_r980639782
// On platforms with weaker memory model we need to make sure we use a store with the release semantic
// when we publish a potentially mutable object
// See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
// https://github.com/dotnet/runtime/pull/76112#discussion_r980639782

// This can be relaxed to "just make sure to use stlr/memory barrier" if needed
ind->gtFlags |= GTF_IND_VOLATILE;
}
// This can be relaxed to "just make sure to use stlr/memory barrier" if needed
ind->gtFlags |= GTF_IND_VOLATILE;
}
}
#endif

LowerStoreIndirCoalescing(ind);
LowerStoreIndir(ind);
}
LowerStoreIndirCoalescing(ind);
return LowerStoreIndir(ind);
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -9014,7 +9018,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind)
#ifdef TARGET_ARM64
if (comp->opts.OptimizationEnabled() && ind->OperIs(GT_IND))
{
OptimizeForLdp(ind);
OptimizeForLdpStp(ind);
}
#endif

Expand All @@ -9029,7 +9033,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind)
// cases passing the distance check, but 82 out of these 112 extra cases were
// then rejected due to interference. So 16 seems like a good number to balance
// the throughput costs.
const int LDP_REORDERING_MAX_DISTANCE = 16;
const int LDP_STP_REORDERING_MAX_DISTANCE = 16;

//------------------------------------------------------------------------
// OptimizeForLdp: Record information about an indirection, and try to optimize
Expand All @@ -9042,7 +9046,7 @@ const int LDP_REORDERING_MAX_DISTANCE = 16;
// Returns:
// True if the optimization was successful.
//
bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
bool Lowering::OptimizeForLdpStp(GenTreeIndir* ind)
{
if (!ind->TypeIs(TYP_INT, TYP_LONG, TYP_FLOAT, TYP_DOUBLE, TYP_SIMD8, TYP_SIMD16) || ind->IsVolatile())
{
Expand All @@ -9060,7 +9064,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)

// Every indirection takes an expected 2+ nodes, so we only expect at most
// half the reordering distance to be candidates for the optimization.
int maxCount = min(m_blockIndirs.Height(), LDP_REORDERING_MAX_DISTANCE / 2);
int maxCount = min(m_blockIndirs.Height(), LDP_STP_REORDERING_MAX_DISTANCE / 2);
for (int i = 0; i < maxCount; i++)
{
SavedIndir& prev = m_blockIndirs.TopRef(i);
Expand All @@ -9075,11 +9079,22 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
continue;
}

if (prevIndir->gtNext == nullptr)
{
// Deleted by other optimization
continue;
}

if (prevIndir->OperIsStore() != ind->OperIsStore())
{
continue;
}

JITDUMP("[%06u] and [%06u] are indirs off the same base with offsets +%03u and +%03u\n",
Compiler::dspTreeID(ind), Compiler::dspTreeID(prevIndir), (unsigned)offs, (unsigned)prev.Offset);
if (std::abs(offs - prev.Offset) == genTypeSize(ind))
{
JITDUMP(" ..and they are amenable to ldp optimization\n");
JITDUMP(" ..and they are amenable to ldp/stp optimization\n");
if (TryMakeIndirsAdjacent(prevIndir, ind))
{
// Do not let the previous one participate in
Expand Down Expand Up @@ -9115,7 +9130,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir)
{
GenTree* cur = prevIndir;
for (int i = 0; i < LDP_REORDERING_MAX_DISTANCE; i++)
for (int i = 0; i < LDP_STP_REORDERING_MAX_DISTANCE; i++)
{
cur = cur->gtNext;
if (cur == indir)
Expand Down Expand Up @@ -9172,8 +9187,16 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
INDEBUG(dumpWithMarks());
JITDUMP("\n");

if ((prevIndir->gtLIRFlags & LIR::Flags::Mark) != 0)
{
JITDUMP("Previous indir is part of the data flow of current indir\n");
UnmarkTree(indir);
return false;
}

m_scratchSideEffects.Clear();

bool sawData = false;
for (GenTree* cur = prevIndir->gtNext; cur != indir; cur = cur->gtNext)
{
if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
Expand All @@ -9186,6 +9209,11 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
UnmarkTree(indir);
return false;
}

if (indir->OperIsStore())
{
sawData |= cur == indir->Data();
}
}
else
{
Expand All @@ -9197,6 +9225,13 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi

if (m_scratchSideEffects.InterferesWith(comp, indir, true))
{
if (!indir->OperIsLoad())
{
JITDUMP("Have conservative interference with last store. Giving up.\n");
UnmarkTree(indir);
return false;
}

// Try a bit harder, making use of the following facts:
//
// 1. We know the indir is non-faulting, so we do not need to worry
Expand Down Expand Up @@ -9293,8 +9328,39 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
}
}

JITDUMP("Interference checks passed. Moving nodes that are not part of data flow of [%06u]\n\n",
Compiler::dspTreeID(indir));
JITDUMP("Interference checks passed: can move unrelated nodes past second indir.\n");

if (sawData)
{
// If the data node of 'indir' is between 'prevIndir' and 'indir' then
// try to move the previous indir up to happen after the data
// computation. We will be moving all nodes unrelated to the data flow
// past 'indir', so we only need to check interference between
// 'prevIndir' and all nodes that are part of 'indir's dataflow.
m_scratchSideEffects.Clear();
m_scratchSideEffects.AddNode(comp, prevIndir);

for (GenTree* cur = prevIndir->gtNext;; cur = cur->gtNext)
{
if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
{
if (m_scratchSideEffects.InterferesWith(comp, cur, true))
{
JITDUMP("Cannot move prev indir [%06u] up past [%06u] to get it past the data computation\n",
Compiler::dspTreeID(prevIndir), Compiler::dspTreeID(cur));
UnmarkTree(indir);
return false;
}
}

if (cur == indir->Data())
{
break;
}
}
}

JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir));

GenTree* previous = prevIndir;
for (GenTree* node = prevIndir->gtNext;;)
Expand All @@ -9317,6 +9383,22 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
node = next;
}

if (sawData)
{
// For some reason LSRA is not able to reuse a constant if both LIR
// temps are live simultaneously, so skip moving in those cases and
// expect LSRA to reuse the constant instead.
if (indir->Data()->OperIs(GT_CNS_INT, GT_CNS_DBL) && GenTree::Compare(indir->Data(), prevIndir->Data()))
{
JITDUMP("Not moving previous indir since we are expecting constant reuse for the data\n");
}
else
{
BlockRange().Remove(prevIndir);
BlockRange().InsertAfter(indir->Data(), prevIndir);
}
}

JITDUMP("Result:\n\n");
INDEBUG(dumpWithMarks());
JITDUMP("\n");
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,13 +345,13 @@ class Lowering final : public Phase
bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const;

// Per tree node member functions
void LowerStoreIndirCommon(GenTreeStoreInd* ind);
GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind);
GenTree* LowerIndir(GenTreeIndir* ind);
bool OptimizeForLdp(GenTreeIndir* ind);
bool OptimizeForLdpStp(GenTreeIndir* ind);
bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir);
void MarkTree(GenTree* root);
void UnmarkTree(GenTree* root);
void LowerStoreIndir(GenTreeStoreInd* node);
GenTree* LowerStoreIndir(GenTreeStoreInd* node);
void LowerStoreIndirCoalescing(GenTreeIndir* node);
GenTree* LowerAdd(GenTreeOp* node);
GenTree* LowerMul(GenTreeOp* mul);
Expand Down
14 changes: 12 additions & 2 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -491,11 +491,21 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
GenTree* next = node->gtNext;
ContainCheckStoreIndir(node);

#ifdef TARGET_ARM64
if (comp->opts.OptimizationEnabled())
{
OptimizeForLdpStp(node);
}
#endif

return next;
}

//------------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions src/coreclr/jit/lowerloongarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
ContainCheckStoreIndir(node);
return node->gtNext;
}

//------------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions src/coreclr/jit/lowerriscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
ContainCheckStoreIndir(node);
return node->gtNext;
}

//------------------------------------------------------------------------
Expand Down
14 changes: 8 additions & 6 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
// Mark all GT_STOREIND nodes to indicate that it is not known
// whether it represents a RMW memory op.
Expand All @@ -92,7 +92,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
// SSE2 doesn't support RMW form of instructions.
if (LowerRMWMemOp(node))
{
return;
return node->gtNext;
}
}

Expand All @@ -109,23 +109,25 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
if (!node->Data()->IsCnsVec())
{
return;
return node->gtNext;
}

if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64))
{
return;
return node->gtNext;
}

if (node->Data()->IsVectorAllBitsSet() || node->Data()->IsVectorZero())
{
// To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors.
return;
return node->gtNext;
}

TryCompressConstVecData(node);
}
#endif

return node->gtNext;
}

//----------------------------------------------------------------------------------------------
Expand Down

0 comments on commit a930ef5

Please sign in to comment.