Memory containment for FIELD_LIST operands on x86
Enable broader containment for FIELD_LIST operands on x86.

The primary benefits come from containing loads that long
decomposition leaves behind (e.g. LCL_FLDs):

```diff
-       mov      ecx, dword ptr [ebp+08H]
-       mov      edx, dword ptr [ebp+0CH]
-       push     edx
-       push     ecx
+       push     dword ptr [ebp+0CH]
+       push     dword ptr [ebp+08H]
```
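
The shape of code that produces this pattern is any call that forwards a stack-passed 64-bit argument; sketched below in C-style source for illustration only (the actual input to the JIT is IL, and the names are hypothetical):

```cpp
#include <cstdio>

// Illustration only: a stack-passed 64-bit argument on x86. Long decomposition
// splits 'value' into two TYP_INT halves -- a FIELD_LIST of LCL_FLDs at offsets
// 0 and 4 -- and with this change both halves can be pushed straight from
// [ebp+08H]/[ebp+0CH], as in the diff above.
void Consume(long long value)
{
    std::printf("%lld\n", value);
}

void Forward(long long value)
{
    Consume(value); // the call whose outgoing argument benefits
}

int main()
{
    Forward(0x1122334455667788LL);
    return 0;
}
```
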
SingleAccretion committed Mar 29, 2022
1 parent 726892f commit 4ea30c1
Showing 5 changed files with 72 additions and 111 deletions.
84 changes: 32 additions & 52 deletions src/coreclr/jit/codegenxarch.cpp
@@ -7661,7 +7661,6 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
break;

case GenTreePutArgStk::Kind::Push:
case GenTreePutArgStk::Kind::PushAllSlots:
assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() ||
(argSize < XMM_REGSIZE_BYTES));
break;
@@ -7775,8 +7774,6 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
// The GC encoder requires that the stack remain 4-byte aligned at all times. Round the adjustment up
// to the next multiple of 4. If we are going to generate a `push` instruction, the adjustment must
// not require rounding.
// NOTE: if the field is of GC type, we must use a push instruction, since the emitter is not otherwise
// able to detect stores into the outgoing argument area of the stack on x86.
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
int adjustment = roundUp(currentOffset - fieldOffset, 4);
if (fieldIsSlot && !varTypeIsSIMD(fieldType))
@@ -7792,6 +7789,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
AddStackLevel(pushSize);
adjustment -= pushSize;
}

m_pushStkArg = true;
}
else
@@ -7829,63 +7827,43 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
}
}

if (argReg == REG_NA)
bool canStoreWithPush = fieldIsSlot;
bool canLoadWithPush = varTypeIsI(fieldNode) || genIsValidIntReg(argReg);

if (canStoreWithPush && canLoadWithPush)
{
if (m_pushStkArg)
assert(m_pushStkArg);
assert(genTypeSize(fieldNode) == TARGET_POINTER_SIZE);
inst_TT(INS_push, emitActualTypeSize(fieldNode), fieldNode);

currentOffset -= TARGET_POINTER_SIZE;
AddStackLevel(TARGET_POINTER_SIZE);
}
else
{
// If the field is of GC type, we must use a push instruction, since the emitter is not
// otherwise able to detect stores into the outgoing argument area of the stack on x86.
assert(!varTypeIsGC(fieldNode));

// First, if needed, load the field into the temporary register.
if (argReg == REG_NA)
{
if (fieldNode->isUsedFromSpillTemp())
assert(varTypeIsIntegralOrI(fieldNode) && genIsValidIntReg(intTmpReg));

if (fieldNode->isContainedIntOrIImmed())
{
assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
assert(fieldNode->IsRegOptional());
TempDsc* tmp = getSpillTempDsc(fieldNode);
GetEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
regSet.tmpRlsTemp(tmp);
genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
}
else
{
assert(varTypeIsIntegralOrI(fieldNode));
switch (fieldNode->OperGet())
{
case GT_LCL_VAR:
inst_TT(INS_push, emitActualTypeSize(fieldNode), fieldNode);
break;
case GT_CNS_INT:
if (fieldNode->IsIconHandle())
{
inst_IV_handle(INS_push, fieldNode->AsIntCon()->gtIconVal);
}
else
{
inst_IV(INS_push, fieldNode->AsIntCon()->gtIconVal);
}
break;
default:
unreached();
}
// TODO-XArch-CQ: using "ins_Load" here is conservative, as it will always
// extend, which we can avoid if the field type is smaller than the node type.
inst_RV_TT(ins_Load(fieldNode->TypeGet()), emitTypeSize(fieldNode), intTmpReg, fieldNode);
}
currentOffset -= TARGET_POINTER_SIZE;
AddStackLevel(TARGET_POINTER_SIZE);
}
else
{
// The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
assert(varTypeIsIntegralOrI(fieldNode));
switch (fieldNode->OperGet())
{
case GT_LCL_VAR:
inst_RV_TT(INS_mov, emitTypeSize(fieldNode), intTmpReg, fieldNode);
break;
case GT_CNS_INT:
genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
break;
default:
unreached();
}
genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);

argReg = intTmpReg;
}
}
else
{

#if defined(FEATURE_SIMD)
if (fieldType == TYP_SIMD12)
{
@@ -7897,6 +7875,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
{
genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
}

if (m_pushStkArg)
{
// We always push a slot-rounded size
@@ -7906,6 +7885,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)

prevFieldOffset = fieldOffset;
}

if (currentOffset != 0)
{
// We don't expect padding at the beginning of a struct, but it could happen with explicit layout.
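
The per-field decision made by the updated `genPutArgStkFieldList` boils down to two properties; a minimal standalone model (simplified sketch — the real code also handles SIMD fields, GC types, and spill temps) is:

```cpp
#include <cstdio>

// Simplified model of the strategy chosen for each FIELD_LIST field.
enum class FieldStrategy
{
    PushDirect,    // "push [mem]" / "push reg" / "push imm32"; no temp register
    LoadThenStore, // load into intTmpReg first, then mov/push into the arg area
};

// canStoreWithPush: the field starts a fresh 4-byte slot, so a push writes
//                   exactly the bytes that belong to it.
// canLoadWithPush:  the source is pointer-sized (or already in an integer
//                   register), so a 4-byte push operand reads the right width.
FieldStrategy ChooseStrategy(bool canStoreWithPush, bool canLoadWithPush)
{
    return (canStoreWithPush && canLoadWithPush) ? FieldStrategy::PushDirect
                                                 : FieldStrategy::LoadThenStore;
}

int main()
{
    std::printf("%d\n", static_cast<int>(ChooseStrategy(true, true)));  // 0: PushDirect
    std::printf("%d\n", static_cast<int>(ChooseStrategy(true, false))); // 1: LoadThenStore
    return 0;
}
```
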
3 changes: 0 additions & 3 deletions src/coreclr/jit/gentree.cpp
@@ -10940,9 +10940,6 @@ void Compiler::gtDispTree(GenTree* tree,
case GenTreePutArgStk::Kind::Push:
printf(" (Push)");
break;
case GenTreePutArgStk::Kind::PushAllSlots:
printf(" (PushAllSlots)");
break;
default:
unreached();
}
4 changes: 2 additions & 2 deletions src/coreclr/jit/gentree.h
@@ -6913,7 +6913,7 @@ struct GenTreePutArgStk : public GenTreeUnOp
// block node.

enum class Kind : __int8{
Invalid, RepInstr, PartialRepInstr, Unroll, Push, PushAllSlots,
Invalid, RepInstr, PartialRepInstr, Unroll, Push,
};
Kind gtPutArgStkKind;
#endif
@@ -7017,7 +7017,7 @@ struct GenTreePutArgStk : public GenTreeUnOp

bool isPushKind() const
{
return (gtPutArgStkKind == Kind::Push) || (gtPutArgStkKind == Kind::PushAllSlots);
return gtPutArgStkKind == Kind::Push;
}
#else // !FEATURE_PUT_STRUCT_ARG_STK
unsigned GetStackByteSize() const;
62 changes: 22 additions & 40 deletions src/coreclr/jit/lowerxarch.cpp
@@ -467,66 +467,55 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
if (src->OperIs(GT_FIELD_LIST))
{
#ifdef TARGET_X86
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid;

GenTreeFieldList* fieldList = src->AsFieldList();

// The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
// of uses is visible to LSRA.
assert(fieldList->Uses().IsSorted());
fieldList->Uses().Reverse();

// Now that the fields have been sorted, determine the kind of code we will generate.
bool allFieldsAreSlots = true;
unsigned prevOffset = putArgStk->GetStackByteSize();
// Containment checks.
for (GenTreeFieldList::Use& use : fieldList->Uses())
{
GenTree* const fieldNode = use.GetNode();
const unsigned fieldOffset = use.GetOffset();

GenTree* const fieldNode = use.GetNode();
const var_types fieldType = use.GetType();
assert(!fieldNode->TypeIs(TYP_LONG));

// We can treat as a slot any field that is stored at a slot boundary, where the previous
// field is not in the same slot. (Note that we store the fields in reverse order.)
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
if (!fieldIsSlot)
{
allFieldsAreSlots = false;
}

// For x86 we must mark all integral fields as contained or reg-optional, and handle them
// accordingly in code generation, since we may have up to 8 fields, which cannot all be in
// registers to be consumed atomically by the call.
if (varTypeIsIntegralOrI(fieldNode))
{
if (fieldNode->OperGet() == GT_LCL_VAR)
// If we are loading from an in-memory local, we would like to use "push", but this
// is only legal if we can safely load all 4 bytes. Retype the local node here to
// TYP_INT for such legal cases to make downstream (LSRA & codegen) logic simpler.
// Retyping is ok because we model this node as STORE<field type>(LOAD<node type>).
// If the field came from promotion, we allow the padding to remain undefined, if
// from decomposition, the field type will be INT (naturally blocking the retyping).
if (varTypeIsSmall(fieldNode) && (genTypeSize(fieldType) <= genTypeSize(fieldNode)) &&
fieldNode->OperIsLocalRead())
{
const LclVarDsc* varDsc = comp->lvaGetDesc(fieldNode->AsLclVarCommon());
if (!varDsc->lvDoNotEnregister)
{
fieldNode->SetRegOptional();
}
else
{
MakeSrcContained(putArgStk, fieldNode);
}
fieldNode->ChangeType(TYP_INT);
}

if (IsContainableImmed(putArgStk, fieldNode))
{
MakeSrcContained(putArgStk, fieldNode);
}
else if (fieldNode->IsIntCnsFitsInI32())
else if (IsContainableMemoryOp(fieldNode) && IsSafeToContainMem(putArgStk, fieldNode))
{
MakeSrcContained(putArgStk, fieldNode);
}
else
{
// For the case where we cannot directly push the value, if we run out of registers,
// it would be better to defer computation until we are pushing the arguments rather
// than spilling, but this situation is not all that common, as most cases of promoted
// structs do not have a large number of fields, and of those most are lclVars or
// copy-propagated constants.
// than spilling, but this situation is not all that common, as most cases of FIELD_LIST
// are promoted structs, which do not have a large number of fields, and of those
// most are lclVars or copy-propagated constants.
fieldNode->SetRegOptional();
}
}

prevOffset = fieldOffset;
}

// Set the copy kind.
@@ -535,14 +524,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
// this tuning should probably be undertaken as a whole.
// Also, if there are floating point fields, it may be better to use the "Unroll" mode
// of copying the struct as a whole, if the fields are not register candidates.
if (allFieldsAreSlots)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
}
else
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
#endif // TARGET_X86
return;
}
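
The retyping of small in-memory locals to TYP_INT (see the comment in the hunk above) relies on the node being modeled as STORE&lt;field type&gt;(LOAD&lt;node type&gt;): widening the load only picks up padding bytes the callee never reads. A standalone sketch of that argument, with a hypothetical struct layout and little-endian x86 assumed:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical promoted struct: 'a' is the small field being passed; 'pad'
// stands for the rest of its 4-byte slot, whose contents the callee never reads.
struct Promoted
{
    uint16_t a;
    uint16_t pad;
};

int main()
{
    Promoted p{0x1234, 0xFFFF}; // arbitrary padding contents

    // The widened, push-style load: read the whole 4-byte slot at once.
    uint32_t slot;
    std::memcpy(&slot, &p, sizeof(slot));

    // The callee only consumes the bytes that belong to the field, so the value
    // it observes is independent of whatever the padding happened to hold.
    uint16_t field = static_cast<uint16_t>(slot); // low 16 bits on little-endian x86
    std::printf("0x%04X\n", field);               // prints 0x1234
    return 0;
}
```
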
30 changes: 16 additions & 14 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1519,21 +1519,28 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
#endif // defined(FEATURE_SIMD)

#ifdef TARGET_X86
if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
// In lowering, we have marked all integral fields as usable from memory
// (either contained or reg optional), however, we will not always be able
// to use "push [mem]" in codegen, and so may have to reserve an internal
// register here (for explicit "mov"s).
if (varTypeIsIntegralOrI(fieldNode))
{
assert(genTypeSize(fieldNode) <= TARGET_POINTER_SIZE);

// We can treat as a slot any field that is stored at a slot boundary, where the previous
// field is not in the same slot. (Note that we store the fields in reverse order.)
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
if (intTemp == nullptr)
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
const bool canStoreWithPush = fieldIsSlot;
const bool canLoadWithPush = varTypeIsI(fieldNode);

if ((!canStoreWithPush || !canLoadWithPush) && (intTemp == nullptr))
{
intTemp = buildInternalIntRegisterDefForNode(putArgStk);
}
if (!fieldIsSlot && varTypeIsByte(fieldType))

// We can only store bytes using byteable registers.
if (!canStoreWithPush && varTypeIsByte(fieldType))
{
// If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
// (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
// need a byte-addressable register for the store. We will enforce this requirement on an internal
// register, which we can use to copy multiple byte values.
intTemp->registerAssignment &= allByteRegs();
}
}
@@ -1544,12 +1551,7 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)

for (GenTreeFieldList::Use& use : putArgStk->gtOp1->AsFieldList()->Uses())
{
GenTree* const fieldNode = use.GetNode();
if (!fieldNode->isContained())
{
BuildUse(fieldNode);
srcCount++;
}
srcCount += BuildOperandUses(use.GetNode());
}
buildInternalRegisterUses();

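
The byteable-register constraint in the hunk above exists because 32-bit x86 can only encode byte-sized register stores from AL/CL/DL/BL (and AH/CH/DH/BH). A tiny model, under the same simplifying assumptions as the sketch earlier, of when `BuildPutArgStk` needs an internal register at all and when that register must additionally be byteable:

```cpp
#include <cstdio>

// Simplified model of the internal-register decisions in BuildPutArgStk.
bool NeedsIntTemp(bool canStoreWithPush, bool canLoadWithPush)
{
    // A temp is needed only when the field cannot go from its source to the
    // outgoing argument area with a single push.
    return !(canStoreWithPush && canLoadWithPush);
}

bool TempMustBeByteable(bool canStoreWithPush, bool fieldIsByte)
{
    // Byte-sized stores that are not whole-slot pushes require a
    // byte-addressable register for the "mov byte ptr [...], reg" form.
    return !canStoreWithPush && fieldIsByte;
}

int main()
{
    std::printf("%d %d\n", NeedsIntTemp(true, true), TempMustBeByteable(false, true)); // 0 1
    return 0;
}
```
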
