Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm64: Have CpBlkUnroll and InitBlkUnroll use SIMD registers #68085

Merged
merged 4 commits into from
Apr 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 10 additions & 26 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2637,18 +2637,16 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
srcReg = REG_ZR;
}

regNumber dstReg = dstAddrBaseReg;
int dstRegAddrAlignment = 0;
bool isDstRegAddrAlignmentKnown = false;
regNumber dstReg = dstAddrBaseReg;
int dstRegAddrAlignment = 0;

if (dstLclNum != BAD_VAR_NUM)
{
bool fpBased;
const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased);

dstReg = fpBased ? REG_FPBASE : REG_SPBASE;
dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0;
isDstRegAddrAlignmentKnown = true;
dstReg = fpBased ? REG_FPBASE : REG_SPBASE;
dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0;

helper.SetDstOffset(baseAddr + dstOffset);
}
Expand All @@ -2670,11 +2668,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)

bool shouldUse16ByteWideInstrs = false;

// Store operations that cross a 16-byte boundary reduce bandwidth or incur additional latency.
// The following condition prevents using 16-byte stores when dstRegAddrAlignment is:
// 1) unknown (i.e. dstReg is neither FP nor SP) or
// 2) non-zero (i.e. dstRegAddr is not 16-byte aligned).
const bool hasAvailableSimdReg = isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You will need to adjust LSRA. Otherwise, it won't allocate SIMD register(s) when src/dst doesn't use sp/fp as base register.

Copy link
Member Author

@kunalspathak kunalspathak Apr 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @EgorChesakov . I believe that might be the reason SPC compilation is failing although, from what I understand, this was just the heuristics and we need not have to adjust LSRA.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to adjust the following logic for InitBlock

if (isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES))
{
// For larger block sizes CodeGen can choose to use 16-byte SIMD instructions.
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}

and for CopyBlock
bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES);
// Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point
// whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal).
// Therefore, it must allocate an additional integer register anyway.
if (canUse16ByteWideInstrs)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}

const bool hasAvailableSimdReg = (size > FP_REGSIZE_BYTES);
const bool canUse16ByteWideInstrs =
hasAvailableSimdReg && (dstRegAddrAlignment == 0) && helper.CanEncodeAllOffsets(FP_REGSIZE_BYTES);

Expand Down Expand Up @@ -2825,35 +2819,26 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)

#ifdef TARGET_ARM64
CopyBlockUnrollHelper helper(srcOffset, dstOffset, size);

regNumber srcReg = srcAddrBaseReg;
int srcRegAddrAlignment = 0;
bool isSrcRegAddrAlignmentKnown = false;
regNumber srcReg = srcAddrBaseReg;

if (srcLclNum != BAD_VAR_NUM)
{
bool fpBased;
const int baseAddr = compiler->lvaFrameAddress(srcLclNum, &fpBased);

srcReg = fpBased ? REG_FPBASE : REG_SPBASE;
srcRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0;
isSrcRegAddrAlignmentKnown = true;
srcReg = fpBased ? REG_FPBASE : REG_SPBASE;

helper.SetSrcOffset(baseAddr + srcOffset);
}

regNumber dstReg = dstAddrBaseReg;
int dstRegAddrAlignment = 0;
bool isDstRegAddrAlignmentKnown = false;
regNumber dstReg = dstAddrBaseReg;

if (dstLclNum != BAD_VAR_NUM)
{
bool fpBased;
const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased);

dstReg = fpBased ? REG_FPBASE : REG_SPBASE;
dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0;
isDstRegAddrAlignmentKnown = true;
dstReg = fpBased ? REG_FPBASE : REG_SPBASE;

helper.SetDstOffset(baseAddr + dstOffset);
}
Expand Down Expand Up @@ -2914,8 +2899,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
// known and the block size is larger than a single SIMD register size (i.e. when using SIMD instructions can
// be profitable).

const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown &&
(size >= 2 * FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
const bool canUse16ByteWideInstrs = (size >= 2 * FP_REGSIZE_BYTES);

bool shouldUse16ByteWideInstrs = false;

Expand Down
11 changes: 4 additions & 7 deletions src/coreclr/jit/lsraarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
// the target. We do not handle these constraints on the same
// refposition too well so we help ourselves a bit here by forcing the
// null check with LR.
regMaskTP candidates = call->IsFastTailCall() ? RBM_LR : 0;
regMaskTP candidates = call->IsFastTailCall() ? RBM_LR : RBM_NONE;
buildInternalIntRegisterDefForNode(call, candidates);
}
#endif // TARGET_ARM
Expand Down Expand Up @@ -632,9 +632,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
buildInternalIntRegisterDefForNode(blkNode);
}

const bool isDstRegAddrAlignmentKnown = dstAddr->OperIsLocalAddr();

if (isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES))
if (size > FP_REGSIZE_BYTES)
{
// For larger block sizes CodeGen can choose to use 16-byte SIMD instructions.
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
Expand Down Expand Up @@ -710,10 +708,9 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr());
const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();

// CodeGen can use 16-byte SIMD ldp/stp for larger block sizes
// only when both source and destination base address registers have known alignment.
// CodeGen can use 16-byte SIMD ldp/stp for larger block sizes.
// This is the case, when both registers are either sp or fp.
bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES);
bool canUse16ByteWideInstrs = (size >= 2 * FP_REGSIZE_BYTES);

// Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point
// whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal).
Expand Down