Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm64/Sve: Add FFR register liveness tracking #105348

Merged
merged 14 commits into from
Jul 26, 2024
14 changes: 12 additions & 2 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1508,8 +1508,18 @@ void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree)
var_types targetType = tree->TypeGet();
regNumber targetReg = tree->GetRegNum();

inst_Mov(targetType, targetReg, tree->gtSrcReg, /* canSkip */ true);
genTransferRegGCState(targetReg, tree->gtSrcReg);
#ifdef TARGET_ARM64
if (varTypeIsMask(targetType))
{
assert(tree->gtSrcReg == REG_FFR);
GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, REG_FFR);
}
else
#endif
{
inst_Mov(targetType, targetReg, tree->gtSrcReg, /* canSkip */ true);
genTransferRegGCState(targetReg, tree->gtSrcReg);
}

genProduceReg(tree);
}
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -4344,6 +4344,10 @@ class Compiler
#endif // defined(FEATURE_SIMD)

unsigned lvaGSSecurityCookie; // LclVar number
#ifdef TARGET_ARM64
unsigned lvaFfrRegister; // LclVar number
unsigned getFFRegisterVarNum();
#endif
bool lvaTempsHaveLargerOffsetThanVars();

// Returns "true" iff local variable "lclNum" is in SSA form.
Expand Down
10 changes: 10 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7448,7 +7448,11 @@ GenTreeIntCon* Compiler::gtNewFalse()
// return a new node representing the value in a physical register
GenTree* Compiler::gtNewPhysRegNode(regNumber reg, var_types type)
{
#ifdef TARGET_ARM64
assert(genIsValidIntReg(reg) || (reg == REG_SPBASE) || (reg == REG_FFR));
#else
assert(genIsValidIntReg(reg) || (reg == REG_SPBASE));
#endif
GenTree* result = new (this, GT_PHYSREG) GenTreePhysReg(reg, type);
return result;
}
Expand Down Expand Up @@ -11554,6 +11558,12 @@ void Compiler::gtGetLclVarNameInfo(unsigned lclNum, const char** ilKindOut, cons
{
ilName = "GsCookie";
}
#ifdef TARGET_ARM64
else if (lclNum == lvaFfrRegister)
{
ilName = "FFReg";
}
#endif
else if (lclNum == lvaRetAddrVar)
{
ilName = "ReturnAddress";
Expand Down
29 changes: 10 additions & 19 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2297,10 +2297,15 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,

switch (intrinsic)
{
case NI_Sve_CreateBreakAfterMask:
case NI_Sve_CreateBreakAfterPropagateMask:
case NI_Sve_CreateBreakBeforeMask:
case NI_Sve_CreateBreakBeforePropagateMask:
{
// HWInstrinsic requires a mask for op3
convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(3));
FALLTHROUGH;
}
case NI_Sve_CreateBreakAfterMask:
case NI_Sve_CreateBreakBeforeMask:
case NI_Sve_CreateMaskForFirstActiveElement:
case NI_Sve_CreateMaskForNextActiveElement:
case NI_Sve_GetActiveElementCount:
Expand All @@ -2310,30 +2315,16 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
{
// HWInstrinsic requires a mask for op2
convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(2));
break;
FALLTHROUGH;
}

default:
break;
}

switch (intrinsic)
{
case NI_Sve_CreateBreakAfterPropagateMask:
case NI_Sve_CreateBreakBeforePropagateMask:
{
// HWInstrinsic requires a mask for op3
convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(3));
// HWInstrinsic requires a mask for op1
convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(1));
break;
}

default:
break;
}

// HWInstrinsic requires a mask for op1
convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(1));

if (HWIntrinsicInfo::IsMultiReg(intrinsic))
{
assert(HWIntrinsicInfo::IsExplicitMaskedOperation(retNode->AsHWIntrinsic()->GetHWIntrinsicId()));
Expand Down
71 changes: 71 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)

emitAttr emitSize;
insOpts opt;
bool unspilledFfr = false;

if (HWIntrinsicInfo::SIMDScalar(intrin.id))
{
Expand All @@ -318,6 +319,39 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
emitSize = EA_SCALABLE;
opt = emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType));

switch (intrin.id)
{
case NI_Sve_GetFfrByte:
case NI_Sve_GetFfrInt16:
case NI_Sve_GetFfrInt32:
case NI_Sve_GetFfrInt64:
case NI_Sve_GetFfrSByte:
case NI_Sve_GetFfrUInt16:
case NI_Sve_GetFfrUInt32:
case NI_Sve_GetFfrUInt64:
{
if ((intrin.op1 != nullptr) && ((intrin.op1->gtFlags & GTF_SPILLED) != 0))
{
// If there was a op1 for this intrinsic, it means FFR is consumed here
// and we need to unspill.
unspilledFfr = true;
}
break;
}
case NI_Sve_LoadVectorFirstFaulting:
{
if ((intrin.op3 != nullptr) && ((intrin.op3->gtFlags & GTF_SPILLED) != 0))
{
// If there was a op3 for this intrinsic, it means FFR is consumed here
// and we need to unspill.
unspilledFfr = true;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we don't call genConsumeReg on this operand? Is GTF_SPILLED the only thing that needs to be handled then?

I don't really understand what the FFR register is, but what happens if it gets changed by a callee and we didn't end up spilling the local (that represents p14) around that call. Is that possible?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we do call it, and this is detecting if it unspilled into p14? I wonder if unspilling itself could have the code to mirror p14 to ffr. I'm sort of worried about what can happen with GT_RELOAD or GT_COPY here... You may want to run jitstressregs (not sure if we'll have any coverage, though?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but what happens if it gets changed by a callee and we didn't end up spilling the local (that represents p14) around that call. Is that possible?

That's the entire purpose of this PR, to make sure that we spill it around the call. Since all predicate registers are callee trash, using P14 does force to save across the call (if it is live).

yes, unspilling will happen as part of genConsumeMultiOpOperands().

I wonder if unspilling itself could have the code to mirror p14 to ffr

I could, but that will exercise in the common code path. Having it here already restricts us to do this for scalable APIs which are SVE only.

I'm sort of worried about what can happen with GT_RELOAD or GT_COPY here... You may want to run jitstressregs (not sure if we'll have any coverage, though?)

Unfortunately we have basic coverage of the CI pipeline that I added few days back that tests with AltJit , but not with jitstressregs. I will see if I can test it locally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i didn't get a good repro, but I can potentially add a gtSkipReloadOrCopy here to bypass the COPY/RELOAD to check if i will be unspilling here or not. The consumeRegs will anyway do the right thing.

break;
}
default:
break;
}
}
else if (intrin.category == HW_Category_Special)
{
Expand Down Expand Up @@ -2366,6 +2400,43 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_LoadVectorFirstFaulting:
{
if (unspilledFfr)
{
// We have unspilled the FFR in op1Reg. Restore it back in FFR register.
GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be op3Reg. I have fixed this locally.

}

insScalableOpts sopt = (opt == INS_OPTS_SCALABLE_B) ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_LSL_N;
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, REG_ZR, opt, sopt);
break;
}

case NI_Sve_GetFfrByte:
case NI_Sve_GetFfrInt16:
case NI_Sve_GetFfrInt32:
case NI_Sve_GetFfrInt64:
case NI_Sve_GetFfrSByte:
case NI_Sve_GetFfrUInt16:
case NI_Sve_GetFfrUInt32:
case NI_Sve_GetFfrUInt64:
{
if (unspilledFfr)
{
// We have unspilled the FFR in op1Reg. Restore it back in FFR register.
GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt);
Copy link
Member

@jakobbotsch jakobbotsch Jul 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the codegen for these intrinsics just emit mov targetReg, op1Reg and skip wrffr + rdffr entirely?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could, but I will have to double check if that is fine to do, because sometimes, we do want to make sure ffr is up-to-date. I will leave this as a future optimization.

}

GetEmitter()->emitIns_R(ins, emitSize, targetReg, INS_OPTS_SCALABLE_B);
break;
}
case NI_Sve_SetFfr:
{
assert(targetReg == REG_NA);
GetEmitter()->emitIns_R(ins, emitSize, op1Reg, opt);
break;
}
case NI_Sve_ConditionalExtractAfterLastActiveElementScalar:
case NI_Sve_ConditionalExtractLastActiveElementScalar:
{
Expand Down
Loading
Loading