-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT ARM64-SVE: Add Sve.Abs() and Sve.Add() #100134
Changes from all commits
0d437e3
e9fa735
5acc122
15e893d
c22f458
508a52a
5825c20
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1396,6 +1396,60 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, | |
GenTree* op3 = nullptr; | ||
GenTree* op4 = nullptr; | ||
|
||
switch (numArgs) | ||
{ | ||
case 4: | ||
op4 = getArgForHWIntrinsic(sigReader.GetOp4Type(), sigReader.op4ClsHnd); | ||
op4 = addRangeCheckIfNeeded(intrinsic, op4, mustExpand, immLowerBound, immUpperBound); | ||
op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd); | ||
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd); | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
break; | ||
|
||
case 3: | ||
op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there no case where a range check is needed for 3 arguments? Should there be an assert to validate that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That seems to be the case. The case for 3 args is checked further down and depending on the intrinsics, does the |
||
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd); | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
break; | ||
|
||
case 2: | ||
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd); | ||
op2 = addRangeCheckIfNeeded(intrinsic, op2, mustExpand, immLowerBound, immUpperBound); | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
break; | ||
|
||
case 1: | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
break; | ||
|
||
default: | ||
break; | ||
} | ||
|
||
#if defined(TARGET_ARM64) | ||
// Embedded masks need inserting as op1. | ||
if (HWIntrinsicInfo::IsEmbeddedMaskedOperation(intrinsic)) | ||
{ | ||
numArgs++; | ||
assert(numArgs <= 4); | ||
switch (numArgs) | ||
{ | ||
case 4: | ||
op4 = op3; | ||
tannergooding marked this conversation as resolved.
Show resolved
Hide resolved
|
||
FALLTHROUGH; | ||
case 3: | ||
op3 = op2; | ||
FALLTHROUGH; | ||
case 2: | ||
op2 = op1; | ||
FALLTHROUGH; | ||
default: | ||
break; | ||
} | ||
op1 = gtNewSimdEmbeddedMaskNode(simdBaseJitType, simdSize); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does this need to be done here? This seems like its just inserting the implicit Seemingly this could just be inserted as part of lowering instead so that it only impacts LSRA and codegen? |
||
} | ||
#endif | ||
|
||
switch (numArgs) | ||
{ | ||
case 0: | ||
|
@@ -1407,8 +1461,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, | |
|
||
case 1: | ||
{ | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
|
||
if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST)) | ||
{ | ||
// Although the API specifies a pointer, if what we have is a BYREF, that's what | ||
|
@@ -1467,10 +1519,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, | |
|
||
case 2: | ||
{ | ||
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd); | ||
op2 = addRangeCheckIfNeeded(intrinsic, op2, mustExpand, immLowerBound, immUpperBound); | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
|
||
retNode = isScalar | ||
? gtNewScalarHWIntrinsicNode(nodeRetType, op1, op2, intrinsic) | ||
: gtNewSimdHWIntrinsicNode(nodeRetType, op1, op2, intrinsic, simdBaseJitType, simdSize); | ||
|
@@ -1524,10 +1572,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, | |
|
||
case 3: | ||
{ | ||
op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd); | ||
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd); | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
|
||
#ifdef TARGET_ARM64 | ||
if (intrinsic == NI_AdvSimd_LoadAndInsertScalar) | ||
{ | ||
|
@@ -1569,12 +1613,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, | |
|
||
case 4: | ||
{ | ||
op4 = getArgForHWIntrinsic(sigReader.GetOp4Type(), sigReader.op4ClsHnd); | ||
op4 = addRangeCheckIfNeeded(intrinsic, op4, mustExpand, immLowerBound, immUpperBound); | ||
op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd); | ||
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd); | ||
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd); | ||
|
||
assert(!isScalar); | ||
retNode = | ||
gtNewSimdHWIntrinsicNode(nodeRetType, op1, op2, op3, op4, intrinsic, simdBaseJitType, simdSize); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -398,6 +398,64 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) | |
unreached(); | ||
} | ||
} | ||
else if (isRMW) | ||
{ | ||
assert(!hasImmediateOperand); | ||
assert(!HWIntrinsicInfo::SupportsContainment(intrin.id)); | ||
|
||
// Move the RMW register out of the way and do not pass it to the emit. | ||
|
||
if (HWIntrinsicInfo::IsEmbeddedMaskedOperation(intrin.id)) | ||
{ | ||
// op1Reg contains a mask, op2Reg contains the RMW register. | ||
|
||
if (targetReg != op2Reg) | ||
{ | ||
assert(targetReg != op3Reg); | ||
GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op2Reg, /* canSkip */ true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure if it is needed here, but we should start document the rules of using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's the same general So I'm not sure there's really any rules to document here, we certainly don't document it at all for x86 or x64 or any of the existing Arm64 RMW cases. |
||
} | ||
|
||
switch (intrin.numOperands) | ||
{ | ||
case 2: | ||
GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op1Reg, opt); | ||
break; | ||
|
||
case 3: | ||
assert(targetReg != op3Reg); | ||
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op1Reg, op3Reg, opt); | ||
break; | ||
|
||
default: | ||
unreached(); | ||
} | ||
} | ||
else | ||
{ | ||
// op1Reg contains the RMW register. | ||
|
||
if (targetReg != op1Reg) | ||
{ | ||
assert(targetReg != op2Reg); | ||
assert(targetReg != op3Reg); | ||
GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, /* canSkip */ true); | ||
} | ||
|
||
switch (intrin.numOperands) | ||
{ | ||
case 2: | ||
GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op2Reg, opt); | ||
break; | ||
|
||
case 3: | ||
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op2Reg, op3Reg, opt); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need an assert that none of the input registers are mask registers? I'd guess the only RMW instructions with masks are ones that have at least 4 operands, so we shouldn't ever see that here. |
||
break; | ||
|
||
default: | ||
unreached(); | ||
} | ||
} | ||
} | ||
else | ||
{ | ||
assert(!hasImmediateOperand); | ||
|
@@ -416,35 +474,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) | |
{ | ||
GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op1Reg, opt); | ||
} | ||
else if (isRMW) | ||
{ | ||
if (targetReg != op1Reg) | ||
{ | ||
assert(targetReg != op2Reg); | ||
|
||
GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, | ||
/* canSkip */ true); | ||
} | ||
GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op2Reg, opt); | ||
} | ||
else | ||
{ | ||
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, opt); | ||
} | ||
break; | ||
|
||
case 3: | ||
assert(isRMW); | ||
if (targetReg != op1Reg) | ||
{ | ||
assert(targetReg != op2Reg); | ||
assert(targetReg != op3Reg); | ||
|
||
GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, /* canSkip */ true); | ||
} | ||
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op2Reg, op3Reg, opt); | ||
break; | ||
|
||
default: | ||
unreached(); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,10 @@ | |
// SVE Intrinsics | ||
|
||
// Sve | ||
HARDWARE_INTRINSIC(Sve, Abs, -1, -1, false, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need the |
||
|
||
HARDWARE_INTRINSIC(Sve, Add, -1, -1, false, {INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_fadd, INS_sve_fadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you elaborate a bit on what I'm looking at the architecture manual and don't see any limitations on the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The instruction only has 3bits for a predicate register so is limited to using predicate registers 0 to 7. This is quite a common pattern across Sve, hence using an common enum for it (I wouldn't do similar for the handful have only 2 bits) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see. I think a flag makes sense then, but I'm not a huge fan of the name given how Maybe something more explicit like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does this one in particular need |
||
|
||
HARDWARE_INTRINSIC(Sve, CreateTrueMaskByte, -1, 1, false, {INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_EnumPattern, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) | ||
HARDWARE_INTRINSIC(Sve, CreateTrueMaskDouble, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue}, HW_Category_EnumPattern, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) | ||
HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt16, -1, 1, false, {INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_EnumPattern, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the todo above, are we potentially missing an assert which would ensure this is updated for future instructions that need it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is within code just for predicates. So will always be valid as long as the vector length is 128bits.
There is a question of what testing to do for >128bit vectors for .Net9. I suspect a lot of assumptions are made elsewhere that vector length is 128bits, and will require some major debugging. At some point I can do some testing on larger vector length machines. Due to time constraints, maybe the solution for .Net9 is have a check on startup: if vector length is >128bits then ask kernel to reduce to 128bits? Or just disable on >128 bits machines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That question probably needs a much broader discussion.
There's notably not a lot of benefit of 128-bit SVE over AdvSimd. There's a few new instructions and the ability to emit denser assembly in a few cases, but most of that isn't for the typical hot loop of a method and in some case the code can be less dense (emitting
SVE Abs
requires predication and aptrue
to be generated; whileAdvSimd Abs
does not, so given 128-bit vectors and no predication; AdvSimd is better to use for that instruction).At the same time, there is hardware (AWS Graviton) that has 256-bit SVE support that will most likely run on .NET 9. So it would probably be best if we could ensure it is appropriately handled and we're best able to take advantage of such hardware, not artificially limit it.