Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose AVX512 variants of SSE-SSE4.2 instructions #84909

Merged
merged 24 commits into main from avx512-2
Apr 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8636dab
Ensure already exposed AVX512 APIs are in the right ISAs
tannergooding Apr 16, 2023
08fb613
Exposing Add and Subtract for AVX512F and AVX512BW
tannergooding Apr 16, 2023
396abf1
Exposing Abs for AVX512F, AVX512F.VL, and AVX512BW
tannergooding Apr 16, 2023
835fbad
Exposing AddSaturate and SubtractSaturate for AVX512BW
tannergooding Apr 16, 2023
41d64d7
Expose AlignRight and Average for AVX512BW
tannergooding Apr 16, 2023
3908c75
Expose ConvertToVector* for AVX512F and AVX512BW
tannergooding Apr 16, 2023
d3b91bf
Expose Divide, Multiply, MultiplyAddAdjacent, MultiplyLow, and Multip…
tannergooding Apr 16, 2023
b1e8fa9
Expose DuplicateEvenIndexed and DuplicateOddIndexed in Avx512F
tannergooding Apr 16, 2023
55cb66b
Expose LoadAlignedVector512NonTemporal in Avx512F
tannergooding Apr 16, 2023
30ef5e3
Expose Max and Min for Avx512F, Avx512F.VL, and Avx512BW
tannergooding Apr 16, 2023
ac0acd0
Expose PackSignedSaturate and PackUnsignedSaturate for Avx512BW
tannergooding Apr 16, 2023
03b3ecb
Expose ShiftLeftLogical, ShiftLeftLogical128BitLane, ShiftRightArithm…
tannergooding Apr 16, 2023
b00635e
Expose Shuffle, ShuffleLow, and ShuffleHigh for Avx512F and Avx512BW
tannergooding Apr 16, 2023
aeffab3
Expose Sqrt and SumAbsoluteDifferences for Avx512F and Avx512BW
tannergooding Apr 16, 2023
314605e
Expose UnpackLow and UnpackHigh for Avx512F and Avx512BW
tannergooding Apr 16, 2023
f1d3013
Ensure Avx2.Shuffle correctly handles containment for pshufb
tannergooding Apr 17, 2023
a3ec955
Apply formatting patch
tannergooding Apr 17, 2023
4a84b0c
Fixing a couple of minor asserts
tannergooding Apr 17, 2023
7e6856f
Ensure TYP_DOUBLE is correctly handled for arithmetic right shift
tannergooding Apr 17, 2023
1b8bf9b
Work around an SPMI failure with the baseline
tannergooding Apr 17, 2023
6bade9f
Fixing some test logic to account for "per lane" operations on 512-bi…
tannergooding Apr 17, 2023
2afd909
Ensure we don't fail for 64-bit vector multiplication on 32-bit
tannergooding Apr 17, 2023
b4d9a35
Merge remote-tracking branch 'dotnet/main' into avx512-2
tannergooding Apr 17, 2023
55adfca
Merge remote-tracking branch 'dotnet/main' into avx512-2
tannergooding Apr 18, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ regNumber emitter::getSseShiftRegNumber(instruction ins)

case INS_psrad:
case INS_psraw:
case INS_vpsraq:
{
return (regNumber)4;
}
Expand Down Expand Up @@ -17916,15 +17917,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_pminsw:
case INS_pminud:
case INS_pminsd:
case INS_vpminuq:
case INS_vpminsq:
case INS_pmaxub:
case INS_pmaxsb:
case INS_pmaxuw:
case INS_pmaxsw:
case INS_pmaxsd:
case INS_pmaxud:
case INS_vpmaxsq:
case INS_vpmaxuq:
case INS_pabsb:
case INS_pabsw:
case INS_pabsd:
case INS_vpabsq:
case INS_psignb:
case INS_psignw:
case INS_psignd:
Expand All @@ -17949,6 +17955,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_psrlq:
case INS_psrad:
case INS_psraw:
case INS_vpsraq:
if (insFmt == IF_RWR_CNS)
{
result.insLatency = PERFSCORE_LATENCY_1C;
Expand Down Expand Up @@ -18193,6 +18200,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insLatency += PERFSCORE_LATENCY_10C;
break;

case INS_vpmullq:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_15C;
break;

case INS_vpbroadcastb:
case INS_vpbroadcastw:
case INS_vpbroadcastd:
Expand Down
157 changes: 144 additions & 13 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18966,6 +18966,12 @@ bool GenTree::isCommutativeHWIntrinsic() const
{
return false;
}

case NI_AVX512F_Max:
case NI_AVX512F_Min:
{
return !varTypeIsFloating(node->GetSimdBaseType());
}
#endif // TARGET_XARCH

default:
Expand Down Expand Up @@ -19269,11 +19275,27 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si
return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
}

assert((simdSize != 32) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
NamedIntrinsic intrinsic = NI_Illegal;

if (simdBaseType == TYP_LONG)
{
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
{
intrinsic = NI_AVX512F_VL_Abs;
}
}
else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_AVX2_Abs;
}
else if (compOpportunisticallyDependsOn(InstructionSet_SSSE3))
{
intrinsic = NI_SSSE3_Abs;
}

if ((simdBaseType != TYP_LONG) && ((simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_SSSE3)))
if (intrinsic != NI_Illegal)
{
NamedIntrinsic intrinsic = (simdSize == 32) ? NI_AVX2_Abs : NI_SSSE3_Abs;
return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize);
}
else
Expand Down Expand Up @@ -19390,6 +19412,23 @@ GenTree* Compiler::gtNewSimdBinOpNode(
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_And;

if (varTypeIsIntegral(simdBaseType))
// Review note: steveharter marked this conversation as resolved.
{
intrinsic = NI_AVX512F_And;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ))
{
intrinsic = NI_AVX512DQ_And;
}
else
{
// Since this is a bitwise operation, we can still support it by lying
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX512F_And;
simdBaseJitType = (simdBaseType == TYP_DOUBLE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_INT;
}
}
else if (simdSize == 32)
{
Expand All @@ -19409,7 +19448,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX_And;
simdBaseJitType = CORINFO_TYPE_FLOAT;
simdBaseJitType = varTypeIsLong(simdBaseType) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_FLOAT;
}
}
else if (simdBaseType == TYP_FLOAT)
Expand All @@ -19429,6 +19468,23 @@ GenTree* Compiler::gtNewSimdBinOpNode(
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_AndNot;

if (varTypeIsIntegral(simdBaseType))
{
intrinsic = NI_AVX512F_AndNot;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ))
{
intrinsic = NI_AVX512DQ_AndNot;
}
else
{
// Since this is a bitwise operation, we can still support it by lying
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX512F_AndNot;
simdBaseJitType = (simdBaseType == TYP_DOUBLE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_INT;
}
}
else if (simdSize == 32)
{
Expand All @@ -19448,7 +19504,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX_AndNot;
simdBaseJitType = CORINFO_TYPE_FLOAT;
simdBaseJitType = varTypeIsLong(simdBaseType) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_FLOAT;
}
}
else if (simdBaseType == TYP_FLOAT)
Expand Down Expand Up @@ -19510,7 +19566,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}

assert(!varTypeIsByte(simdBaseType));
assert((op != GT_RSH) || (!varTypeIsUnsigned(simdBaseType) && !varTypeIsLong(simdBaseType)));

// "over shifting" is platform specific behavior. We will match the C# behavior
// this requires we mask with (sizeof(T) * 8) - 1 which ensures the shift cannot
Expand Down Expand Up @@ -19541,7 +19596,16 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
else if (op == GT_RSH)
{
intrinsic = NI_AVX2_ShiftRightArithmetic;
if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
{
assert(varTypeIsSigned(simdBaseType));
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL));
intrinsic = NI_AVX512F_VL_ShiftRightArithmetic;
}
else
{
intrinsic = NI_AVX2_ShiftRightArithmetic;
}
}
else
{
Expand All @@ -19555,7 +19619,16 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
else if (op == GT_RSH)
{
intrinsic = NI_SSE2_ShiftRightArithmetic;
if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
{
assert(varTypeIsSigned(simdBaseType));
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL));
intrinsic = NI_AVX512F_VL_ShiftRightArithmetic;
}
else
{
intrinsic = NI_SSE2_ShiftRightArithmetic;
}
}
else
{
Expand Down Expand Up @@ -19654,6 +19727,16 @@ GenTree* Compiler::gtNewSimdBinOpNode(
break;
}

case TYP_LONG:
case TYP_ULONG:
{
assert((simdSize == 16) || (simdSize == 32));
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ_VL));

intrinsic = NI_AVX512DQ_VL_MultiplyLow;
break;
}

case TYP_FLOAT:
{
if (simdSize == 32)
Expand Down Expand Up @@ -19696,6 +19779,23 @@ GenTree* Compiler::gtNewSimdBinOpNode(
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_Or;

if (varTypeIsIntegral(simdBaseType))
{
intrinsic = NI_AVX512F_Or;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ))
{
intrinsic = NI_AVX512DQ_Or;
}
else
{
// Since this is a bitwise operation, we can still support it by lying
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX512F_Or;
simdBaseJitType = (simdBaseType == TYP_DOUBLE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_INT;
}
}
else if (simdSize == 32)
{
Expand All @@ -19715,7 +19815,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX_Or;
simdBaseJitType = CORINFO_TYPE_FLOAT;
simdBaseJitType = varTypeIsLong(simdBaseType) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_FLOAT;
}
}
else if (simdBaseType == TYP_FLOAT)
Expand Down Expand Up @@ -19775,6 +19875,23 @@ GenTree* Compiler::gtNewSimdBinOpNode(
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_Xor;

if (varTypeIsIntegral(simdBaseType))
{
intrinsic = NI_AVX512F_Xor;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ))
{
intrinsic = NI_AVX512DQ_Xor;
}
else
{
// Since this is a bitwise operation, we can still support it by lying
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX512F_Xor;
simdBaseJitType = (simdBaseType == TYP_DOUBLE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_INT;
}
}
else if (simdSize == 32)
{
Expand All @@ -19794,7 +19911,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
// about the type and doing the operation using a supported instruction

intrinsic = NI_AVX_Xor;
simdBaseJitType = CORINFO_TYPE_FLOAT;
simdBaseJitType = varTypeIsLong(simdBaseType) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_FLOAT;
}
}
else if (simdBaseType == TYP_FLOAT)
Expand Down Expand Up @@ -21885,6 +22002,10 @@ GenTree* Compiler::gtNewSimdMaxNode(
{
intrinsic = NI_AVX2_Max;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
{
intrinsic = NI_AVX512F_VL_Max;
}
}
}
else
Expand Down Expand Up @@ -21974,14 +22095,17 @@ GenTree* Compiler::gtNewSimdMaxNode(
if (compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
intrinsic = NI_SSE41_Max;
break;
}
break;
}

case TYP_LONG:
case TYP_ULONG:
{
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
{
intrinsic = NI_AVX512F_VL_Max;
}
break;
}

Expand Down Expand Up @@ -22072,6 +22196,10 @@ GenTree* Compiler::gtNewSimdMinNode(
{
intrinsic = NI_AVX2_Min;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
{
intrinsic = NI_AVX512F_VL_Min;
}
}
}
else
Expand Down Expand Up @@ -22157,14 +22285,17 @@ GenTree* Compiler::gtNewSimdMinNode(
if (compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
intrinsic = NI_SSE41_Min;
break;
}
break;
}

case TYP_LONG:
case TYP_ULONG:
{
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
{
intrinsic = NI_AVX512F_VL_Min;
}
break;
}

Expand Down Expand Up @@ -22496,7 +22627,7 @@ GenTree* Compiler::gtNewSimdNarrowNode(
//
// var tmp1 = Avx.ConvertToVector128Single(op1).ToVector256Unsafe();
// var tmp2 = Avx.ConvertToVector128Single(op2);
// return Avx.InsertVector128(tmp1, tmp2, 1);
// return tmp1.WithUpper(tmp2);

CorInfoType opBaseJitType = CORINFO_TYPE_DOUBLE;

Expand Down
17 changes: 12 additions & 5 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1799,13 +1799,20 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
case NI_AVX512BW_ConvertToVector256Byte:
case NI_AVX512BW_ConvertToVector256SByte:
{
// These instructions are RM_R and so we need to ensure the targetReg
// is passed in as the RM register and op1 is passed as the R register

op1Reg = op1->GetRegNum();
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

emit->emitIns_R_R(ins, attr, op1Reg, targetReg);
if (varTypeIsFloating(baseType))
{
genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1);
}
else
{
// These instructions are RM_R and so we need to ensure the targetReg
// is passed in as the RM register and op1 is passed as the R register

op1Reg = op1->GetRegNum();
emit->emitIns_R_R(ins, attr, op1Reg, targetReg);
}
break;
}

Expand Down
Loading