Add lowering for get_One, get_AllBitsSet, CreateScalar(), CreateScalarUnsafe() #83402

Merged · 3 commits · Mar 15, 2023
34 changes: 29 additions & 5 deletions src/coreclr/jit/codegenxarch.cpp
@@ -491,14 +491,38 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre

if (vecCon->IsAllBitsSet())
{
if ((attr != EA_32BYTE) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
switch (attr)
{
case EA_8BYTE:
case EA_16BYTE:
{
emit->emitIns_R_R(INS_pcmpeqd, attr, targetReg, targetReg);
return;
}
#if defined(FEATURE_SIMD)
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
#else
emit->emitIns_R_R(INS_pcmpeqd, attr, targetReg, targetReg);
case EA_32BYTE:
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
return;
}
break;
}

case EA_64BYTE:
{
assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F));
emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
static_cast<int8_t>(0xFF));
return;
}
#endif // FEATURE_SIMD
break;

default:
{
unreached();
}
}
}
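For readers unfamiliar with the EA_64BYTE path above: vpternlogd's 8-bit immediate is a three-input truth table, and an immediate of 0xFF makes every table entry 1, so the destination becomes all-ones regardless of what the (possibly undefined) register initially held. A minimal scalar model of that semantics (illustration only, not JIT or emitter code):

```cpp
#include <cstdint>
#include <cstdio>

// Per-bit model of vpternlogd: the three source bits (a, b, c) at each position
// form a 3-bit index into the 8-bit immediate, and the selected immediate bit
// becomes the result bit. With imm = 0xFF every table entry is 1, so the result
// is all-ones no matter what the inputs are.
static uint32_t TernaryLogic32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
    uint32_t result = 0;
    for (int bit = 0; bit < 32; bit++)
    {
        unsigned index = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        result |= static_cast<uint32_t>((imm >> index) & 1) << bit;
    }
    return result;
}

int main()
{
    // Prints 0xFFFFFFFF: arbitrary inputs, all-ones output when imm is 0xFF.
    printf("0x%08X\n", TernaryLogic32(0x12345678, 0x9ABCDEF0, 0x0F0F0F0F, 0xFF));
    return 0;
}
```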

1 change: 1 addition & 0 deletions src/coreclr/jit/emitxarch.cpp
@@ -17752,6 +17752,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vpsllvq:
case INS_vpsrlvd:
case INS_vpsrlvq:
case INS_vpternlogd:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_1C;
break;
5 changes: 5 additions & 0 deletions src/coreclr/jit/fgbasic.cpp
@@ -1195,7 +1195,9 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed
case NI_Vector256_Create:
case NI_Vector512_Create:
case NI_Vector256_CreateScalar:
case NI_Vector512_CreateScalar:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
case NI_VectorT256_CreateBroadcast:
case NI_X86Base_BitScanForward:
case NI_X86Base_X64_BitScanForward:
@@ -1519,6 +1521,9 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed
case NI_Vector256_get_AllBitsSet:
case NI_Vector256_get_One:
case NI_Vector256_get_Zero:
case NI_Vector512_get_AllBitsSet:
case NI_Vector512_get_One:
case NI_Vector512_get_Zero:
case NI_VectorT256_get_AllBitsSet:
case NI_VectorT256_get_One:
case NI_VectorT256_get_Zero:
42 changes: 27 additions & 15 deletions src/coreclr/jit/gentree.cpp
@@ -3743,6 +3743,9 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
case NI_Vector256_Create:
case NI_Vector256_CreateScalar:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_Create:
case NI_Vector512_CreateScalar:
case NI_Vector512_CreateScalarUnsafe:
#elif defined(TARGET_ARM64)
case NI_Vector64_Create:
case NI_Vector64_CreateScalar:
@@ -19246,6 +19249,7 @@ bool GenTree::isContainableHWIntrinsic() const

case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
// These HWIntrinsic operations are contained as part of scalar ops
return true;
@@ -21478,6 +21482,10 @@ GenTree* Compiler::gtNewSimdCreateScalarNode(
{
hwIntrinsicID = NI_Vector256_CreateScalar;
}
else if (simdSize == 64)
{
hwIntrinsicID = NI_Vector512_CreateScalar;
}
#elif defined(TARGET_ARM64)
if (simdSize == 8)
{
@@ -21618,6 +21626,10 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(
{
hwIntrinsicID = NI_Vector256_CreateScalarUnsafe;
}
else if (simdSize == 64)
{
hwIntrinsicID = NI_Vector512_CreateScalarUnsafe;
}
#elif defined(TARGET_ARM64)
if (simdSize == 8)
{
@@ -21911,15 +21923,7 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode(
// We don't guarantee a non-temporal load will actually occur, so fallback
// to regular aligned loads if the required ISA isn't supported.

if (simdSize == 64)
{
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
{
intrinsic = NI_AVX512F_LoadAlignedVector512NonTemporal;
isNonTemporal = true;
}
}
else if (simdSize == 32)
if (simdSize == 32)
{
if (compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
@@ -21932,6 +21936,14 @@
intrinsic = NI_AVX_LoadAlignedVector256;
}
}
else if (simdSize == 64)
{
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
{
intrinsic = NI_AVX512F_LoadAlignedVector512NonTemporal;
isNonTemporal = true;
}
}
else if (compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
intrinsic = NI_SSE41_LoadAlignedVector128NonTemporal;
@@ -23173,16 +23185,16 @@ GenTree* Compiler::gtNewSimdStoreAlignedNode(

NamedIntrinsic intrinsic = NI_Illegal;

if (simdSize == 64)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_StoreAligned;
}
else if (simdSize == 32)
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
intrinsic = NI_AVX_StoreAligned;
}
else if (simdSize == 64)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_StoreAligned;
}
else if (simdBaseType != TYP_FLOAT)
{
intrinsic = NI_SSE2_StoreAligned;
5 changes: 4 additions & 1 deletion src/coreclr/jit/gentree.h
@@ -6355,7 +6355,9 @@ struct GenTreeVecCon : public GenTree
case NI_Vector256_Create:
case NI_Vector512_Create:
case NI_Vector256_CreateScalar:
case NI_Vector512_CreateScalar:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
#elif defined(TARGET_ARM64)
case NI_Vector64_Create:
case NI_Vector64_CreateScalar:
@@ -6371,7 +6373,8 @@
// CreateScalar leaves the upper bits as zero

#if defined(TARGET_XARCH)
if ((intrinsic != NI_Vector128_CreateScalar) && (intrinsic != NI_Vector256_CreateScalar))
if ((intrinsic != NI_Vector128_CreateScalar) && (intrinsic != NI_Vector256_CreateScalar) &&
(intrinsic != NI_Vector512_CreateScalar))
#elif defined(TARGET_ARM64)
if ((intrinsic != NI_Vector64_CreateScalar) && (intrinsic != NI_Vector128_CreateScalar))
#endif
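The check above hinges on the API contract spelled out in the comment: CreateScalar zeroes every element beyond the first, while CreateScalarUnsafe leaves those elements undefined. A small conceptual sketch of the difference (not the runtime's implementation; the 64-byte struct merely stands in for a Vector512-sized value):

```cpp
#include <cstdint>
#include <cstring>

// Conceptual stand-in for a Vector512-sized value.
struct Simd64 { uint8_t bytes[64]; };

// CreateScalar: lane 0 holds the value, every remaining byte is guaranteed zero.
static Simd64 CreateScalar(float value)
{
    Simd64 v;
    memset(v.bytes, 0, sizeof(v.bytes));
    memcpy(v.bytes, &value, sizeof(value));
    return v;
}

// CreateScalarUnsafe: lane 0 holds the value, the remaining bytes are left
// unspecified, so callers must not depend on their contents.
static Simd64 CreateScalarUnsafe(float value)
{
    Simd64 v; // upper bytes intentionally left uninitialized
    memcpy(v.bytes, &value, sizeof(value));
    return v;
}
```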
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -901,6 +901,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
{
case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
if (varTypeIsIntegral(baseType))
{
6 changes: 5 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
@@ -237,7 +237,11 @@ HARDWARE_INTRINSIC(Vector256, Xor,
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Vector512 Intrinsics
HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector512, get_AllBitsSet, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Vector512, get_One, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
4 changes: 4 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1088,6 +1088,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector128_CreateScalar:
case NI_Vector256_CreateScalar:
case NI_Vector512_CreateScalar:
{
assert(sig->numArgs == 1);

@@ -1108,6 +1109,7 @@

case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
assert(sig->numArgs == 1);

@@ -1403,6 +1405,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector128_get_AllBitsSet:
case NI_Vector256_get_AllBitsSet:
case NI_Vector512_get_AllBitsSet:
{
assert(sig->numArgs == 0);
retNode = gtNewAllBitsSetConNode(retType);
@@ -1411,6 +1414,7 @@

case NI_Vector128_get_One:
case NI_Vector256_get_One:
case NI_Vector512_get_One:
{
assert(sig->numArgs == 0);
retNode = gtNewOneConNode(retType, simdBaseType);
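gtNewOneConNode produces a vector constant with every lane set to 1 in the given base type. As a rough model of what that constant holds for a 64-byte Vector512 (an illustrative sketch, not the JIT's actual GenTreeVecCon representation):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Fill a 64-byte (Vector512-sized) buffer with "one" of the given element type,
// e.g. 16 lanes of 1.0f for float or 64 lanes of 0x01 for byte.
template <typename T>
static std::vector<uint8_t> AllLanesOne()
{
    std::vector<uint8_t> buffer(64);
    T one = static_cast<T>(1);
    for (size_t offset = 0; offset + sizeof(T) <= buffer.size(); offset += sizeof(T))
    {
        memcpy(buffer.data() + offset, &one, sizeof(one));
    }
    return buffer;
}
```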
3 changes: 2 additions & 1 deletion src/coreclr/jit/instr.cpp
@@ -101,7 +101,7 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id)
static char buf[4][TEMP_BUFFER_LEN];
const char* retbuf;

if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) &&
if (GetEmitter()->IsVexOrEvexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) &&
!GetEmitter()->IsKInstruction(ins))
{
sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName);
@@ -700,6 +700,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op)
{
case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
// The hwintrinsic should be contained and its
// op1 should be either contained or spilled. This
1 change: 1 addition & 0 deletions src/coreclr/jit/instrsxarch.h
@@ -654,6 +654,7 @@ INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE,
INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None)
INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)

INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
13 changes: 9 additions & 4 deletions src/coreclr/jit/lowerxarch.cpp
@@ -1038,6 +1038,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_Vector512_Create:
case NI_Vector128_CreateScalar:
case NI_Vector256_CreateScalar:
case NI_Vector512_CreateScalar:
{
// We don't directly support the Vector128.Create or Vector256.Create methods in codegen
// and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect
@@ -1934,9 +1935,10 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
GenTree* tmp2 = nullptr;
GenTree* tmp3 = nullptr;

bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant<simd_t>(node, simdVal);
bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar);
size_t argCnt = node->GetOperandCount();
bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant<simd_t>(node, simdVal);
bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar) ||
(intrinsicId == NI_Vector512_CreateScalar);
size_t argCnt = node->GetOperandCount();

if (isConstant)
{
@@ -6680,6 +6682,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
{
case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
if (!varTypeIsIntegral(childNode->TypeGet()))
{
@@ -6826,6 +6829,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
{
case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
if (!supportsSIMDScalarLoads)
{
@@ -7055,7 +7059,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
NamedIntrinsic childNodeId = childNode->GetHWIntrinsicId();

if ((childNodeId == NI_Vector128_CreateScalarUnsafe) ||
(childNodeId == NI_Vector256_CreateScalarUnsafe))
(childNodeId == NI_Vector256_CreateScalarUnsafe) ||
(childNodeId == NI_Vector512_CreateScalarUnsafe))
{
// We have a very special case of BroadcastScalarToVector(CreateScalarUnsafe(op1))
//
2 changes: 2 additions & 0 deletions src/coreclr/jit/lsraxarch.cpp
@@ -2019,6 +2019,7 @@ static GenTree* SkipContainedCreateScalarUnsafe(GenTree* node)
{
case NI_Vector128_CreateScalarUnsafe:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector512_CreateScalarUnsafe:
{
return hwintrinsic->Op(1);
}
@@ -2127,6 +2128,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
case NI_Vector128_ToScalar:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector256_ToScalar:
case NI_Vector512_CreateScalarUnsafe:
{
assert(numArgs == 1);

Expand Down