diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 74420b2883260..14ebb0d27e450 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -968,9 +968,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX regNumber targetReg); void genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode); void genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode); - void genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode); void genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg); - void genSIMDIntrinsicWiden(GenTreeSIMD* simdNode); void genSIMDIntrinsic(GenTreeSIMD* simdNode); // TYP_SIMD12 (i.e Vector3 of size 12 bytes) is not a hardware supported size and requires diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 317e059c683de..7a5e9122fcc38 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3898,15 +3898,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicUnOp(simdNode); break; - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - genSIMDIntrinsicWiden(simdNode); - break; - - case SIMDIntrinsicNarrow: - genSIMDIntrinsicNarrow(simdNode); - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: @@ -3995,20 +3986,9 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicEqual: result = INS_fcmeq; break; - case SIMDIntrinsicNarrow: - // Use INS_fcvtn lower bytes of result followed by INS_fcvtn2 for upper bytes - // Return lower bytes instruction here - result = INS_fcvtn; - break; case SIMDIntrinsicSub: result = INS_fsub; break; - case SIMDIntrinsicWidenLo: - result = INS_fcvtl; - break; - case SIMDIntrinsicWidenHi: - result = INS_fcvtl2; - break; default: assert(!"Unsupported SIMD intrinsic"); unreached(); @@ -4036,20 +4016,9 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case 
SIMDIntrinsicEqual: result = INS_cmeq; break; - case SIMDIntrinsicNarrow: - // Use INS_xtn lower bytes of result followed by INS_xtn2 for upper bytes - // Return lower bytes instruction here - result = INS_xtn; - break; case SIMDIntrinsicSub: result = INS_sub; break; - case SIMDIntrinsicWidenLo: - result = isUnsigned ? INS_uxtl : INS_sxtl; - break; - case SIMDIntrinsicWidenHi: - result = isUnsigned ? INS_uxtl2 : INS_sxtl2; - break; default: assert(!"Unsupported SIMD intrinsic"); unreached(); @@ -4228,113 +4197,6 @@ void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// The Widen intrinsics are broken into separate intrinsics for the two results. -// -void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) -{ - assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || - (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); - - GenTree* op1 = simdNode->gtGetOp1(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber srcReg = op1Reg; - emitAttr emitSize = emitActualTypeSize(simdType); - - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - - emitAttr attr = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) ? 
EA_16BYTE : EA_8BYTE; - insOpts opt = genGetSimdInsOpt(attr, baseType); - - GetEmitter()->emitIns_R_R(ins, attr, targetReg, op1Reg, opt); - - genProduceReg(simdNode); -} - -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// This intrinsic takes two arguments. The first operand is narrowed to produce the -// lower elements of the results, and the second operand produces the high elements. -// -void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - emitAttr emitSize = emitTypeSize(simdType); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - assert(genIsValidFloatReg(op1Reg)); - assert(genIsValidFloatReg(op2Reg)); - assert(genIsValidFloatReg(targetReg)); - assert(op2Reg != targetReg); - assert(simdNode->GetSimdSize() == 16); - - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - assert((ins == INS_fcvtn) || (ins == INS_xtn)); - - instruction ins2 = (ins == INS_fcvtn) ? INS_fcvtn2 : INS_xtn2; - - insOpts opt = INS_OPTS_NONE; - insOpts opt2 = INS_OPTS_NONE; - - // This is not the same as genGetSimdInsOpt() - // Basetype is the soure operand type - // However encoding is based on the destination operand type which is 1/2 the basetype. 
- switch (baseType) - { - case TYP_ULONG: - case TYP_LONG: - case TYP_DOUBLE: - opt = INS_OPTS_2S; - opt2 = INS_OPTS_4S; - break; - case TYP_UINT: - case TYP_INT: - opt = INS_OPTS_4H; - opt2 = INS_OPTS_8H; - break; - case TYP_USHORT: - case TYP_SHORT: - opt = INS_OPTS_8B; - opt2 = INS_OPTS_16B; - break; - default: - assert(!"Unsupported narrowing element type"); - unreached(); - } - - GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, op1Reg, opt); - GetEmitter()->emitIns_R_R(ins2, EA_16BYTE, targetReg, op2Reg, opt2); - - genProduceReg(simdNode); -} - //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 4f1895918545c..18fce48d72b34 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3283,6 +3283,13 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdNarrowNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdSqrtNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); @@ -3293,6 +3300,12 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWidenLowerNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); + + GenTree* gtNewSimdWidenUpperNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWithElementNode(var_types type, GenTree* op1, GenTree* op2, diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7791a09d9c6fc..e7aded8d92c92 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20918,6 +20918,431 @@ GenTree* 
Compiler::gtNewSimdMinNode(var_types type, return gtNewSimdCndSelNode(type, op1, op1Dup, op2Dup, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } +GenTree* Compiler::gtNewSimdNarrowNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + assert(IsBaselineSimdIsaSupportedDebugOnly()); + + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + assert(op2 != nullptr); + assert(op2->TypeIs(type)); + + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType) && !varTypeIsLong(simdBaseType)); + + GenTree* tmp1; + GenTree* tmp2; + +#if defined(TARGET_XARCH) + GenTree* tmp3; + GenTree* tmp4; + + if (simdSize == 32) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + // This is the same in principle to the other comments below, however due to + // code formatting, its too long to reasonably display here. + + CorInfoType opBaseJitType = (simdBaseType == TYP_BYTE) ? 
CORINFO_TYPE_SHORT : CORINFO_TYPE_USHORT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x00FF), NI_Vector256_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp4 = gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, + simdSize, isSimdAsHWIntrinsic); + + CorInfoType permuteBaseJitType = (simdBaseType == TYP_BYTE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + return gtNewSimdHWIntrinsicNode(type, tmp4, gtNewIconNode(SHUFFLE_WYZX), NI_AVX2_Permute4x64, + permuteBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + case TYP_SHORT: + case TYP_USHORT: + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + // op1 = Elements 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U | 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U + // op2 = Elements 8L, 8U, 9L, 9U, AL, AU, BL, BU | CL, CU, DL, DU, EL, EU, FL, FU + // + // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, -- | 4L, --, 5L, --, 6L, --, 7L, -- + // tmp3 = Elements 8L, --, 9L, --, AL, --, BL, -- | CL, --, DL, --, EL, --, FL, -- + // tmp4 = Elements 0L, 1L, 2L, 3L, 8L, 9L, AL, BL | 4L, 5L, 6L, 7L, CL, DL, EL, FL + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L | 8L, 9L, AL, BL, CL, DL, EL, FL + // + // var tmp1 = Vector256.Create(0x0000FFFF).AsInt16(); + // var tmp2 = Avx2.And(op1.AsInt16(), tmp1); + // var tmp3 = Avx2.And(op2.AsInt16(), tmp1); + // var tmp4 = Avx2.PackUnsignedSaturate(tmp2, tmp3); + // return Avx2.Permute4x64(tmp4.AsUInt64(), SHUFFLE_WYZX).As(); + + CorInfoType opBaseJitType = (simdBaseType == TYP_SHORT) ? 
CORINFO_TYPE_INT : CORINFO_TYPE_UINT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x0000FFFF), NI_Vector256_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp4 = gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE41_PackUnsignedSaturate, CORINFO_TYPE_USHORT, + simdSize, isSimdAsHWIntrinsic); + + CorInfoType permuteBaseJitType = (simdBaseType == TYP_BYTE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + return gtNewSimdHWIntrinsicNode(type, tmp4, gtNewIconNode(SHUFFLE_WYZX), NI_AVX2_Permute4x64, + permuteBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + case TYP_INT: + case TYP_UINT: + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + // op1 = Elements 0, 1 | 2, 3; 0L, 0U, 1L, 1U | 2L, 2U, 3L, 3U + // op2 = Elements 4, 5 | 6, 7; 4L, 4U, 5L, 5U | 6L, 6U, 7L, 7U + // + // tmp1 = Elements 0L, 4L, 0U, 4U | 2L, 6L, 2U, 6U + // tmp2 = Elements 1L, 5L, 1U, 5U | 3L, 7L, 3U, 7U + // tmp3 = Elements 0L, 1L, 4L, 5L | 2L, 3L, 6L, 7L + // return Elements 0L, 1L, 2L, 3L | 4L, 5L, 6L, 7L + // + // var tmp1 = Avx2.UnpackLow(op1, op2); + // var tmp2 = Avx2.UnpackHigh(op1, op2); + // var tmp3 = Avx2.UnpackLow(tmp1, tmp2); + // return Avx2.Permute4x64(tmp3.AsUInt64(), SHUFFLE_WYZX).AsUInt32(); + + CorInfoType opBaseJitType = (simdBaseType == TYP_INT) ? 
CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + GenTree* op2Dup; + op2 = impCloneExpr(op2, &op2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op2 for vector narrow")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_AVX2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp3, gtNewIconNode(SHUFFLE_WYZX), NI_AVX2_Permute4x64, + opBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + case TYP_FLOAT: + { + // op1 = Elements 0, 1 | 2, 3 + // op2 = Elements 4, 5 | 6, 7 + // + // tmp1 = Elements 0, 1, 2, 3 | -, -, -, - + // tmp1 = Elements 4, 5, 6, 7 + // return Elements 0, 1, 2, 3 | 4, 5, 6, 7 + // + // var tmp1 = Avx.ConvertToVector128Single(op1).ToVector256Unsafe(); + // var tmp2 = Avx.ConvertToVector128Single(op2); + // return Avx.InsertVector128(tmp1, tmp2, 1); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, gtNewIconNode(1), NI_AVX_InsertVector128, + simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + default: + { + unreached(); + } + } + } + else + { + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + // op1 = 
Elements 0, 1, 2, 3, 4, 5, 6, 7; 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U, 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U + // op2 = Elements 8, 9, A, B, C, D, E, F; 8L, 8U, 9L, 9U, AL, AU, BL, BU, CL, CU, DL, DU, EL, EU, FL, FU + // + // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, --, 4L, --, 5L, --, 6L, --, 7L, -- + // tmp3 = Elements 8L, --, 9L, --, AL, --, BL, --, CL, --, DL, --, EL, --, FL, -- + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, AL, BL, CL, DL, EL, FL + // + // var tmp1 = Vector128.Create((ushort)(0x00FF)).AsSByte(); + // var tmp2 = Sse2.And(op1.AsSByte(), tmp1); + // var tmp3 = Sse2.And(op2.AsSByte(), tmp1); + // return Sse2.PackUnsignedSaturate(tmp2, tmp3).As(); + + CorInfoType opBaseJitType = (simdBaseType == TYP_BYTE) ? CORINFO_TYPE_SHORT : CORINFO_TYPE_USHORT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x00FF), NI_Vector128_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, + simdSize, isSimdAsHWIntrinsic); + } + + case TYP_SHORT: + case TYP_USHORT: + { + // op1 = Elements 0, 1, 2, 3; 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U + // op2 = Elements 4, 5, 6, 7; 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U + // + // ... + + CorInfoType opBaseJitType = (simdBaseType == TYP_SHORT) ? CORINFO_TYPE_INT : CORINFO_TYPE_UINT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // ...
+ // + // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, -- + // tmp3 = Elements 4L, --, 5L, --, 6L, --, 7L, -- + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L + // + // var tmp1 = Vector128.Create(0x0000FFFF).AsInt16(); + // var tmp2 = Sse2.And(op1.AsInt16(), tmp1); + // var tmp3 = Sse2.And(op2.AsInt16(), tmp1); + // return Sse2.PackUnsignedSaturate(tmp2, tmp3).As(); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x0000FFFF), NI_Vector128_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE41_PackUnsignedSaturate, + CORINFO_TYPE_USHORT, simdSize, isSimdAsHWIntrinsic); + } + else + { + // ... 
+ // + // tmp1 = Elements 0L, 4L, 0U, 4U, 1L, 5L, 1U, 5U + // tmp2 = Elements 2L, 6L, 2U, 6U, 3L, 7L, 3U, 7U + // tmp3 = Elements 0L, 2L, 4L, 6L, 0U, 2U, 4U, 6U + // tmp4 = Elements 1L, 3L, 5L, 7L, 1U, 3U, 5U, 7U + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L + // + // var tmp1 = Sse2.UnpackLow(op1.AsUInt16(), op2.AsUInt16()); + // var tmp2 = Sse2.UnpackHigh(op1.AsUInt16(), op2.AsUInt16()); + // var tmp3 = Sse2.UnpackLow(tmp1, tmp2); + // var tmp4 = Sse2.UnpackHigh(tmp1, tmp2); + // return Sse2.UnpackLow(tmp3, tmp4).As(); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + GenTree* op2Dup; + op2 = impCloneExpr(op2, &op2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + GenTree* tmp2Dup; + tmp2 = impCloneExpr(tmp2, &tmp2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp2 for vector narrow")); + + tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp4 = gtNewSimdHWIntrinsicNode(type, tmp1Dup, tmp2Dup, NI_SSE2_UnpackHigh, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp3, tmp4, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + } + + case TYP_INT: + case TYP_UINT: + { + // op1 = Elements 0, 1; 0L, 0U, 1L, 1U + // op2 = Elements 2, 3; 2L, 2U, 3L, 3U + // + // tmp1 = Elements 0L, 2L, 0U, 2U + // tmp2 = Elements 1L, 
3L, 1U, 3U + // return Elements 0L, 1L, 2L, 3L + // + // var tmp1 = Sse2.UnpackLow(op1.AsUInt32(), op2.AsUInt32()); + // var tmp2 = Sse2.UnpackHigh(op1.AsUInt32(), op2.AsUInt32()); + // return Sse2.UnpackLow(tmp1, tmp2).As(); + + CorInfoType opBaseJitType = (simdBaseType == TYP_INT) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + GenTree* op2Dup; + op2 = impCloneExpr(op2, &op2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op2 for vector narrow")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + + case TYP_FLOAT: + { + // op1 = Elements 0, 1 + // op2 = Elements 2, 3 + // + // tmp1 = Elements 0, 1, -, - + // tmp1 = Elements 2, 3, -, - + // return Elements 0, 1, 2, 3 + // + // var tmp1 = Sse2.ConvertToVector128Single(op1); + // var tmp2 = Sse2.ConvertToVector128Single(op2); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + CorInfoType opBaseJitType = CORINFO_TYPE_DOUBLE; + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, NI_SSE2_ConvertToVector128Single, opBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op2, NI_SSE2_ConvertToVector128Single, opBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE_MoveLowToHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + + default: + { + unreached(); + } + } + } +#elif defined(TARGET_ARM64) + if (simdSize == 16) + { + if (varTypeIsFloating(simdBaseType)) + { + // var tmp1 = 
AdvSimd.Arm64.ConvertToSingleLower(op1); + // return AdvSimd.Arm64.ConvertToSingleUpper(tmp1, op2); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, 8, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_Arm64_ConvertToSingleUpper, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } + else + { + // var tmp1 = AdvSimd.ExtractNarrowingLower(op1); + // return AdvSimd.ExtractNarrowingUpper(tmp1, op2); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, 8, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_ExtractNarrowingUpper, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } + } + else if (varTypeIsFloating(simdBaseType)) + { + // var tmp1 = op1.ToVector128Unsafe(); + // var tmp2 = AdvSimd.InsertScalar(tmp1.AsDouble(), 1, op2.AsDouble()); + // return AdvSimd.Arm64.ConvertToSingleLower(tmp2); + + CorInfoType tmp2BaseJitType = CORINFO_TYPE_DOUBLE; + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128Unsafe, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, gtNewIconNode(1), op2, NI_AdvSimd_InsertScalar, + tmp2BaseJitType, 16, isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + else + { + // var tmp1 = op1.ToVector128Unsafe(); + // var tmp2 = AdvSimd.InsertScalar(tmp1.AsUInt64(), 1, op2.AsUInt64()); + // return AdvSimd.ExtractNarrowingLower(tmp2).As(); + + CorInfoType tmp2BaseJitType = varTypeIsSigned(simdBaseType) ?
CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128Unsafe, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, gtNewIconNode(1), op2, NI_AdvSimd_InsertScalar, + tmp2BaseJitType, 16, isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 +} + GenTree* Compiler::gtNewSimdSqrtNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) { @@ -21055,6 +21480,364 @@ GenTree* Compiler::gtNewSimdUnOpNode(genTreeOps op, } } +GenTree* Compiler::gtNewSimdWidenLowerNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) +{ + assert(IsBaselineSimdIsaSupportedDebugOnly()); + + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType) && !varTypeIsLong(simdBaseType)); + + NamedIntrinsic intrinsic = NI_Illegal; + + GenTree* tmp1; + +#if defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + tmp1 = + gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_AVX2_ConvertToVector256Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_AVX2_ConvertToVector256Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_AVX2_ConvertToVector256Int64; + break; + } + + case TYP_FLOAT: + { + 
intrinsic = NI_AVX_ConvertToVector256Double; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else if ((simdBaseType == TYP_FLOAT) || compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_SSE41_ConvertToVector128Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_SSE41_ConvertToVector128Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_SSE41_ConvertToVector128Int64; + break; + } + + case TYP_FLOAT: + { + intrinsic = NI_SSE2_ConvertToVector128Double; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else + { + tmp1 = gtNewSimdZeroNode(type, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + if (varTypeIsSigned(simdBaseType)) + { + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector widen lower")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + op1 = op1Dup; + } + + return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } +#elif defined(TARGET_ARM64) + if (simdSize == 16) + { + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + else + { + assert(simdSize == 8); + tmp1 = op1; + } + + if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + intrinsic = NI_AdvSimd_Arm64_ConvertToDouble; + } + else if 
(varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningLower; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningLower; + } + + assert(intrinsic != NI_Illegal); + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, 8, isSimdAsHWIntrinsic); + + if (simdSize == 8) + { + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_GetLower, simdBaseJitType, 16, isSimdAsHWIntrinsic); + } + + return tmp1; +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 +} + +GenTree* Compiler::gtNewSimdWidenUpperNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) +{ + assert(IsBaselineSimdIsaSupportedDebugOnly()); + + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType) && !varTypeIsLong(simdBaseType)); + + NamedIntrinsic intrinsic = NI_Illegal; + + GenTree* tmp1; + +#if defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_AVX2_ConvertToVector256Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_AVX2_ConvertToVector256Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_AVX2_ConvertToVector256Int64; + break; + } + + case TYP_FLOAT: + { + intrinsic = NI_AVX_ConvertToVector256Double; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, 
simdSize, isSimdAsHWIntrinsic); + } + else if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector widen upper")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_SSE_MoveHighToLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, NI_SSE2_ConvertToVector128Double, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + else if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(8), NI_SSE2_ShiftRightLogical128BitLane, + simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_SSE41_ConvertToVector128Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_SSE41_ConvertToVector128Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_SSE41_ConvertToVector128Int64; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else + { + tmp1 = gtNewSimdZeroNode(type, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + if (varTypeIsSigned(simdBaseType)) + { + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector widen upper")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + op1 = op1Dup; + } + + return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } 
+#elif defined(TARGET_ARM64) + GenTree* zero; + + if (simdSize == 16) + { + if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + intrinsic = NI_AdvSimd_Arm64_ConvertToDoubleUpper; + } + else if (varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningUpper; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningUpper; + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else + { + assert(simdSize == 8); + ssize_t index = 8 / genTypeSize(simdBaseType); + + if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + intrinsic = NI_AdvSimd_Arm64_ConvertToDouble; + } + else if (varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningLower; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningLower; + } + + assert(intrinsic != NI_Illegal); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + zero = gtNewSimdZeroNode(TYP_SIMD16, simdBaseJitType, 16, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, + simdBaseJitType, 16, isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_GetLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 +} + GenTree* Compiler::gtNewSimdWithElementNode(var_types type, GenTree* op1, GenTree* op2, diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 98795c7941f89..8f4d0f0ebf96f 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -821,7 +821,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_Narrow: { assert(sig->numArgs == 2); - // TODO-ARM64-CQ: These intrinsics should be 
accelerated. + + op2 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } @@ -899,6 +904,28 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector64_WidenLower: + case NI_Vector128_WidenLower: + { + assert(sig->numArgs == 1); + + op1 = impSIMDPopStack(retType); + + retNode = gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector64_WidenUpper: + case NI_Vector128_WidenUpper: + { + assert(sig->numArgs == 1); + + op1 = impSIMDPopStack(retType); + + retNode = gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + } + case NI_Vector64_WithElement: case NI_Vector128_WithElement: { diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index ba185f3afeb91..3f6c90a1315f8 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -87,6 +87,8 @@ HARDWARE_INTRINSIC(Vector64, Sqrt, HARDWARE_INTRINSIC(Vector64, ToScalar, 8, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector64, ToVector128, 8, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector64, ToVector128Unsafe, 8, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector64, WidenLower, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector64, WidenUpper, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector64, WithElement, 8, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, Xor, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -171,6 +173,8 @@ HARDWARE_INTRINSIC(Vector128, op_UnaryPlus, HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WithElement, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 00a402260fc42..7179e9854b382 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -104,6 +104,8 @@ HARDWARE_INTRINSIC(Vector128, Sqrt, HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WithElement, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -185,6 +187,8 @@ HARDWARE_INTRINSIC(Vector256, op_UnaryPlus, HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) 
+HARDWARE_INTRINSIC(Vector256, WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 46fb8b7c3f86c..c0ed9a4de197f 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1294,7 +1294,15 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_Narrow: { assert(sig->numArgs == 2); - // TODO-XARCH-CQ: These intrinsics should be accelerated + + if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2)) + { + op2 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + } break; } @@ -1416,6 +1424,36 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_WidenLower: + case NI_Vector256_WidenLower: + { + assert(sig->numArgs == 1); + + if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2)) + { + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + } + break; + } + + case 
NI_Vector128_WidenUpper: + case NI_Vector256_WidenUpper: + { + assert(sig->numArgs == 1); + + if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2)) + { + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + } + break; + } + case NI_Vector128_WithElement: case NI_Vector256_WithElement: { diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 7b5a012e1dd2c..8044043ba58fd 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -820,8 +820,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicConvertToInt32: case SIMDIntrinsicConvertToDouble: case SIMDIntrinsicConvertToInt64: - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: // No special handling required. break; @@ -832,17 +830,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) // No special handling required. break; - case SIMDIntrinsicNarrow: - { - // Op1 will write to dst before Op2 is free - BuildUse(op1); - RefPosition* op2Use = BuildUse(op2); - setDelayFree(op2Use); - srcCount = 2; - buildUses = false; - break; - } - case SIMDIntrinsicInitN: { var_types baseType = simdTree->GetSimdBaseType(); @@ -879,7 +866,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicCopyToArrayX: case SIMDIntrinsicNone: case SIMDIntrinsicHWAccel: - case SIMDIntrinsicWiden: case SIMDIntrinsicInvalid: assert(!"These intrinsics should not be seen during register allocation"); FALLTHROUGH; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 358630dc9fa30..9b23c3a57d223 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1975,16 +1975,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicConvertToInt32: break; - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - if (varTypeIsIntegral(simdTree->GetSimdBaseType())) - { 
- // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - } - break; - case SIMDIntrinsicConvertToInt64: // We need an internal register different from targetReg. setInternalRegsDelayFree = true; @@ -2018,16 +2008,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) buildInternalIntRegisterDefForNode(simdTree); break; - case SIMDIntrinsicNarrow: - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->GetSimdBaseType() != TYP_DOUBLE)) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - break; - case SIMDIntrinsicShuffleSSE2: // Second operand is an integer constant and marked as contained. assert(simdTree->gtGetOp2()->isContainedIntOrIImmed()); diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index a3573b6eeb51e..d60fb49a5d9b0 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -1204,9 +1204,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in case SIMDIntrinsicConvertToDouble: case SIMDIntrinsicConvertToInt32: case SIMDIntrinsicConvertToInt64: - case SIMDIntrinsicNarrow: - case SIMDIntrinsicWidenHi: - case SIMDIntrinsicWidenLo: return true; default: @@ -2359,50 +2356,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, } break; - case SIMDIntrinsicNarrow: - { - assert(!instMethod); - op2 = impSIMDPopStack(simdType); - op1 = impSIMDPopStack(simdType); - // op1 and op2 are two input Vector. 
- simdTree = gtNewSIMDNode(simdType, op1, op2, simdIntrinsicID, simdBaseJitType, size); - retVal = simdTree; - } - break; - - case SIMDIntrinsicWiden: - { - GenTree* dstAddrHi = impSIMDPopStack(TYP_BYREF); - GenTree* dstAddrLo = impSIMDPopStack(TYP_BYREF); - op1 = impSIMDPopStack(simdType); - // op1 must have a valid class handle; the following method will assert it. - CORINFO_CLASS_HANDLE op1Handle = gtGetStructHandle(op1); - GenTree* dupOp1 = fgInsertCommaFormTemp(&op1, op1Handle); - - // Widen the lower half and assign it to dstAddrLo. - simdTree = gtNewSIMDNode(simdType, op1, nullptr, SIMDIntrinsicWidenLo, simdBaseJitType, size); - // TODO-1stClassStructs: With the introduction of ClassLayout it would be preferrable to use - // GT_OBJ instead of GT_BLK nodes to avoid losing information about the actual vector type. - GenTree* loDest = new (this, GT_BLK) - GenTreeBlk(GT_BLK, simdType, dstAddrLo, typGetBlkLayout(getSIMDTypeSizeInBytes(clsHnd))); - GenTree* loAsg = gtNewBlkOpNode(loDest, simdTree, - false, // not volatile - true); // copyBlock - loAsg->gtFlags |= ((simdTree->gtFlags | dstAddrLo->gtFlags) & GTF_ALL_EFFECT); - - // Widen the upper half and assign it to dstAddrHi. 
- simdTree = gtNewSIMDNode(simdType, dupOp1, nullptr, SIMDIntrinsicWidenHi, simdBaseJitType, size); - GenTree* hiDest = new (this, GT_BLK) - GenTreeBlk(GT_BLK, simdType, dstAddrHi, typGetBlkLayout(getSIMDTypeSizeInBytes(clsHnd))); - GenTree* hiAsg = gtNewBlkOpNode(hiDest, simdTree, - false, // not volatile - true); // copyBlock - hiAsg->gtFlags |= ((simdTree->gtFlags | dstAddrHi->gtFlags) & GTF_ALL_EFFECT); - - retVal = gtNewOperNode(GT_COMMA, simdType, loAsg, hiAsg); - } - break; - case SIMDIntrinsicHWAccel: { GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, 1); diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 3aa47a6b50d98..7f704cd567292 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -229,7 +229,8 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic, simdBaseJitType = getBaseJitTypeAndSizeOfSIMDType(clsHnd, &simdSize); } } - else if ((clsHnd == m_simdHandleCache->SIMDVectorHandle) && (numArgs != 0)) + else if ((clsHnd == m_simdHandleCache->SIMDVectorHandle) && (numArgs != 0) && + !SimdAsHWIntrinsicInfo::KeepBaseTypeFromRet(intrinsic)) { // We need to fixup the clsHnd in the case we are an intrinsic on Vector // The first argument will be the appropriate Vector handle to use @@ -674,6 +675,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, { return gtNewSimdAbsNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Sum: { @@ -693,6 +695,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return gtNewSimdAsHWIntrinsicNode(retType, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize); } + case NI_VectorT256_Sum: { // HorizontalAdd combines pairs so we need log2(vectorLength) passes to sum all elements together. 
@@ -730,11 +733,26 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return gtNewSimdAsHWIntrinsicNode(retType, op1, NI_Vector128_ToScalar, simdBaseJitType, 16); } + + case NI_VectorT128_WidenLower: + case NI_VectorT256_WidenLower: + { + return gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + + case NI_VectorT128_WidenUpper: + case NI_VectorT256_WidenUpper: + { + return gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } #elif defined(TARGET_ARM64) case NI_VectorT128_Abs: { return gtNewSimdAbsNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Sum: { GenTree* tmp; @@ -782,6 +800,18 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, } } } + + case NI_VectorT128_WidenLower: + { + return gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + + case NI_VectorT128_WidenUpper: + { + return gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } #else #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 @@ -915,6 +945,13 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Narrow: + case NI_VectorT256_Narrow: + { + return gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_op_Multiply: case NI_VectorT256_op_Multiply: { @@ -954,6 +991,12 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Narrow: + { + return gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_op_Multiply: { return gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize, 
diff --git a/src/coreclr/jit/simdashwintrinsic.h b/src/coreclr/jit/simdashwintrinsic.h index 176507d0b6653..a48729412b954 100644 --- a/src/coreclr/jit/simdashwintrinsic.h +++ b/src/coreclr/jit/simdashwintrinsic.h @@ -29,6 +29,9 @@ enum class SimdAsHWIntrinsicFlag : unsigned int // Base type should come from the this argument BaseTypeFromThisArg = 0x08, + + // For SIMDVectorHandle, keep the base type from the result type + KeepBaseTypeFromRet = 0x10, }; inline SimdAsHWIntrinsicFlag operator~(SimdAsHWIntrinsicFlag value) @@ -133,6 +136,12 @@ struct SimdAsHWIntrinsicInfo SimdAsHWIntrinsicFlag flags = lookupFlags(id); return (flags & SimdAsHWIntrinsicFlag::BaseTypeFromThisArg) == SimdAsHWIntrinsicFlag::BaseTypeFromThisArg; } + + static bool KeepBaseTypeFromRet(NamedIntrinsic id) + { + SimdAsHWIntrinsicFlag flags = lookupFlags(id); + return (flags & SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) == SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet; + } }; #endif // _SIMD_AS_HWINTRINSIC_H_ diff --git a/src/coreclr/jit/simdashwintrinsiclistarm64.h b/src/coreclr/jit/simdashwintrinsiclistarm64.h index 229222882f720..2810a0e6ecfb1 100644 --- a/src/coreclr/jit/simdashwintrinsiclistarm64.h +++ b/src/coreclr/jit/simdashwintrinsiclistarm64.h @@ -121,6 +121,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThan, SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThanOrEqual, 2, {NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_Arm64_CompareLessThanOrEqual, NI_AdvSimd_Arm64_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_Arm64_CompareLessThanOrEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Max, 2, {NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_AdvSimd_Max, NI_AdvSimd_Arm64_Max}, 
SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Min, 2, {NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_AdvSimd_Min, NI_AdvSimd_Arm64_Min}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, Narrow, 2, {NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow}, SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Addition, 2, {NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Arm64_Add}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseAnd, 2, {NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseOr, 2, {NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or}, SimdAsHWIntrinsicFlag::None) @@ -133,6 +134,8 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Multiply, SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Subtraction, 2, {NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Arm64_Subtract}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, SquareRoot, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Arm64_Sqrt, NI_AdvSimd_Arm64_Sqrt}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Sum, 1, {NI_VectorT128_Sum, NI_VectorT128_Sum, 
NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenLower, 1, {NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenUpper, 1, {NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper}, SimdAsHWIntrinsicFlag::None) #undef SIMD_AS_HWINTRINSIC_NM #undef SIMD_AS_HWINTRINSIC_ID diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h index 92d665c2de8a7..08cf517828283 100644 --- a/src/coreclr/jit/simdashwintrinsiclistxarch.h +++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h @@ -121,6 +121,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThan, SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThanOrEqual, 2, {NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_SSE_CompareLessThanOrEqual, NI_SSE2_CompareLessThanOrEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Max, 2, {NI_VectorT128_Max, NI_SSE2_Max, NI_SSE2_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_SSE_Max, NI_SSE2_Max}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Min, 2, {NI_VectorT128_Min, NI_SSE2_Min, NI_SSE2_Min, NI_VectorT128_Min, 
NI_VectorT128_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_SSE_Min, NI_SSE2_Min}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, Narrow, 2, {NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow}, SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Addition, 2, {NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE_Add, NI_SSE2_Add}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseAnd, 2, {NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE_And, NI_SSE2_And}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseOr, 2, {NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE_Or, NI_SSE2_Or}, SimdAsHWIntrinsicFlag::None) @@ -133,6 +134,8 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Multiply, SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Subtraction, 2, {NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE_Subtract, NI_SSE2_Subtract}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, SquareRoot, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Sqrt, NI_SSE2_Sqrt}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Sum, 1, {NI_Illegal, NI_Illegal, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_Illegal, NI_Illegal, NI_VectorT128_Sum, NI_VectorT128_Sum}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenLower, 1, {NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, 
NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenUpper, 1, {NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper}, SimdAsHWIntrinsicFlag::None) // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // ISA ID Name NumArg Instructions Flags @@ -160,6 +163,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, LessThan, SIMD_AS_HWINTRINSIC_ID(VectorT256, LessThanOrEqual, 2, {NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_AVX_CompareLessThanOrEqual, NI_AVX_CompareLessThanOrEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Max, 2, {NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_VectorT256_Max, NI_VectorT256_Max, NI_AVX_Max, NI_AVX_Max}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Min, 2, {NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_VectorT256_Min, NI_VectorT256_Min, NI_AVX_Min, NI_AVX_Min}, 
SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, Narrow, 2, {NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow}, SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) SIMD_AS_HWINTRINSIC_ID(VectorT256, op_Addition, 2, {NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX_Add, NI_AVX_Add}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, op_BitwiseAnd, 2, {NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX_And, NI_AVX_And}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, op_BitwiseOr, 2, {NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX_Or, NI_AVX_Or}, SimdAsHWIntrinsicFlag::None) @@ -172,6 +176,8 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, op_Multiply, SIMD_AS_HWINTRINSIC_ID(VectorT256, op_Subtraction, 2, {NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX_Subtract, NI_AVX_Subtract}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, SquareRoot, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AVX_Sqrt, NI_AVX_Sqrt}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Sum, 1, {NI_Illegal, NI_Illegal, NI_VectorT256_Sum, NI_VectorT256_Sum, NI_VectorT256_Sum, NI_VectorT256_Sum, NI_Illegal, NI_Illegal, NI_VectorT256_Sum, NI_VectorT256_Sum}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, WidenLower, 1, {NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, 
NI_VectorT256_WidenLower, NI_VectorT256_WidenLower}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, WidenUpper, 1, {NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper}, SimdAsHWIntrinsicFlag::None) #undef SIMD_AS_HWINTRINSIC_NM #undef SIMD_AS_HWINTRINSIC_ID diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index 4523fe48a896e..5a3f0296d1813 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -241,86 +241,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type result = INS_cvttsd2si; break; - case SIMDIntrinsicNarrow: - // Note that for the integer types the caller must zero the upper bits of - // each source element, since the instructions saturate. - switch (baseType) - { - case TYP_INT: - case TYP_UINT: - if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) - { - result = INS_packusdw; - } - else - { - result = INS_packssdw; - } - break; - case TYP_SHORT: - case TYP_USHORT: - result = INS_packuswb; - break; - default: - assert(!"Invalid baseType for SIMDIntrinsicNarrow"); - result = INS_invalid; - break; - } - break; - - case SIMDIntrinsicWidenLo: - // Some of these have multiple instruction implementations, with one instruction to widen the lo half, - // and another to widen the hi half. 
- switch (baseType) - { - case TYP_FLOAT: - result = INS_cvtps2pd; - break; - case TYP_INT: - case TYP_UINT: - result = INS_punpckldq; - break; - case TYP_SHORT: - case TYP_USHORT: - result = INS_punpcklwd; - break; - case TYP_BYTE: - case TYP_UBYTE: - result = INS_punpcklbw; - break; - default: - assert(!"Invalid baseType for SIMDIntrinsicWidenLo"); - result = INS_invalid; - break; - } - break; - - case SIMDIntrinsicWidenHi: - switch (baseType) - { - case TYP_FLOAT: - // For this case, we actually use the same instruction. - result = INS_cvtps2pd; - break; - case TYP_INT: - case TYP_UINT: - result = INS_punpckhdq; - break; - case TYP_SHORT: - case TYP_USHORT: - result = INS_punpckhwd; - break; - case TYP_BYTE: - case TYP_UBYTE: - result = INS_punpckhbw; - break; - default: - assert(!"Invalid baseType for SIMDIntrinsicWidenHi"); - result = INS_invalid; - break; - } - break; - case SIMDIntrinsicShiftLeftInternal: switch (baseType) { @@ -1193,245 +1113,6 @@ void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, r } } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// The Widen intrinsics are broken into separate intrinsics for the two results. 
-// -void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) -{ - assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || - (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); - - GenTree* op1 = simdNode->gtGetOp1(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - SIMDLevel level = compiler->getSIMDSupportLevel(); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber srcReg = op1Reg; - emitAttr emitSize = emitActualTypeSize(simdType); - instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - - if (baseType == TYP_FLOAT) - { - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) - { - genSIMDExtractUpperHalf(simdNode, srcReg, targetReg); - srcReg = targetReg; - } - inst_RV_RV(widenIns, targetReg, srcReg, simdType); - } - else - { - // We will generate the following on AVX: - // vpermq targetReg, op1Reg, 0xd4|0xe8 - // vpxor tmpReg, tmpReg - // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed) - // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - assert(tmpReg != op1Reg); - - if (level == SIMD_AVX2_Supported) - { - // permute op1Reg and put it into targetReg - unsigned ival = 0xd4; - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) - { - ival = 0xe8; - } - assert((ival >= 0) && (ival <= 255)); - GetEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, (int8_t)ival); - } - else - { - inst_Mov(simdType, targetReg, op1Reg, /* canSkip */ true); - } - - genSIMDZero(simdType, baseType, tmpReg); - if (!varTypeIsUnsigned(baseType)) - { - instruction compareIns = INS_invalid; - - if (baseType == TYP_INT) - { - compareIns = INS_pcmpgtd; - } - else if (baseType == TYP_SHORT) - { - compareIns = INS_pcmpgtw; - } - else if (baseType == TYP_BYTE) - { - compareIns = INS_pcmpgtb; - } - else if 
((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) - { - compareIns = INS_pcmpgtq; - } - - assert(compareIns != INS_invalid); - inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize); - } - inst_RV_RV(widenIns, targetReg, tmpReg, simdType); - } - genProduceReg(simdNode); -} - -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// This intrinsic takes two arguments. The first operand is narrowed to produce the -// lower elements of the results, and the second operand produces the high elements. -// -void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - emitAttr emitSize = emitTypeSize(simdType); - SIMDLevel level = compiler->getSIMDSupportLevel(); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - if (baseType == TYP_DOUBLE) - { - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - - inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); - inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); - // Now insert the high-order result (in tmpReg) into the upper half of targetReg. - if (level == SIMD_AVX2_Supported) - { - GetEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); - } - else - { - inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, (int8_t)SHUFFLE_YXYX); - } - } - else if (varTypeIsLong(baseType)) - { - if (level == SIMD_AVX2_Supported) - { - // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg. 
- // We will generate the following: - // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg) - // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2) - // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg) - // mov tmpReg2, op1Reg - // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2) - // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg - // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg - // punpcklqdq tgtReg, tmpReg - regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); - regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - GetEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); - GetEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); - GetEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01); - inst_Mov(simdType, tmpReg2, op1Reg, /* canSkip */ false, emitSize); - GetEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, (int8_t)SHUFFLE_XXZX); - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, (int8_t)SHUFFLE_XXZX); - inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize); - } - else - { - // We will generate the following: - // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements) - // psrldq targetReg, 8 (shift them right to get zeros in the high elements) - // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements) - // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements) - // por targetReg, tmpReg - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); - instruction shiftRightIns = 
getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); - emitAttr emitSize = emitTypeSize(simdType); - - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, (int8_t)SHUFFLE_ZXXX); - GetEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8); - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, (int8_t)SHUFFLE_XXZX); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8); - inst_RV_RV(INS_por, targetReg, tmpReg, simdType); - } - } - else - { - // We will generate the following: - // mov targetReg, op1Reg - // mov tmpReg, op2Reg - // psll? targetReg, shiftCount - // pslr? targetReg, shiftCount - // psll? tmpReg, shiftCount - // pslr? tmpReg, shiftCount - // targetReg, tmpReg - // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType), - // and is the appropriate instruction to pack the result (note that we have to truncate to - // get CLR type semantics; otherwise it will saturate). - // - int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2); - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); - instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); - - assert((shiftCount >= 0) && (shiftCount <= 127)); - - if (level == SIMD_AVX2_Supported) - { - regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); - regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - - // The AVX instructions generally operate on "lanes", so we have to permute the - // inputs so that the destination register has the low 128-bit halves of the two - // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs. 
- GetEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20); - GetEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount); - GetEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); - GetEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount); - inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType)); - } - else - { - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - - inst_Mov(simdType, targetReg, op1Reg, /* canSkip */ false, emitSize); - inst_Mov(simdType, tmpReg, op2Reg, /* canSkip */ false, emitSize); - - instruction tmpShiftRight = shiftRightIns; - if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported) - { - tmpShiftRight = INS_psrad; - } - - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount); - GetEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); - GetEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount); - inst_RV_RV(ins, targetReg, tmpReg, simdType); - } - } - genProduceReg(simdNode); -} - //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. 
@@ -1955,15 +1636,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsic64BitConvert(simdNode); break; - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - genSIMDIntrinsicWiden(simdNode); - break; - - case SIMDIntrinsicNarrow: - genSIMDIntrinsicNarrow(simdNode); - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: diff --git a/src/coreclr/jit/simdintrinsiclist.h b/src/coreclr/jit/simdintrinsiclist.h index 258fecfdd6578..0b354b533702c 100644 --- a/src/coreclr/jit/simdintrinsiclist.h +++ b/src/coreclr/jit/simdintrinsiclist.h @@ -78,10 +78,6 @@ SIMD_INTRINSIC("ConvertToDouble", false, ConvertToDouble, SIMD_INTRINSIC("ConvertToInt32", false, ConvertToInt32, "ConvertToInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Convert double to long SIMD_INTRINSIC("ConvertToInt64", false, ConvertToInt64, "ConvertToInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -// Narrow two input Vectors to a single Vector. The return value's lower elements are the elements from src1, and the upper elements are from src2. -SIMD_INTRINSIC("Narrow", false, Narrow, "Narrow", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_SHORT, TYP_UINT, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -// Widen one input Vector to two Vectors: dest1 contains the lower half of elements in src, and dest2 contains the upper half of elements in src. 
-SIMD_INTRINSIC("Widen", false, Widen, "Widen", TYP_VOID, 3, {TYP_STRUCT, TYP_BYREF, TYP_BYREF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Miscellaneous SIMD_INTRINSIC("get_IsHardwareAccelerated", false, HWAccel, "HWAccel", TYP_BOOL, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) @@ -100,10 +96,6 @@ SIMD_INTRINSIC("ShiftRightInternal", false, ShiftRightInternal, SIMD_INTRINSIC("UpperSave", false, UpperSave, "UpperSave Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) SIMD_INTRINSIC("UpperRestore", false, UpperRestore, "UpperRestore Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -// Internal intrinsics for Widen -SIMD_INTRINSIC("WidenHi", false, WidenHi, "WidenHi", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -SIMD_INTRINSIC("WidenLo", false, WidenLo, "WidenLo", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) - SIMD_INTRINSIC(nullptr, false, Invalid, "Invalid", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) #undef SIMD_INTRINSIC #else // !defined(TARGET_XARCH) && !defined(TARGET_ARM64) diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 385cd20a124c3..a09e2f84c78f4 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -6241,10 +6241,6 @@ void 
ValueNumStore::InitValueNumStoreStatics() // SIMDIntrinsicInit has an entry of 2 for numArgs, but it only has one normal arg ValueNumFuncSetArity(VNF_SIMD_Init, 1); - // SIMDIntrinsicWidenHi has an entry of 2 for numArgs, but it only has one normal arg - ValueNumFuncSetArity(VNF_SIMD_WidenHi, 1); - // SIMDIntrinsicWidenLo has an entry of 2 for numArgs, but it only has one normal arg - ValueNumFuncSetArity(VNF_SIMD_WidenLo, 1); // Some SIMD intrinsic nodes have an extra VNF_SimdType arg // diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs index 2e030002cd6e0..70c36a619ec63 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs @@ -1181,210 +1181,295 @@ public static Vector Subtract(Vector left, Vector right) /// A vector that will contain the widened result of the lower half of . /// A vector that will contain the widened result of the upper half of . [CLSCompliant(false)] - [Intrinsic] public static unsafe void Widen(Vector source, out Vector low, out Vector high) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + low = WidenLower(source); + high = WidenUpper(source); + } - for (int i = 0; i < Vector.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); - } + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . 
+ public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } - for (int i = Vector.Count; i < Vector.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); - } + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + [CLSCompliant(false)] + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } - low = lowerResult; - high = upperResult; + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); } - /// Widens a into two . + /// Widens a into two . /// The vector whose elements are to be widened. /// A vector that will contain the widened result of the lower half of . /// A vector that will contain the widened result of the upper half of . + [CLSCompliant(false)] + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } + + /// Widens a into two . + /// The vector whose elements are to be widened. 
+ /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } + + /// Computes the exclusive-or of two vectors. + /// The vector to exclusive-or with . + /// The vector to exclusive-or with . + /// The type of the elements in the vector. + /// The exclusive-or of and . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector Xor(Vector left, Vector right) + where T : struct => left ^ right; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static T GetElementUnsafe(in this Vector vector, int index) + where T : struct { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Debug.Assert((index >= 0) && (index < Vector.Count)); + return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + } - for (int i = 0; i < Vector.Count; i++) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetElementUnsafe(in this Vector vector, int index, T value) + where T : struct + { + Debug.Assert((index >= 0) && (index < Vector.Count)); + Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + } + + [Intrinsic] + internal static Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { - var value = (int)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (ushort)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return lower; + } + + [Intrinsic] + internal static unsafe Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for 
(int i = 0; i < Vector.Count; i++) { var value = (int)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + lower.SetElementUnsafe(i, value); } - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenLower(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector lower); for (int i = 0; i < Vector.Count; i++) { var value = (long)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); - } - - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . 
- [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenLower(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector lower); for (int i = 0; i < Vector.Count; i++) { var value = (short)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return lower; + } + + [Intrinsic] + internal static unsafe Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { - var value = (short)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (double)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . 
[Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenLower(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector lower); - for (int i = 0; i < Vector.Count; i++) + for (int i = 0; i < Vector.Count; i++) { - var value = (double)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (uint)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return lower; + } + + [Intrinsic] + internal static unsafe Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { - var value = (double)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (ulong)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . 
- [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static Vector WidenUpper(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector upper); - for (int i = 0; i < Vector.Count; i++) + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (uint)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return upper; + } + + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) + { + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (uint)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - low = lowerResult; - high = upperResult; + return upper; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . 
- [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenUpper(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector upper); - for (int i = 0; i < Vector.Count; i++) + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (ulong)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return upper; + } + + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) + { + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (ulong)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - low = lowerResult; - high = upperResult; + return upper; } - /// Computes the exclusive-or of two vectors. - /// The vector to exclusive-or with . - /// The vector to exclusive-or with . - /// The type of the elements in the vector. - /// The exclusive-or of and . 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector Xor(Vector left, Vector right) - where T : struct => left ^ right; + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) + { + Unsafe.SkipInit(out Vector upper); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static T GetElementUnsafe(in this Vector vector, int index) - where T : struct + for (int i = Vector.Count; i < Vector.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) { - Debug.Assert((index >= 0) && (index < Vector.Count)); - return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); + } + + return upper; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void SetElementUnsafe(in this Vector vector, int index, T value) - where T : struct + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) { - Debug.Assert((index >= 0) && (index < Vector.Count)); - Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); + } + + return upper; } /// diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index f2c2784969b3b..445a0af111baa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -2690,173 
+2690,47 @@ public static bool TryCopyTo(this Vector128 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
- [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
- [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
[CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. /// The type of the input vector. @@ -2962,5 +2836,201 @@ internal static void SetElementUnsafe(in this Vector128 vector, int index, Debug.Assert((index >= 0) && (index < Vector128.Count)); Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; } + + [Intrinsic] + internal static Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for 
(int i = 0; i < Vector128.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe 
Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 72ab4796336f5..d696075b9c972 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -2788,173 +2788,47 @@ public static bool TryCopyTo(this Vector256 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
[CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
- [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
- [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
[CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. /// The type of the input vector. @@ -3076,5 +2950,201 @@ internal static void SetElementUnsafe(in this Vector256 vector, int index, Debug.Assert((index >= 0) && (index < Vector256.Count)); Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; } + + [Intrinsic] + internal static Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for 
(int i = 0; i < Vector256.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + internal static Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe 
Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs index dbfea1c89883e..6783c87b037f8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs @@ -1994,35 +1994,116 @@ public static bool TryCopyTo(this Vector64 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
[CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + [CLSCompliant(false)] + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + [CLSCompliant(false)] + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . 
+ [CLSCompliant(false)] + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. + /// The type of the input vector. + /// The vector to get the remaining elements from. + /// The index of the element to set. + /// The value to set the element to. + /// A with the value of the element at set to and the remaining elements set to the same value as that in . + /// The type of () is not supported. + /// was less than zero or greater than the number of elements. + [Intrinsic] + public static Vector64 WithElement(this Vector64 vector, int index, T value) + where T : struct { - Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); + ThrowHelper.ThrowForUnsupportedIntrinsicsVector64BaseType(); - for (int i = 0; i < Vector64.Count; i++) + if ((uint)(index) >= (uint)(Vector64.Count)) { - var value = (ushort)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index); } - for (int i = Vector64.Count; i < Vector64.Count; i++) + Vector64 result = vector; + result.SetElementUnsafe(index, value); + return result; + } + + /// Computes the exclusive-or of two vectors. + /// The vector to exclusive-or with . + /// The vector to exclusive-or with . + /// The type of the elements in the vector. + /// The exclusive-or of and . 
+ [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector64 Xor(Vector64 left, Vector64 right) + where T : struct => left ^ right; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static T GetElementUnsafe(in this Vector64 vector, int index) + where T : struct + { + Debug.Assert((index >= 0) && (index < Vector64.Count)); + return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetElementUnsafe(in this Vector64 vector, int index, T value) + where T : struct + { + Debug.Assert((index >= 0) && (index < Vector64.Count)); + Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + } + + [Intrinsic] + internal static Vector64 WidenLower(Vector64 source) + { + Unsafe.SkipInit(out Vector64 lower); + + for (int i = 0; i < Vector64.Count; i++) { var value = (ushort)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); + lower.SetElementUnsafe(i, value); } - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2030,23 +2111,13 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64.Count; i < Vector64.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . 
[Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2054,24 +2125,13 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . - [CLSCompliant(false)] [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2079,23 +2139,13 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . 
[Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2103,24 +2153,13 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vect lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . - [CLSCompliant(false)] [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2128,24 +2167,13 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . 
- [CLSCompliant(false)] [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2153,63 +2181,105 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) + return lower; + } + + [Intrinsic] + internal static Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) { - var value = (ulong)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); } - return (lower, upper); + return upper; } - /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. - /// The type of the input vector. - /// The vector to get the remaining elements from. - /// The index of the element to set. - /// The value to set the element to. - /// A with the value of the element at set to and the remaining elements set to the same value as that in . - /// The type of () is not supported. - /// was less than zero or greater than the number of elements. 
[Intrinsic] - public static Vector64 WithElement(this Vector64 vector, int index, T value) - where T : struct + internal static unsafe Vector64 WidenUpper(Vector64 source) { - ThrowHelper.ThrowForUnsupportedIntrinsicsVector64BaseType(); + Unsafe.SkipInit(out Vector64 upper); - if ((uint)(index) >= (uint)(Vector64.Count)) + for (int i = Vector64.Count; i < Vector64.Count; i++) { - ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index); + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); } - Vector64 result = vector; - result.SetElementUnsafe(index, value); - return result; + return upper; } - /// Computes the exclusive-or of two vectors. - /// The vector to exclusive-or with . - /// The vector to exclusive-or with . - /// The type of the elements in the vector. - /// The exclusive-or of and . [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector64 Xor(Vector64 left, Vector64 right) - where T : struct => left ^ right; + internal static unsafe Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static T GetElementUnsafe(in this Vector64 vector, int index) - where T : struct + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector64 WidenUpper(Vector64 source) { - Debug.Assert((index >= 0) && (index < Vector64.Count)); - return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void 
SetElementUnsafe(in this Vector64 vector, int index, T value) - where T : struct + [Intrinsic] + internal static unsafe Vector64 WidenUpper(Vector64 source) { - Debug.Assert((index >= 0) && (index < Vector64.Count)); - Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; } } }