-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: VPCLMULQDQ Intrinsics #95772
Comments
Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics Issue DetailsBackground and motivation
API Proposalnamespace System.Runtime.Intrinsics.X86;
/// <summary>
/// This class provides access to Intel VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq256 : Pclmulqdq
{
internal Pclmulqdq256() { }
// This would depend on the VPCLMULQDQ CPUID bit for VEX encoding and VPCLMULQDQ + AVX512VL for EVEX
public static new bool IsSupported { get => IsSupported; }
[Intrinsic]
public new abstract class X64 : Pclmulqdq.X64
{
internal X64() { }
public static new bool IsSupported { get => IsSupported; }
}
/// <summary>
/// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
/// VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
/// </summary>
public static Vector256<long> CarrylessMultiply(Vector256<long> left, Vector256<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
/// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
/// VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
/// </summary>
public static Vector256<ulong> CarrylessMultiply(Vector256<ulong> left, Vector256<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
}
/// <summary>
/// This class provides access to Intel AVX-512 VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq512 : Pclmulqdq
{
internal Pclmulqdq256() { }
// This would depend on the VPCLMULQDQ + AVX512F CPUID bits
public static new bool IsSupported { get => IsSupported; }
[Intrinsic]
public new abstract class X64 : Pclmulqdq.X64
{
internal X64() { }
public static new bool IsSupported { get => IsSupported; }
}
/// <summary>
/// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
/// VPCLMULQDQ ymm1, ymm2, ymm3/m512, imm8
/// </summary>
public static Vector512<long> CarrylessMultiply(Vector512<long> left, Vector512<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
/// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
/// VPCLMULQDQ ymm1, ymm2, ymm3/m512, imm8
/// </summary>
public static Vector512<ulong> CarrylessMultiply(Vector512<ulong> left, Vector512<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
} API UsageExamples of vectorized CRC32 implementations using the equivalent C intrinsics abound. One such example: https://github.com/corsix/fast-crc32/blob/main/sample_avx512_vpclmulqdq_crc32c_v4s5x3.c Alternative DesignsThe A case could be made for making RisksN/A
|
|
Looks good as proposed. namespace System.Runtime.Intrinsics.X86;
/// <summary>
/// This class provides access to Intel VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq256 : Pclmulqdq
{
internal Pclmulqdq256() { }
// This would depend on the VPCLMULQDQ CPUID bit for VEX encoding and VPCLMULQDQ + AVX512VL for EVEX
public static new bool IsSupported { get => IsSupported; }
[Intrinsic]
public new abstract class X64 : Pclmulqdq.X64
{
internal X64() { }
public static new bool IsSupported { get => IsSupported; }
}
/// <summary>
/// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
/// VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
/// </summary>
public static Vector256<long> CarrylessMultiply(Vector256<long> left, Vector256<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
/// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
/// VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
/// </summary>
public static Vector256<ulong> CarrylessMultiply(Vector256<ulong> left, Vector256<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
}
/// <summary>
/// This class provides access to Intel AVX-512 VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq512 : Pclmulqdq
{
internal Pclmulqdq512() { }
// This would depend on the VPCLMULQDQ + AVX512F CPUID bits
public static new bool IsSupported { get => IsSupported; }
[Intrinsic]
public new abstract class X64 : Pclmulqdq.X64
{
internal X64() { }
public static new bool IsSupported { get => IsSupported; }
}
/// <summary>
/// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
/// VPCLMULQDQ zmm1, zmm2, zmm3/m512, imm8
/// </summary>
public static Vector512<long> CarrylessMultiply(Vector512<long> left, Vector512<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
/// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
/// VPCLMULQDQ zmm1, zmm2, zmm3/m512, imm8
/// </summary>
public static Vector512<ulong> CarrylessMultiply(Vector512<ulong> left, Vector512<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
} |
For consistency with the AVX10 surface (and #86952), this should probably be revised to namespace System.Runtime.Intrinsics.X86;
public abstract class Pclmulqdq : Sse2
{
public abstract class V256
{
public static new bool IsSupported { get; }
public static Vector256<long> CarrylessMultiply(Vector256<long> left, Vector256<long> right, [ConstantExpected] byte control);
public static Vector256<ulong> CarrylessMultiply(Vector256<ulong> left, Vector256<ulong> right, [ConstantExpected] byte control);
}
public abstract class V512
{
public static new bool IsSupported { get; }
public static Vector512<long> CarrylessMultiply(Vector512<long> left, Vector512<long> right, [ConstantExpected] byte control);
public static Vector512<ulong> CarrylessMultiply(Vector512<ulong> left, Vector512<ulong> right, [ConstantExpected] byte control);
}
} |
Looks good as proposed namespace System.Runtime.Intrinsics.X86;
public abstract class Pclmulqdq : Sse2
{
public abstract class V256
{
public static new bool IsSupported { get; }
public static Vector256<long> CarrylessMultiply(Vector256<long> left, Vector256<long> right, [ConstantExpected] byte control);
public static Vector256<ulong> CarrylessMultiply(Vector256<ulong> left, Vector256<ulong> right, [ConstantExpected] byte control);
}
public abstract class V512
{
public static new bool IsSupported { get; }
public static Vector512<long> CarrylessMultiply(Vector512<long> left, Vector512<long> right, [ConstantExpected] byte control);
public static Vector512<ulong> CarrylessMultiply(Vector512<ulong> left, Vector512<ulong> right, [ConstantExpected] byte control);
}
} |
Background and motivation
VPCLMULQDQ
is supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4. It allows for parallelpclmulqdq
inVector256
andVector512
and is important for implementing vectorized CRC32 among other things.API Proposal
API Usage
Examples of vectorized CRC32 implementations using the equivalent C intrinsics abound. One such example: https://github.com/corsix/fast-crc32/blob/main/sample_avx512_vpclmulqdq_crc32c_v4s5x3.c
Alternative Designs
The
Pclmulqdq256
andPclmulqdq512
classes could be nested underPclmulqdq
rather than being top-level classes inheriting from it. Since this ISA includes only a single instruction, that may be preferable.A case could be made for making
Avx
the base ofPclulqdq256
, as VEX encoding is required forvpclmulqdq
. Likewise,Pclmulqdq512
could haveAvx512F
as a base given its requirement ofEVEX
encoding. However, the relationship will change with AVX10, where EVEX support will not imply 512-bit vector support.Risks
N/A
The text was updated successfully, but these errors were encountered: