From 5f2a0b4beca9ff017ba016ee38a55b580fa66206 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 18 Feb 2024 14:45:54 -0800
Subject: [PATCH 1/3] lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32

This improves CRC-32 performance on some of the latest x86 CPUs.  Three
implementations are added: VPCLMULQDQ/AVX2 and VPCLMULQDQ/AVX512VL which
use 256-bit vectors, and VPCLMULQDQ/AVX512F/AVX512VL which uses 512-bit
vectors.  To reduce downclocking effects, the implementation with
512-bit vectors isn't used on Intel CPUs 10th generation and older.
---
 lib/arm/crc32_impl.h            |   8 +-
 lib/arm/crc32_pmull_wide.h      |  10 +-
 lib/crc32.c                     |   2 +-
 lib/crc32_multipliers.h         | 122 ++++++---
 lib/x86/cpu_features.c          |  54 +++-
 lib/x86/cpu_features.h          |  65 ++++-
 lib/x86/crc32_impl.h            | 132 +++++++---
 lib/x86/crc32_pclmul_template.h | 443 +++++++++++++++++++++-----------
 scripts/checksum_benchmarks.sh  |  33 ++-
 scripts/gen_crc32_multipliers.c |  23 +-
 scripts/run_tests.sh            |   3 +-
 11 files changed, 637 insertions(+), 258 deletions(-)

diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h
index c802cdf0..d6ea30c0 100644
--- a/lib/arm/crc32_impl.h
+++ b/lib/arm/crc32_impl.h
@@ -474,12 +474,12 @@ static u32 ATTRIBUTES MAYBE_UNUSED
 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 {
 	static const u64 _aligned_attribute(16) mults[3][2] = {
-		CRC32_1VECS_MULTS,
-		CRC32_4VECS_MULTS,
-		CRC32_2VECS_MULTS,
+		{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
 	static const u64 _aligned_attribute(16) final_mults[3][2] = {
-		{ CRC32_FINAL_MULT, 0 },
+		{ CRC32_X63_MODG, 0 },
 		{ CRC32_BARRETT_CONSTANT_1, 0 },
 		{ CRC32_BARRETT_CONSTANT_2, 0 },
 	};
diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h
index a72e1d87..c2f8af06 100644
--- a/lib/arm/crc32_pmull_wide.h
+++ b/lib/arm/crc32_pmull_wide.h
@@ -59,7 +59,9 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 
 	if (len < 3 * 192) {
 		static const u64 _aligned_attribute(16) mults[3][2] = {
-			CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS,
+			{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+			{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
+			{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
 		};
 		poly64x2_t multipliers_4, multipliers_2, multipliers_1;
 
@@ -97,8 +99,10 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 			v0 = fold_vec(v0, v1, multipliers_1);
 	} else {
 		static const u64 _aligned_attribute(16) mults[4][2] = {
-			CRC32_12VECS_MULTS, CRC32_6VECS_MULTS,
-			CRC32_3VECS_MULTS, CRC32_1VECS_MULTS,
+			{ CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
+			{ CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */
+			{ CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */
+			{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
 		};
 		const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
 		const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
diff --git a/lib/crc32.c b/lib/crc32.c
index c3a4da48..a0ec0223 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -165,7 +165,7 @@
  * intermediate remainder (which we never actually store explicitly) is 96 bits.
  *
  * On CPUs that support fast carryless multiplication, CRCs can be computed even
- * more quickly via "folding".  See e.g. the x86 PCLMUL implementation.
+ * more quickly via "folding".  See e.g. the x86 PCLMUL implementations.
*/ #include "lib_common.h" diff --git a/lib/crc32_multipliers.h b/lib/crc32_multipliers.h index 580b775b..65a9bf3e 100644 --- a/lib/crc32_multipliers.h +++ b/lib/crc32_multipliers.h @@ -4,55 +4,103 @@ * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT. */ -#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */ -#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 } +#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */ +#define CRC32_X95_MODG 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */ -#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */ -#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 } +#define CRC32_X287_MODG 0xf1da05aa /* x^287 mod G(x) */ +#define CRC32_X223_MODG 0x81256527 /* x^223 mod G(x) */ -#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */ -#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 } +#define CRC32_X415_MODG 0x3db1ecdc /* x^415 mod G(x) */ +#define CRC32_X351_MODG 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */ -#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 } +#define CRC32_X543_MODG 0x8f352d95 /* x^543 mod G(x) */ +#define CRC32_X479_MODG 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */ -#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 } +#define CRC32_X671_MODG 0x1c279815 /* x^671 mod G(x) */ +#define CRC32_X607_MODG 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */ -#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 } +#define CRC32_X799_MODG 0xdf068dc2 /* x^799 mod G(x) */ +#define CRC32_X735_MODG 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */ -#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 } +#define CRC32_X927_MODG 0x31f8303f /* x^927 mod G(x) */ +#define CRC32_X863_MODG 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */ -#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 } +#define CRC32_X1055_MODG 0x33fff533 /* x^1055 mod G(x) */ +#define CRC32_X991_MODG 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */ -#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 } +#define CRC32_X1183_MODG 0x26b70c3d /* x^1183 mod G(x) */ +#define CRC32_X1119_MODG 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */ -#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 } +#define CRC32_X1311_MODG 0xe3543be0 /* x^1311 mod G(x) */ +#define CRC32_X1247_MODG 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */ -#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 } +#define CRC32_X1439_MODG 
0x5a1bb05d /* x^1439 mod G(x) */ +#define CRC32_X1375_MODG 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */ -#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 } +#define CRC32_X1567_MODG 0x596c8d81 /* x^1567 mod G(x) */ +#define CRC32_X1503_MODG 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */ +#define CRC32_X1695_MODG 0x682bdd4f /* x^1695 mod G(x) */ +#define CRC32_X1631_MODG 0x3c656ced /* x^1631 mod G(x) */ + +#define CRC32_X1823_MODG 0x4a28bd43 /* x^1823 mod G(x) */ +#define CRC32_X1759_MODG 0xfe807bbd /* x^1759 mod G(x) */ + +#define CRC32_X1951_MODG 0x0077f00d /* x^1951 mod G(x) */ +#define CRC32_X1887_MODG 0x1f0c2cdd /* x^1887 mod G(x) */ + +#define CRC32_X2079_MODG 0xce3371cb /* x^2079 mod G(x) */ +#define CRC32_X2015_MODG 0xe95c1271 /* x^2015 mod G(x) */ + +#define CRC32_X2207_MODG 0xa749e894 /* x^2207 mod G(x) */ +#define CRC32_X2143_MODG 0xb918a347 /* x^2143 mod G(x) */ + +#define CRC32_X2335_MODG 0x2c538639 /* x^2335 mod G(x) */ +#define CRC32_X2271_MODG 0x71d54a59 /* x^2271 mod G(x) */ + +#define CRC32_X2463_MODG 0x32b0733c /* x^2463 mod G(x) */ +#define CRC32_X2399_MODG 0xff6f2fc2 /* x^2399 mod G(x) */ + +#define CRC32_X2591_MODG 0x0e9bd5cc /* x^2591 mod G(x) */ +#define CRC32_X2527_MODG 0xcec97417 /* x^2527 mod G(x) */ + +#define CRC32_X2719_MODG 0x76278617 /* x^2719 mod G(x) */ +#define CRC32_X2655_MODG 0x1c63267b /* x^2655 mod G(x) */ + +#define CRC32_X2847_MODG 0xc51b93e3 /* x^2847 mod G(x) */ +#define CRC32_X2783_MODG 0xf183c71b /* x^2783 mod G(x) */ + +#define CRC32_X2975_MODG 0x7eaed122 /* x^2975 mod G(x) */ +#define CRC32_X2911_MODG 0x9b9bdbd0 /* x^2911 mod G(x) */ + +#define CRC32_X3103_MODG 0x2ce423f1 /* x^3103 mod G(x) */ +#define CRC32_X3039_MODG 0xd31343ea /* x^3039 mod G(x) */ + +#define CRC32_X3231_MODG 0x8b8d8645 /* x^3231 mod G(x) */ +#define CRC32_X3167_MODG 0x4470ac44 /* x^3167 mod G(x) */ + +#define CRC32_X3359_MODG 0x4b700aa8 /* x^3359 mod G(x) */ +#define CRC32_X3295_MODG 0xeea395c4 /* x^3295 mod G(x) */ + +#define CRC32_X3487_MODG 0xeff5e99d /* x^3487 mod G(x) */ +#define CRC32_X3423_MODG 0xf9d9c7ee /* x^3423 mod G(x) */ + +#define CRC32_X3615_MODG 0xad0d2bb2 /* x^3615 mod G(x) */ +#define CRC32_X3551_MODG 0xcd669a40 /* x^3551 mod G(x) */ + +#define CRC32_X3743_MODG 0x9fb66bd3 /* x^3743 mod G(x) */ +#define CRC32_X3679_MODG 0x6d40f445 /* x^3679 mod G(x) */ + +#define CRC32_X3871_MODG 0xc2dcc467 /* x^3871 mod G(x) */ +#define CRC32_X3807_MODG 0x9ee62949 /* x^3807 mod G(x) */ + +#define CRC32_X3999_MODG 0x398e2ff2 /* x^3999 mod G(x) */ +#define CRC32_X3935_MODG 0x145575d5 /* x^3935 mod G(x) */ + +#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */ +#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */ + +#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */ #define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */ #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */ #define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 } diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index 8df855ff..fa8fd019 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -86,32 +86,71 @@ read_xcr(u32 index) static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, - {X86_CPU_FEATURE_PCLMUL, "pclmul"}, + {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"}, {X86_CPU_FEATURE_AVX, "avx"}, 
{X86_CPU_FEATURE_AVX2, "avx2"}, {X86_CPU_FEATURE_BMI2, "bmi2"}, + {X86_CPU_FEATURE_AVX512F, "avx512f"}, + {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, + {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, }; volatile u32 libdeflate_x86_cpu_features = 0; +/* + * Don't use 512-bit vectors on Intel CPUs 10th generation and older, due to the + * downclocking penalty. + */ +static inline bool +allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model) +{ +#ifdef TEST_SUPPORT__DO_NOT_USE + return true; +#endif + if (memcmp(manufacturer, "GenuineIntel", 12) != 0) + return true; + if (family != 6) + return true; + switch (model) { + case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */ + case 106: /* Ice Lake (Server) */ + case 108: /* Ice Lake (Server) */ + case 126: /* Ice Lake (Client) */ + case 140: /* Tiger Lake */ + case 141: /* Tiger Lake */ + return false; + } + return true; +} + /* Initialize libdeflate_x86_cpu_features. */ void libdeflate_init_x86_cpu_features(void) { - u32 max_leaf, a, b, c, d; + u32 max_leaf; + u32 manufacturer[3]; + u32 family, model; + u32 a, b, c, d; u64 xcr0 = 0; u32 features = 0; /* EAX=0: Highest Function Parameter and Manufacturer ID */ - cpuid(0, 0, &max_leaf, &b, &c, &d); + cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2], + &manufacturer[1]); if (max_leaf < 1) goto out; /* EAX=1: Processor Info and Feature Bits */ cpuid(1, 0, &a, &b, &c, &d); + family = (a >> 8) & 0xf; + model = (a >> 4) & 0xf; + if (family == 6 || family == 0xf) + model += (a >> 12) & 0xf0; + if (family == 0xf) + family += (a >> 20) & 0xff; if (d & (1 << 26)) features |= X86_CPU_FEATURE_SSE2; if (c & (1 << 1)) - features |= X86_CPU_FEATURE_PCLMUL; + features |= X86_CPU_FEATURE_PCLMULQDQ; if (c & (1 << 27)) xcr0 = read_xcr(0); if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) @@ -126,6 +165,13 @@ void libdeflate_init_x86_cpu_features(void) features |= X86_CPU_FEATURE_AVX2; if (b & (1 << 8)) features |= X86_CPU_FEATURE_BMI2; + if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) && + allow_512bit_vectors(manufacturer, family, model)) + features |= X86_CPU_FEATURE_AVX512F; + if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6)) + features |= X86_CPU_FEATURE_AVX512VL; + if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_VPCLMULQDQ; out: disable_cpu_features_for_testing(&features, x86_cpu_feature_table, diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index ad14e435..b5fbf573 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -40,16 +40,22 @@ #endif #define X86_CPU_FEATURE_SSE2 0x00000001 -#define X86_CPU_FEATURE_PCLMUL 0x00000002 +#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002 #define X86_CPU_FEATURE_AVX 0x00000004 #define X86_CPU_FEATURE_AVX2 0x00000008 #define X86_CPU_FEATURE_BMI2 0x00000010 +#define X86_CPU_FEATURE_AVX512F 0x00000020 +#define X86_CPU_FEATURE_AVX512VL 0x00000040 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000080 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) -#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL)) +#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) #define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX)) #define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) #define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) +#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) 
+#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL)) +#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) #if HAVE_DYNAMIC_X86_CPU_FEATURES #define X86_CPU_FEATURES_KNOWN 0x80000000 @@ -90,18 +96,18 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #endif #define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) -/* PCLMUL */ +/* PCLMULQDQ */ #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) -# define HAVE_PCLMUL_NATIVE 1 +# define HAVE_PCLMULQDQ_NATIVE 1 #else -# define HAVE_PCLMUL_NATIVE 0 +# define HAVE_PCLMULQDQ_NATIVE 0 #endif -#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ - defined(_MSC_VER))) -# define HAVE_PCLMUL_INTRIN 1 +#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER))) +# define HAVE_PCLMULQDQ_INTRIN 1 #else -# define HAVE_PCLMUL_INTRIN 0 +# define HAVE_PCLMULQDQ_INTRIN 0 #endif /* AVX */ @@ -156,6 +162,45 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_BMI2_INTRIN 0 #endif +/* AVX-512F */ +#ifdef __AVX512F__ +# define HAVE_AVX512F_NATIVE 1 +#else +# define HAVE_AVX512F_NATIVE 0 +#endif +#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512F_INTRIN 1 +#else +# define HAVE_AVX512F_INTRIN 0 +#endif + +/* AVX-512VL */ +#ifdef __AVX512VL__ +# define HAVE_AVX512VL_NATIVE 1 +#else +# define HAVE_AVX512VL_NATIVE 0 +#endif +#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512VL_INTRIN 1 +#else +# define HAVE_AVX512VL_INTRIN 0 +#endif + +/* VPCLMULQDQ */ +#ifdef __VPCLMULQDQ__ +# define HAVE_VPCLMULQDQ_NATIVE 1 +#else +# define HAVE_VPCLMULQDQ_NATIVE 0 +#endif +#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ + defined(_MSC_VER)) +# define HAVE_VPCLMULQDQ_INTRIN 1 +#else +# define HAVE_VPCLMULQDQ_INTRIN 0 +#endif + #endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index 79cc7944..e818d0a6 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -30,67 +30,137 @@ #include "cpu_features.h" -/* PCLMUL implementation */ -#if HAVE_PCLMUL_INTRIN -# define crc32_x86_pclmul crc32_x86_pclmul -# define SUFFIX _pclmul -# if HAVE_PCLMUL_NATIVE +/* PCLMULQDQ implementation */ +#if HAVE_PCLMULQDQ_INTRIN +# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq +# define SUFFIX _pclmulqdq +# if HAVE_PCLMULQDQ_NATIVE # define ATTRIBUTES # else # define ATTRIBUTES _target_attribute("pclmul") # endif -# define FOLD_PARTIAL_VECS 0 +# define VL 16 +# define FOLD_LESSTHAN16BYTES 0 +# define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" #endif /* - * PCLMUL/AVX implementation. This implementation has two benefits over the - * regular PCLMUL one. First, simply compiling against the AVX target can - * improve performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) - * without actually using any AVX intrinsics, probably due to the availability - * of non-destructive VEX-encoded instructions. Second, AVX support implies - * SSSE3 and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for - * efficient handling of partial blocks. (We *could* compile a variant with - * PCLMUL+SSSE3+SSE4.1 w/o AVX, but for simplicity we don't currently bother.) 
+ * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ + * implementation, this still uses 128-bit vectors, but it has two potential + * benefits. First, simply compiling against the AVX target can improve + * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without + * actually using any AVX intrinsics, probably due to the availability of + * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3 + * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient + * handling of partial blocks. (We *could* compile a variant with + * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) * * FIXME: with MSVC, this isn't actually compiled with AVX code generation * enabled yet. That would require that this be moved to its own .c file. */ -#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN -# define crc32_x86_pclmul_avx crc32_x86_pclmul_avx -# define SUFFIX _pclmul_avx -# if HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE +#if HAVE_PCLMULQDQ_INTRIN && HAVE_AVX_INTRIN +# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx +# define SUFFIX _pclmulqdq_avx +# if HAVE_PCLMULQDQ_NATIVE && HAVE_AVX_NATIVE # define ATTRIBUTES # else # define ATTRIBUTES _target_attribute("pclmul,avx") # endif -# define FOLD_PARTIAL_VECS 1 +# define VL 16 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 +# include "crc32_pclmul_template.h" +#endif + +/* VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. */ +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX2_INTRIN && \ + /* + * This has to be disabled on MSVC because MSVC has a bug where it + * incorrectly assumes that VPCLMULQDQ implies AVX-512: + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + */ \ + !(defined(_MSC_VER) && !defined(__clang__)) +# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 +# define SUFFIX _vpclmulqdq_avx2 +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX2_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") +# endif +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" #endif /* - * If the best implementation is statically available, use it unconditionally. - * Otherwise choose the best implementation at runtime. + * VPCLMULQDQ/AVX512VL implementation. This takes advantage of some AVX-512 + * instructions but uses 256-bit vectors rather than 512-bit. This can be + * useful on CPUs where 512-bit vectors cause downclocking. */ -#if defined(crc32_x86_pclmul_avx) && HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE -#define DEFAULT_IMPL crc32_x86_pclmul_avx -#else +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX512VL_INTRIN +# define crc32_x86_vpclmulqdq_avx512vl crc32_x86_vpclmulqdq_avx512vl +# define SUFFIX _vpclmulqdq_avx512vl +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX512VL_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# endif +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "crc32_pclmul_template.h" +#endif + +/* VPCLMULQDQ/AVX512F/AVX512VL implementation. Uses 512-bit vectors. 
*/ +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ + HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN +# define crc32_x86_vpclmulqdq_avx512f_avx512vl crc32_x86_vpclmulqdq_avx512f_avx512vl +# define SUFFIX _vpclmulqdq_avx512f_avx512vl +#if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ + HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512f,avx512vl") +# endif +# define VL 64 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "crc32_pclmul_template.h" +#endif + +/* Choose the best implementation at runtime. */ static inline crc32_func_t arch_select_crc32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef crc32_x86_pclmul_avx - if (HAVE_PCLMUL(features) && HAVE_AVX(features)) - return crc32_x86_pclmul_avx; +#ifdef crc32_x86_vpclmulqdq_avx512f_avx512vl + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512f_avx512vl; +#endif +#ifdef crc32_x86_vpclmulqdq_avx512vl + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512vl; +#endif +#ifdef crc32_x86_vpclmulqdq_avx2 + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX2(features)) + return crc32_x86_vpclmulqdq_avx2; #endif -#ifdef crc32_x86_pclmul - if (HAVE_PCLMUL(features)) - return crc32_x86_pclmul; +#ifdef crc32_x86_pclmulqdq_avx + if (HAVE_PCLMULQDQ(features) && HAVE_AVX(features)) + return crc32_x86_pclmulqdq_avx; +#endif +#ifdef crc32_x86_pclmulqdq + if (HAVE_PCLMULQDQ(features)) + return crc32_x86_pclmulqdq; #endif return NULL; } #define arch_select_crc32_func arch_select_crc32_func -#endif #endif /* LIB_X86_CRC32_IMPL_H */ diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 1d578237..16e4ebf5 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -32,11 +32,20 @@ * SUFFIX: * Name suffix to append to all instantiated functions. * ATTRIBUTES: - * Target function attributes to use. - * FOLD_PARTIAL_VECS: + * Target function attributes to use. Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && FOLD_LESSTHAN16BYTES=0: at least pclmul + * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 + * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 + * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl + * VL=64: at least vpclmulqdq,pclmul,avx512f,avx512vl + * VL: + * Vector length in bytes. Supported values are 16, 32, and 64. + * FOLD_LESSTHAN16BYTES: * Use vector instructions to handle any partial blocks at the beginning * and end, instead of falling back to scalar instructions for those parts. - * Requires SSSE3 and SSE4.1 intrinsics. + * USE_TERNARYLOGIC: + * Use the vpternlog instruction to do three-argument XORs. * * The overall algorithm used is CRC folding with carryless multiplication * instructions. Note that the x86 crc32 instruction cannot be used, as it is @@ -46,6 +55,11 @@ * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + * + * The original pclmulqdq instruction does one 64x64 to 128-bit carryless + * multiplication. 
The VPCLMULQDQ feature added instructions that do two + * parallel 64x64 to 128-bit carryless multiplications in combination with AVX + * or AVX512VL, or four in combination with AVX512F. */ #include @@ -57,41 +71,110 @@ # include # include # include +# include +# include +# include +# include +# include #endif -#undef fold_vec +#undef fold_vec128 static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __m128i /* __v2di */ multipliers) +ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) { - /* - * The immediate constant for PCLMULQDQ specifies which 64-bit halves of - * the 128-bit vectors to multiply: - * - * 0x00 means low halves (higher degree polynomial terms for us) - * 0x11 means high halves (lower degree polynomial terms for us) - */ dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); return dst; } -#define fold_vec ADD_SUFFIX(fold_vec) +#define fold_vec128 ADD_SUFFIX(fold_vec128) -#if FOLD_PARTIAL_VECS +#if VL >= 32 +#undef fold_vec256 +static forceinline ATTRIBUTES __m256i +ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i multipliers) +{ +#if USE_TERNARYLOGIC + return _mm256_ternarylogic_epi32( + _mm256_clmulepi64_epi128(src, multipliers, 0x00), + _mm256_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +#else + return _mm256_xor_si256( + _mm256_xor_si256(dst, + _mm256_clmulepi64_epi128(src, multipliers, 0x00)), + _mm256_clmulepi64_epi128(src, multipliers, 0x11)); +#endif +} +#define fold_vec256 ADD_SUFFIX(fold_vec256) +#endif /* VL >= 32 */ + +#if VL >= 64 +#undef fold_vec512 +static forceinline ATTRIBUTES __m512i +ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) +{ + return _mm512_ternarylogic_epi32( + _mm512_clmulepi64_epi128(src, multipliers, 0x00), + _mm512_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +} +#define fold_vec512 ADD_SUFFIX(fold_vec512) +#endif /* VL >= 64 */ + +#if VL == 16 +# define vec_t __m128i +# define fold_vec fold_vec128 +# define VLOAD_UNALIGNED(p) _mm_loadu_si128((const void *)(p)) +# define VXOR(a, b) _mm_xor_si128((a), (b)) +# define M128I_TO_VEC(a) a +# define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG) +# define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG) +#elif VL == 32 +# define vec_t __m256i +# define fold_vec fold_vec256 +# define VLOAD_UNALIGNED(p) _mm256_loadu_si256((const void *)(p)) +# define VXOR(a, b) _mm256_xor_si256((a), (b)) +# define M128I_TO_VEC(a) _mm256_castsi128_si256(a) +# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) +# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG) +#elif VL == 64 +# define vec_t __m512i +# define fold_vec fold_vec512 +# define VLOAD_UNALIGNED(p) _mm512_loadu_si512((const void *)(p)) +# define VXOR(a, b) _mm512_xor_si512((a), (b)) +# define M128I_TO_VEC(a) _mm512_castsi128_si512(a) +# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) +# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG) +# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_1V MULTS(CRC32_X479_MODG, 
CRC32_X543_MODG) +#else +# error "unsupported vector length" +#endif + +#if FOLD_LESSTHAN16BYTES /* - * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the - * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the - * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, - * respectively. Then fold x0 into x1 and return the result. Assumes that - * 'p + len - 16' is in-bounds. + * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to + * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and + * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, + * respectively. Then fold x0 into x1 and return the result. + * Assumes that 'p + len - 16' is in-bounds. */ -#undef fold_partial_vec +#undef fold_lessthan16bytes static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, - __m128i /* __v2du */ multipliers_1) +ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, + __m128i /* __v2du */ multipliers_128b) { /* - * pshufb(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. - * pshufb(v, shift_tab[len+16..len+31]) right shifts v by len bytes. + * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. + * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. */ static const u8 shift_tab[48] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, @@ -105,41 +188,40 @@ ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]); __m128i x0, x1; - /* x0 = v left-shifted by '16 - len' bytes */ - x0 = _mm_shuffle_epi8(v, lshift); + /* x0 = x left-shifted by '16 - len' bytes */ + x0 = _mm_shuffle_epi8(x, lshift); /* - * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' + * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len' * bytes) followed by the remaining data. 
*/ - x1 = _mm_blendv_epi8(_mm_shuffle_epi8(v, rshift), + x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift), _mm_loadu_si128((const void *)(p + len - 16)), /* msb 0/1 of each byte selects byte from arg1/2 */ rshift); - return fold_vec(x0, x1, multipliers_1); + return fold_vec128(x0, x1, multipliers_128b); } -#define fold_partial_vec ADD_SUFFIX(fold_partial_vec) -#endif /* FOLD_PARTIAL_VECS */ +#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes) +#endif /* FOLD_LESSTHAN16BYTES */ -static u32 ATTRIBUTES MAYBE_UNUSED +static u32 ATTRIBUTES ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) { - const __m128i /* __v2du */ multipliers_8 = - _mm_set_epi64x(CRC32_8VECS_MULT_2, CRC32_8VECS_MULT_1); - const __m128i /* __v2du */ multipliers_4 = - _mm_set_epi64x(CRC32_4VECS_MULT_2, CRC32_4VECS_MULT_1); - const __m128i /* __v2du */ multipliers_2 = - _mm_set_epi64x(CRC32_2VECS_MULT_2, CRC32_2VECS_MULT_1); - const __m128i /* __v2du */ multipliers_1 = - _mm_set_epi64x(CRC32_1VECS_MULT_2, CRC32_1VECS_MULT_1); + const vec_t multipliers_8v = MULTS_8V; /* 8 vecs */ + const vec_t multipliers_4v = MULTS_4V; /* 4 vecs */ + const vec_t multipliers_2v = MULTS_2V; /* 2 vecs */ + const vec_t multipliers_1v = MULTS_1V; /* 1 vecs */ + const __m128i /* __v2du */ multipliers_128b = + _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); const __m128i /* __v2du */ final_multiplier = - _mm_set_epi64x(0, CRC32_FINAL_MULT); + _mm_set_epi64x(0, CRC32_X63_MODG); const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); const __m128i /* __v2du */ barrett_reduction_constants = _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1); - __m128i v0, v1, v2, v3, v4, v5, v6, v7; + vec_t v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1; /* * There are two overall code paths. The first path supports all @@ -148,83 +230,102 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * lengths, aligns the pointer in order to do aligned loads, and does up * to 8-way folds. The length check below decides which path to take. 
*/ - if (len < 1024) { - if (len < 16) + if (len < 64*VL) { + if (len < VL) return crc32_slice1(crc, p, len); - v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), - _mm_cvtsi32_si128(crc)); - p += 16; + v0 = VXOR(VLOAD_UNALIGNED(p), + M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + p += VL; - if (len >= 64) { - v1 = _mm_loadu_si128((const void *)(p + 0)); - v2 = _mm_loadu_si128((const void *)(p + 16)); - v3 = _mm_loadu_si128((const void *)(p + 32)); - p += 48; - while (len >= 64 + 64) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)(p + 0)), - multipliers_4); - v1 = fold_vec(v1, _mm_loadu_si128((const void *)(p + 16)), - multipliers_4); - v2 = fold_vec(v2, _mm_loadu_si128((const void *)(p + 32)), - multipliers_4); - v3 = fold_vec(v3, _mm_loadu_si128((const void *)(p + 48)), - multipliers_4); - p += 64; - len -= 64; + if (len >= 4*VL) { + v1 = VLOAD_UNALIGNED(p + 0*VL); + v2 = VLOAD_UNALIGNED(p + 1*VL); + v3 = VLOAD_UNALIGNED(p + 2*VL); + p += 3*VL; + while (len >= 8*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_4v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_4v); + v2 = fold_vec(v2, VLOAD_UNALIGNED(p + 2*VL), + multipliers_4v); + v3 = fold_vec(v3, VLOAD_UNALIGNED(p + 3*VL), + multipliers_4v); + p += 4*VL; + len -= 4*VL; } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); - if (len & 32) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)(p + 0)), - multipliers_2); - v1 = fold_vec(v1, _mm_loadu_si128((const void *)(p + 16)), - multipliers_2); - p += 32; + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_2v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_2v); + p += 2*VL; } - v0 = fold_vec(v0, v1, multipliers_1); - if (len & 16) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)p), - multipliers_1); - p += 16; + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; } } else { - if (len >= 32) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)p), - multipliers_1); - p += 16; - if (len >= 48) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)p), - multipliers_1); - p += 16; + if (len >= 2*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + if (len >= 3*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; } } } } else { - const size_t align = -(uintptr_t)p & 15; - const __m128i *vp; + size_t align = -(uintptr_t)p & (VL-1); + const vec_t *vp; - #if FOLD_PARTIAL_VECS - v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), - _mm_cvtsi32_si128(crc)); - p += 16; - /* Align p to the next 16-byte boundary. */ - if (align) { - v0 = fold_partial_vec(v0, p, align, multipliers_1); - p += align; + /* Align p to the next VL-byte boundary. */ + if (align == 0) { + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + } else { len -= align; - } - vp = (const __m128i *)p; - #else - /* Align p to the next 16-byte boundary. 
*/ - if (align) { + #if FOLD_LESSTHAN16BYTES + x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); + p += 16; + if (align & 15) { + x0 = fold_lessthan16bytes(x0, p, align & 15, + multipliers_128b); + p += align & 15; + align &= ~15; + } + while (align >= 16) { + x0 = fold_vec128(x0, *(const __m128i *)p, + multipliers_128b); + p += 16; + align -= 16; + } + v0 = M128I_TO_VEC(x0); + # if VL == 32 + v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1); + p += 16; + # elif VL == 64 + v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1); + p += 16; + v0 = _mm512_inserti64x4(v0, *(const __m256i *)p, 1); + p += 32; + # endif + vp = (const vec_t *)p; + #else crc = crc32_slice1(crc, p, align); p += align; - len -= align; + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + #endif } - vp = (const __m128i *)p; - v0 = _mm_xor_si128(*vp++, _mm_cvtsi32_si128(crc)); - #endif v1 = *vp++; v2 = *vp++; v3 = *vp++; @@ -233,50 +334,85 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v6 = *vp++; v7 = *vp++; do { - v0 = fold_vec(v0, *vp++, multipliers_8); - v1 = fold_vec(v1, *vp++, multipliers_8); - v2 = fold_vec(v2, *vp++, multipliers_8); - v3 = fold_vec(v3, *vp++, multipliers_8); - v4 = fold_vec(v4, *vp++, multipliers_8); - v5 = fold_vec(v5, *vp++, multipliers_8); - v6 = fold_vec(v6, *vp++, multipliers_8); - v7 = fold_vec(v7, *vp++, multipliers_8); - len -= 128; - } while (len >= 128 + 128); + v0 = fold_vec(v0, *vp++, multipliers_8v); + v1 = fold_vec(v1, *vp++, multipliers_8v); + v2 = fold_vec(v2, *vp++, multipliers_8v); + v3 = fold_vec(v3, *vp++, multipliers_8v); + v4 = fold_vec(v4, *vp++, multipliers_8v); + v5 = fold_vec(v5, *vp++, multipliers_8v); + v6 = fold_vec(v6, *vp++, multipliers_8v); + v7 = fold_vec(v7, *vp++, multipliers_8v); + len -= 8*VL; + } while (len >= 16*VL); - v0 = fold_vec(v0, v4, multipliers_4); - v1 = fold_vec(v1, v5, multipliers_4); - v2 = fold_vec(v2, v6, multipliers_4); - v3 = fold_vec(v3, v7, multipliers_4); - if (len & 64) { - v0 = fold_vec(v0, *vp++, multipliers_4); - v1 = fold_vec(v1, *vp++, multipliers_4); - v2 = fold_vec(v2, *vp++, multipliers_4); - v3 = fold_vec(v3, *vp++, multipliers_4); + /* + * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) + * and fold in any VL-byte data segments that remain. + */ + v0 = fold_vec(v0, v4, multipliers_4v); + v1 = fold_vec(v1, v5, multipliers_4v); + v2 = fold_vec(v2, v6, multipliers_4v); + v3 = fold_vec(v3, v7, multipliers_4v); + if (len & (4*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_4v); + v1 = fold_vec(v1, *vp++, multipliers_4v); + v2 = fold_vec(v2, *vp++, multipliers_4v); + v3 = fold_vec(v3, *vp++, multipliers_4v); + } + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_2v); + v1 = fold_vec(v1, *vp++, multipliers_2v); } + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) + v0 = fold_vec(v0, *vp++, multipliers_1v); + p = (const u8 *)vp; + } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); + /* + * Reduce v0 (length VL bytes) to x0 (length 16 bytes) + * and fold in any 16-byte data segments that remain. 
+ */ +#if VL == 16 + x0 = v0; +#else + { +# if VL == 32 + __m256i y0 = v0; +# else + const __m256i multipliers_256b = + _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG, + CRC32_X223_MODG, CRC32_X287_MODG); + __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0), + _mm512_extracti64x4_epi64(v0, 1), + multipliers_256b); if (len & 32) { - v0 = fold_vec(v0, *vp++, multipliers_2); - v1 = fold_vec(v1, *vp++, multipliers_2); + y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p), + multipliers_256b); + p += 32; } - - v0 = fold_vec(v0, v1, multipliers_1); - if (len & 16) - v0 = fold_vec(v0, *vp++, multipliers_1); - - p = (const u8 *)vp; +# endif + x0 = fold_vec128(_mm256_extracti128_si256(y0, 0), + _mm256_extracti128_si256(y0, 1), + multipliers_128b); } + if (len & 16) { + x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p), + multipliers_128b); + p += 16; + } +#endif len &= 15; /* - * If fold_partial_vec() is available, handle any remaining partial - * block now before reducing to 32 bits. + * If fold_lessthan16bytes() is available, handle any remainder + * of 1 to 15 bytes now, before reducing to 32 bits. */ -#if FOLD_PARTIAL_VECS +#if FOLD_LESSTHAN16BYTES if (len) - v0 = fold_partial_vec(v0, p, len, multipliers_1); + x0 = fold_lessthan16bytes(x0, p, len, multipliers_128b); #endif /* @@ -284,12 +420,12 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * which is equivalent to multiplying by x^32. This is needed because * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). */ - v0 = _mm_xor_si128(_mm_srli_si128(v0, 8), - _mm_clmulepi64_si128(v0, multipliers_1, 0x10)); + x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), + _mm_clmulepi64_si128(x0, multipliers_128b, 0x10)); /* Fold 96 => 64 bits. */ - v0 = _mm_xor_si128(_mm_srli_si128(v0, 4), - _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), + _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00)); /* @@ -334,21 +470,34 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * R(x) = B(x) + G(x)*floor ( ------------------------- ) * \ x^32 / */ - v1 = _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00); - v1 = _mm_clmulepi64_si128(_mm_and_si128(v1, mask32), + x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), barrett_reduction_constants, 0x10); - v0 = _mm_xor_si128(v0, v1); -#if FOLD_PARTIAL_VECS - crc = _mm_extract_epi32(v0, 1); + x0 = _mm_xor_si128(x0, x1); +#if FOLD_LESSTHAN16BYTES + crc = _mm_extract_epi32(x0, 1); #else - crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(v0, 0x01)); + crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01)); /* Process up to 15 bytes left over at the end. */ crc = crc32_slice1(crc, p, len); #endif return crc; } +#undef vec_t +#undef fold_vec +#undef VLOAD_UNALIGNED +#undef VXOR +#undef M128I_TO_VEC +#undef MULTS +#undef MULTS_8V +#undef MULTS_4V +#undef MULTS_2V +#undef MULTS_1V + #undef SUFFIX #undef ATTRIBUTES -#undef FOLD_PARTIAL_VECS +#undef VL +#undef FOLD_LESSTHAN16BYTES +#undef USE_TERNARYLOGIC diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 83667608..b359fba3 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -16,6 +16,13 @@ have_cpu_feature() { grep -q "^$tag"$'[ \t]'"*:.*\<$feature\>" /proc/cpuinfo } +have_cpu_features() { + local feature + for feature; do + have_cpu_feature "$feature" || return 1 + done +} + make_and_test() { # Build the checksum program and tests. 
Set the special test support
 	# flag to get support for LIBDEFLATE_DISABLE_CPU_FEATURES.
@@ -37,7 +44,7 @@ __do_benchmark() {
 	speed=$(./build/programs/checksum "${CKSUM_FLAGS[@]}" \
 		"${flags[@]}" -t "$FILE" | \
 		grep -o '[0-9]\+ MB/s' | grep -o '[0-9]\+')
-	printf "%-45s%-10s\n" "$CKSUM_NAME ($impl)" "$speed"
+	printf "%-60s%-10s\n" "$CKSUM_NAME ($impl)" "$speed"
 }
 
 do_benchmark() {
@@ -95,8 +102,8 @@ else
 fi
 
 cat << EOF
-Method                                 Speed (MB/s)
-------                                 ------------
+Method                                                Speed (MB/s)
+------                                                ------------
 EOF
 
 # CRC-32
@@ -107,13 +114,25 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES=""
 {
 case $ARCH in
 i386|x86_64)
-	if have_cpu_feature pclmulqdq && have_cpu_feature avx; then
-		do_benchmark "PCLMUL/AVX"
+	if have_cpu_features vpclmulqdq avx512f avx512vl; then
+		do_benchmark "VPCLMULQDQ/AVX512F/AVX512VL"
+		disable_cpu_feature "avx512f" "-mno-avx512f"
+	fi
+	if have_cpu_features vpclmulqdq avx512vl; then
+		do_benchmark "VPCLMULQDQ/AVX512VL"
+		disable_cpu_feature "avx512vl" "-mno-avx512vl"
+	fi
+	if have_cpu_features vpclmulqdq avx2; then
+		do_benchmark "VPCLMULQDQ/AVX2"
+		disable_cpu_feature "vpclmulqdq" "-mno-vpclmulqdq"
+	fi
+	if have_cpu_features pclmulqdq avx; then
+		do_benchmark "PCLMULQDQ/AVX"
 		disable_cpu_feature "avx" "-mno-avx"
 	fi
 	if have_cpu_feature pclmulqdq; then
-		do_benchmark "PCLMUL"
-		disable_cpu_feature "pclmul" "-mno-pclmul"
+		do_benchmark "PCLMULQDQ"
+		disable_cpu_feature "pclmulqdq" "-mno-pclmul"
 	fi
 	;;
 arm*|aarch*)
diff --git a/scripts/gen_crc32_multipliers.c b/scripts/gen_crc32_multipliers.c
index 42470c13..1073bfc8 100644
--- a/scripts/gen_crc32_multipliers.c
+++ b/scripts/gen_crc32_multipliers.c
@@ -97,37 +97,34 @@ gen_vec_folding_constants(void)
 	/*
 	 * Compute the multipliers needed for CRC-32 folding with carryless
 	 * multiplication instructions that operate on the 64-bit halves of
-	 * 128-bit vectors.  Using the terminology from earlier, for each 64-bit
+	 * 128-bit segments.  Using the terminology from earlier, for each 64-bit
 	 * fold len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial
 	 * multiplied by a 32-bit one produces a 95-bit one.  When A(x) is the
-	 * low order polynomial half of a 128-bit vector (high order physical
+	 * low order polynomial half of a 128-bit segment (high order physical
 	 * half), the separation between the message parts is the total length
-	 * of the 128-bit vectors separating the values.  When A(x) is the high
+	 * of the 128-bit segments separating the values.  When A(x) is the high
 	 * order polynomial half, the separation is 64 bits greater.
 	 */
-	for (int num_vecs = 1; num_vecs <= 12; num_vecs++) {
-		const int sep_lo = 128 * (num_vecs - 1);
+	for (int i = 1; i <= 32; i++) {
+		const int sep_lo = 128 * (i - 1);
 		const int sep_hi = sep_lo + 64;
 		const int len_B = 95;
 		int D;
 
 		/* A(x) = high 64 polynomial bits (low 64 physical bits) */
 		D = sep_hi + len_B;
-		printf("#define CRC32_%dVECS_MULT_1 0x%08"PRIx32" /* x^%d mod G(x) */\n",
-		       num_vecs, compute_xD_modG(D), D);
+		printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
+		       D, compute_xD_modG(D), D);
 
 		/* A(x) = low 64 polynomial bits (high 64 physical bits) */
 		D = sep_lo + len_B;
-		printf("#define CRC32_%dVECS_MULT_2 0x%08"PRIx32" /* x^%d mod G(x) */\n",
-		       num_vecs, compute_xD_modG(D), D);
-
-		printf("#define CRC32_%dVECS_MULTS { CRC32_%dVECS_MULT_1, CRC32_%dVECS_MULT_2 }\n",
-		       num_vecs, num_vecs, num_vecs);
+		printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
+		       D, compute_xD_modG(D), D);
 		printf("\n");
 	}
 
 	/* Multiplier for final 96 => 64 bit fold */
-	printf("#define CRC32_FINAL_MULT 0x%08"PRIx32" /* x^63 mod G(x) */\n",
+	printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
 	       compute_xD_modG(63));
 
 	/*
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 44d3b9e2..ff15cc19 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -142,7 +142,8 @@ build_and_run_tests()
 	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
 		case "$ARCH" in
 		i386|x86_64)
-			features+=(avx2 avx bmi2 pclmul sse2)
+			features+=(vpclmulqdq avx512vl avx512f
+				   avx2 avx bmi2 pclmulqdq sse2)
 			;;
 		arm*|aarch*)
 			features+=(dotprod sha3 crc32 pmull neon)

From 513cec0f5597de958c319432a7da510d1b7d7d4c Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 18 Feb 2024 15:15:46 -0800
Subject: [PATCH 2/3] test_util: fix timer_frequency() in direct compilation case

---
 programs/test_util.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/programs/test_util.c b/programs/test_util.c
index 77517e1a..11cd4875 100644
--- a/programs/test_util.c
+++ b/programs/test_util.c
@@ -174,7 +174,9 @@ timer_frequency(void)
 
 	QueryPerformanceFrequency(&freq);
 	return freq.QuadPart;
-#elif defined(HAVE_CLOCK_GETTIME)
+#elif defined(HAVE_CLOCK_GETTIME) || \
+	/* fallback detection method for direct compilation */ \
+	(!defined(HAVE_CONFIG_H) && defined(CLOCK_MONOTONIC))
 	return 1000000000;
 #else
 	return 1000000;

From 6f4b83f523d3aaadd43b9736e87b94e97c7c3837 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 18 Feb 2024 15:36:42 -0800
Subject: [PATCH 3/3] ci.yml: upgrade to microsoft/setup-msbuild@v2

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f758e97a..f738eed4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -117,7 +117,7 @@ jobs:
     runs-on: ${{matrix.os}}
     steps:
     - uses: actions/checkout@v4
-    - uses: microsoft/setup-msbuild@v1.1
+    - uses: microsoft/setup-msbuild@v2
     - run: vcpkg install zlib:${{matrix.vcpkg}}
     - run: >
         echo C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\bin
@@ -145,7 +145,7 @@ jobs:
     runs-on: windows-latest
     steps:
    - uses: actions/checkout@v4
-    - uses: microsoft/setup-msbuild@v1.1
+    - uses: microsoft/setup-msbuild@v2
    # Note: as per the CMake documentation, DESTDIR is unsupported on Windows.
    - run: >
        cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}}
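
For context, the runtime dispatch added in patch 1 (arch_select_crc32_func()) keeps the new VPCLMULQDQ code paths invisible to callers, who continue to use libdeflate's existing public CRC-32 API.  The following is a minimal usage sketch, not part of the patch series; it assumes the public libdeflate_crc32() function declared in libdeflate.h, and the LIBDEFLATE_DISABLE_CPU_FEATURES variable mentioned in the comment only takes effect in builds made with the special test-support flag used by scripts/checksum_benchmarks.sh above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <libdeflate.h>

int
main(void)
{
	static const char data[] = "hello, world";
	uint32_t crc;

	/*
	 * libdeflate_crc32(0, buf, len) computes the CRC-32 of buf.  The
	 * fastest implementation supported by the CPU (e.g. one of the new
	 * VPCLMULQDQ ones) is selected internally at runtime.  In a
	 * test-support build, setting e.g.
	 * LIBDEFLATE_DISABLE_CPU_FEATURES=avx512f,vpclmulqdq in the
	 * environment forces the narrower code paths for comparison, which is
	 * how the benchmark script isolates each implementation.
	 */
	crc = libdeflate_crc32(0, data, strlen(data));
	printf("crc32 = 0x%08x\n", (unsigned)crc);
	return 0;
}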