From 5f2a0b4beca9ff017ba016ee38a55b580fa66206 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 18 Feb 2024 14:45:54 -0800
Subject: [PATCH 1/3] lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32

This improves CRC-32 performance on some of the latest x86 CPUs.  Three
implementations are added: VPCLMULQDQ/AVX2 and VPCLMULQDQ/AVX512VL which
use 256-bit vectors, and VPCLMULQDQ/AVX512F/AVX512VL which uses 512-bit
vectors.  To reduce downclocking effects, the implementation with
512-bit vectors isn't used on Intel CPUs 10th generation and older.
---
 lib/arm/crc32_impl.h            |   8 +-
 lib/arm/crc32_pmull_wide.h      |  10 +-
 lib/crc32.c                     |   2 +-
 lib/crc32_multipliers.h         | 122 ++++++---
 lib/x86/cpu_features.c          |  54 +++-
 lib/x86/cpu_features.h          |  65 ++++-
 lib/x86/crc32_impl.h            | 132 +++++++---
 lib/x86/crc32_pclmul_template.h | 443 +++++++++++++++++++++-----------
 scripts/checksum_benchmarks.sh  |  33 ++-
 scripts/gen_crc32_multipliers.c |  23 +-
 scripts/run_tests.sh            |   3 +-
 11 files changed, 637 insertions(+), 258 deletions(-)

diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h
index c802cdf0..d6ea30c0 100644
--- a/lib/arm/crc32_impl.h
+++ b/lib/arm/crc32_impl.h
@@ -474,12 +474,12 @@ static u32 ATTRIBUTES MAYBE_UNUSED
 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 {
 	static const u64 _aligned_attribute(16) mults[3][2] = {
-		CRC32_1VECS_MULTS,
-		CRC32_4VECS_MULTS,
-		CRC32_2VECS_MULTS,
+		{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
 	static const u64 _aligned_attribute(16) final_mults[3][2] = {
-		{ CRC32_FINAL_MULT, 0 },
+		{ CRC32_X63_MODG, 0 },
 		{ CRC32_BARRETT_CONSTANT_1, 0 },
 		{ CRC32_BARRETT_CONSTANT_2, 0 },
 	};
diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h
index a72e1d87..c2f8af06 100644
--- a/lib/arm/crc32_pmull_wide.h
+++ b/lib/arm/crc32_pmull_wide.h
@@ -59,7 +59,9 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 
 	if (len < 3 * 192) {
 		static const u64 _aligned_attribute(16) mults[3][2] = {
-			CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS,
+			{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+			{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
+			{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
 		};
 		poly64x2_t multipliers_4, multipliers_2, multipliers_1;
 
@@ -97,8 +99,10 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 			v0 = fold_vec(v0, v1, multipliers_1);
 	} else {
 		static const u64 _aligned_attribute(16) mults[4][2] = {
-			CRC32_12VECS_MULTS, CRC32_6VECS_MULTS,
-			CRC32_3VECS_MULTS, CRC32_1VECS_MULTS,
+			{ CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
+			{ CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */
+			{ CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */
+			{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
 		};
 		const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
 		const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
diff --git a/lib/crc32.c b/lib/crc32.c
index c3a4da48..a0ec0223 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -165,7 +165,7 @@
  * intermediate remainder (which we never actually store explicitly) is 96 bits.
  *
  * On CPUs that support fast carryless multiplication, CRCs can be computed even
- * more quickly via "folding".  See e.g. the x86 PCLMUL implementation.
+ * more quickly via "folding".  See e.g. the x86 PCLMUL implementations.
*/ #include "lib_common.h" diff --git a/lib/crc32_multipliers.h b/lib/crc32_multipliers.h index 580b775b..65a9bf3e 100644 --- a/lib/crc32_multipliers.h +++ b/lib/crc32_multipliers.h @@ -4,55 +4,103 @@ * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT. */ -#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */ -#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 } +#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */ +#define CRC32_X95_MODG 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */ -#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */ -#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 } +#define CRC32_X287_MODG 0xf1da05aa /* x^287 mod G(x) */ +#define CRC32_X223_MODG 0x81256527 /* x^223 mod G(x) */ -#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */ -#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 } +#define CRC32_X415_MODG 0x3db1ecdc /* x^415 mod G(x) */ +#define CRC32_X351_MODG 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */ -#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 } +#define CRC32_X543_MODG 0x8f352d95 /* x^543 mod G(x) */ +#define CRC32_X479_MODG 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */ -#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 } +#define CRC32_X671_MODG 0x1c279815 /* x^671 mod G(x) */ +#define CRC32_X607_MODG 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */ -#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 } +#define CRC32_X799_MODG 0xdf068dc2 /* x^799 mod G(x) */ +#define CRC32_X735_MODG 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */ -#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 } +#define CRC32_X927_MODG 0x31f8303f /* x^927 mod G(x) */ +#define CRC32_X863_MODG 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */ -#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 } +#define CRC32_X1055_MODG 0x33fff533 /* x^1055 mod G(x) */ +#define CRC32_X991_MODG 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */ -#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 } +#define CRC32_X1183_MODG 0x26b70c3d /* x^1183 mod G(x) */ +#define CRC32_X1119_MODG 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */ -#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 } +#define CRC32_X1311_MODG 0xe3543be0 /* x^1311 mod G(x) */ +#define CRC32_X1247_MODG 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */ -#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 } +#define CRC32_X1439_MODG 
0x5a1bb05d /* x^1439 mod G(x) */ +#define CRC32_X1375_MODG 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */ -#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 } +#define CRC32_X1567_MODG 0x596c8d81 /* x^1567 mod G(x) */ +#define CRC32_X1503_MODG 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */ +#define CRC32_X1695_MODG 0x682bdd4f /* x^1695 mod G(x) */ +#define CRC32_X1631_MODG 0x3c656ced /* x^1631 mod G(x) */ + +#define CRC32_X1823_MODG 0x4a28bd43 /* x^1823 mod G(x) */ +#define CRC32_X1759_MODG 0xfe807bbd /* x^1759 mod G(x) */ + +#define CRC32_X1951_MODG 0x0077f00d /* x^1951 mod G(x) */ +#define CRC32_X1887_MODG 0x1f0c2cdd /* x^1887 mod G(x) */ + +#define CRC32_X2079_MODG 0xce3371cb /* x^2079 mod G(x) */ +#define CRC32_X2015_MODG 0xe95c1271 /* x^2015 mod G(x) */ + +#define CRC32_X2207_MODG 0xa749e894 /* x^2207 mod G(x) */ +#define CRC32_X2143_MODG 0xb918a347 /* x^2143 mod G(x) */ + +#define CRC32_X2335_MODG 0x2c538639 /* x^2335 mod G(x) */ +#define CRC32_X2271_MODG 0x71d54a59 /* x^2271 mod G(x) */ + +#define CRC32_X2463_MODG 0x32b0733c /* x^2463 mod G(x) */ +#define CRC32_X2399_MODG 0xff6f2fc2 /* x^2399 mod G(x) */ + +#define CRC32_X2591_MODG 0x0e9bd5cc /* x^2591 mod G(x) */ +#define CRC32_X2527_MODG 0xcec97417 /* x^2527 mod G(x) */ + +#define CRC32_X2719_MODG 0x76278617 /* x^2719 mod G(x) */ +#define CRC32_X2655_MODG 0x1c63267b /* x^2655 mod G(x) */ + +#define CRC32_X2847_MODG 0xc51b93e3 /* x^2847 mod G(x) */ +#define CRC32_X2783_MODG 0xf183c71b /* x^2783 mod G(x) */ + +#define CRC32_X2975_MODG 0x7eaed122 /* x^2975 mod G(x) */ +#define CRC32_X2911_MODG 0x9b9bdbd0 /* x^2911 mod G(x) */ + +#define CRC32_X3103_MODG 0x2ce423f1 /* x^3103 mod G(x) */ +#define CRC32_X3039_MODG 0xd31343ea /* x^3039 mod G(x) */ + +#define CRC32_X3231_MODG 0x8b8d8645 /* x^3231 mod G(x) */ +#define CRC32_X3167_MODG 0x4470ac44 /* x^3167 mod G(x) */ + +#define CRC32_X3359_MODG 0x4b700aa8 /* x^3359 mod G(x) */ +#define CRC32_X3295_MODG 0xeea395c4 /* x^3295 mod G(x) */ + +#define CRC32_X3487_MODG 0xeff5e99d /* x^3487 mod G(x) */ +#define CRC32_X3423_MODG 0xf9d9c7ee /* x^3423 mod G(x) */ + +#define CRC32_X3615_MODG 0xad0d2bb2 /* x^3615 mod G(x) */ +#define CRC32_X3551_MODG 0xcd669a40 /* x^3551 mod G(x) */ + +#define CRC32_X3743_MODG 0x9fb66bd3 /* x^3743 mod G(x) */ +#define CRC32_X3679_MODG 0x6d40f445 /* x^3679 mod G(x) */ + +#define CRC32_X3871_MODG 0xc2dcc467 /* x^3871 mod G(x) */ +#define CRC32_X3807_MODG 0x9ee62949 /* x^3807 mod G(x) */ + +#define CRC32_X3999_MODG 0x398e2ff2 /* x^3999 mod G(x) */ +#define CRC32_X3935_MODG 0x145575d5 /* x^3935 mod G(x) */ + +#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */ +#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */ + +#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */ #define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */ #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */ #define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 } diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index 8df855ff..fa8fd019 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -86,32 +86,71 @@ read_xcr(u32 index) static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, - {X86_CPU_FEATURE_PCLMUL, "pclmul"}, + {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"}, {X86_CPU_FEATURE_AVX, "avx"}, 
{X86_CPU_FEATURE_AVX2, "avx2"}, {X86_CPU_FEATURE_BMI2, "bmi2"}, + {X86_CPU_FEATURE_AVX512F, "avx512f"}, + {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, + {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, }; volatile u32 libdeflate_x86_cpu_features = 0; +/* + * Don't use 512-bit vectors on Intel CPUs 10th generation and older, due to the + * downclocking penalty. + */ +static inline bool +allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model) +{ +#ifdef TEST_SUPPORT__DO_NOT_USE + return true; +#endif + if (memcmp(manufacturer, "GenuineIntel", 12) != 0) + return true; + if (family != 6) + return true; + switch (model) { + case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */ + case 106: /* Ice Lake (Server) */ + case 108: /* Ice Lake (Server) */ + case 126: /* Ice Lake (Client) */ + case 140: /* Tiger Lake */ + case 141: /* Tiger Lake */ + return false; + } + return true; +} + /* Initialize libdeflate_x86_cpu_features. */ void libdeflate_init_x86_cpu_features(void) { - u32 max_leaf, a, b, c, d; + u32 max_leaf; + u32 manufacturer[3]; + u32 family, model; + u32 a, b, c, d; u64 xcr0 = 0; u32 features = 0; /* EAX=0: Highest Function Parameter and Manufacturer ID */ - cpuid(0, 0, &max_leaf, &b, &c, &d); + cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2], + &manufacturer[1]); if (max_leaf < 1) goto out; /* EAX=1: Processor Info and Feature Bits */ cpuid(1, 0, &a, &b, &c, &d); + family = (a >> 8) & 0xf; + model = (a >> 4) & 0xf; + if (family == 6 || family == 0xf) + model += (a >> 12) & 0xf0; + if (family == 0xf) + family += (a >> 20) & 0xff; if (d & (1 << 26)) features |= X86_CPU_FEATURE_SSE2; if (c & (1 << 1)) - features |= X86_CPU_FEATURE_PCLMUL; + features |= X86_CPU_FEATURE_PCLMULQDQ; if (c & (1 << 27)) xcr0 = read_xcr(0); if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) @@ -126,6 +165,13 @@ void libdeflate_init_x86_cpu_features(void) features |= X86_CPU_FEATURE_AVX2; if (b & (1 << 8)) features |= X86_CPU_FEATURE_BMI2; + if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) && + allow_512bit_vectors(manufacturer, family, model)) + features |= X86_CPU_FEATURE_AVX512F; + if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6)) + features |= X86_CPU_FEATURE_AVX512VL; + if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_VPCLMULQDQ; out: disable_cpu_features_for_testing(&features, x86_cpu_feature_table, diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index ad14e435..b5fbf573 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -40,16 +40,22 @@ #endif #define X86_CPU_FEATURE_SSE2 0x00000001 -#define X86_CPU_FEATURE_PCLMUL 0x00000002 +#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002 #define X86_CPU_FEATURE_AVX 0x00000004 #define X86_CPU_FEATURE_AVX2 0x00000008 #define X86_CPU_FEATURE_BMI2 0x00000010 +#define X86_CPU_FEATURE_AVX512F 0x00000020 +#define X86_CPU_FEATURE_AVX512VL 0x00000040 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000080 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) -#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL)) +#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) #define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX)) #define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) #define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) +#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) 
+#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL)) +#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) #if HAVE_DYNAMIC_X86_CPU_FEATURES #define X86_CPU_FEATURES_KNOWN 0x80000000 @@ -90,18 +96,18 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #endif #define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) -/* PCLMUL */ +/* PCLMULQDQ */ #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) -# define HAVE_PCLMUL_NATIVE 1 +# define HAVE_PCLMULQDQ_NATIVE 1 #else -# define HAVE_PCLMUL_NATIVE 0 +# define HAVE_PCLMULQDQ_NATIVE 0 #endif -#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ - defined(_MSC_VER))) -# define HAVE_PCLMUL_INTRIN 1 +#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER))) +# define HAVE_PCLMULQDQ_INTRIN 1 #else -# define HAVE_PCLMUL_INTRIN 0 +# define HAVE_PCLMULQDQ_INTRIN 0 #endif /* AVX */ @@ -156,6 +162,45 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_BMI2_INTRIN 0 #endif +/* AVX-512F */ +#ifdef __AVX512F__ +# define HAVE_AVX512F_NATIVE 1 +#else +# define HAVE_AVX512F_NATIVE 0 +#endif +#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512F_INTRIN 1 +#else +# define HAVE_AVX512F_INTRIN 0 +#endif + +/* AVX-512VL */ +#ifdef __AVX512VL__ +# define HAVE_AVX512VL_NATIVE 1 +#else +# define HAVE_AVX512VL_NATIVE 0 +#endif +#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512VL_INTRIN 1 +#else +# define HAVE_AVX512VL_INTRIN 0 +#endif + +/* VPCLMULQDQ */ +#ifdef __VPCLMULQDQ__ +# define HAVE_VPCLMULQDQ_NATIVE 1 +#else +# define HAVE_VPCLMULQDQ_NATIVE 0 +#endif +#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ + defined(_MSC_VER)) +# define HAVE_VPCLMULQDQ_INTRIN 1 +#else +# define HAVE_VPCLMULQDQ_INTRIN 0 +#endif + #endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index 79cc7944..e818d0a6 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -30,67 +30,137 @@ #include "cpu_features.h" -/* PCLMUL implementation */ -#if HAVE_PCLMUL_INTRIN -# define crc32_x86_pclmul crc32_x86_pclmul -# define SUFFIX _pclmul -# if HAVE_PCLMUL_NATIVE +/* PCLMULQDQ implementation */ +#if HAVE_PCLMULQDQ_INTRIN +# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq +# define SUFFIX _pclmulqdq +# if HAVE_PCLMULQDQ_NATIVE # define ATTRIBUTES # else # define ATTRIBUTES _target_attribute("pclmul") # endif -# define FOLD_PARTIAL_VECS 0 +# define VL 16 +# define FOLD_LESSTHAN16BYTES 0 +# define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" #endif /* - * PCLMUL/AVX implementation. This implementation has two benefits over the - * regular PCLMUL one. First, simply compiling against the AVX target can - * improve performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) - * without actually using any AVX intrinsics, probably due to the availability - * of non-destructive VEX-encoded instructions. Second, AVX support implies - * SSSE3 and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for - * efficient handling of partial blocks. (We *could* compile a variant with - * PCLMUL+SSSE3+SSE4.1 w/o AVX, but for simplicity we don't currently bother.) 
+ * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ + * implementation, this still uses 128-bit vectors, but it has two potential + * benefits. First, simply compiling against the AVX target can improve + * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without + * actually using any AVX intrinsics, probably due to the availability of + * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3 + * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient + * handling of partial blocks. (We *could* compile a variant with + * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) * * FIXME: with MSVC, this isn't actually compiled with AVX code generation * enabled yet. That would require that this be moved to its own .c file. */ -#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN -# define crc32_x86_pclmul_avx crc32_x86_pclmul_avx -# define SUFFIX _pclmul_avx -# if HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE +#if HAVE_PCLMULQDQ_INTRIN && HAVE_AVX_INTRIN +# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx +# define SUFFIX _pclmulqdq_avx +# if HAVE_PCLMULQDQ_NATIVE && HAVE_AVX_NATIVE # define ATTRIBUTES # else # define ATTRIBUTES _target_attribute("pclmul,avx") # endif -# define FOLD_PARTIAL_VECS 1 +# define VL 16 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 +# include "crc32_pclmul_template.h" +#endif + +/* VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. */ +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX2_INTRIN && \ + /* + * This has to be disabled on MSVC because MSVC has a bug where it + * incorrectly assumes that VPCLMULQDQ implies AVX-512: + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + */ \ + !(defined(_MSC_VER) && !defined(__clang__)) +# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 +# define SUFFIX _vpclmulqdq_avx2 +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX2_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") +# endif +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" #endif /* - * If the best implementation is statically available, use it unconditionally. - * Otherwise choose the best implementation at runtime. + * VPCLMULQDQ/AVX512VL implementation. This takes advantage of some AVX-512 + * instructions but uses 256-bit vectors rather than 512-bit. This can be + * useful on CPUs where 512-bit vectors cause downclocking. */ -#if defined(crc32_x86_pclmul_avx) && HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE -#define DEFAULT_IMPL crc32_x86_pclmul_avx -#else +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX512VL_INTRIN +# define crc32_x86_vpclmulqdq_avx512vl crc32_x86_vpclmulqdq_avx512vl +# define SUFFIX _vpclmulqdq_avx512vl +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX512VL_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# endif +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "crc32_pclmul_template.h" +#endif + +/* VPCLMULQDQ/AVX512F/AVX512VL implementation. Uses 512-bit vectors. 
*/ +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ + HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN +# define crc32_x86_vpclmulqdq_avx512f_avx512vl crc32_x86_vpclmulqdq_avx512f_avx512vl +# define SUFFIX _vpclmulqdq_avx512f_avx512vl +#if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ + HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512f,avx512vl") +# endif +# define VL 64 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "crc32_pclmul_template.h" +#endif + +/* Choose the best implementation at runtime. */ static inline crc32_func_t arch_select_crc32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef crc32_x86_pclmul_avx - if (HAVE_PCLMUL(features) && HAVE_AVX(features)) - return crc32_x86_pclmul_avx; +#ifdef crc32_x86_vpclmulqdq_avx512f_avx512vl + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512f_avx512vl; +#endif +#ifdef crc32_x86_vpclmulqdq_avx512vl + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512vl; +#endif +#ifdef crc32_x86_vpclmulqdq_avx2 + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX2(features)) + return crc32_x86_vpclmulqdq_avx2; #endif -#ifdef crc32_x86_pclmul - if (HAVE_PCLMUL(features)) - return crc32_x86_pclmul; +#ifdef crc32_x86_pclmulqdq_avx + if (HAVE_PCLMULQDQ(features) && HAVE_AVX(features)) + return crc32_x86_pclmulqdq_avx; +#endif +#ifdef crc32_x86_pclmulqdq + if (HAVE_PCLMULQDQ(features)) + return crc32_x86_pclmulqdq; #endif return NULL; } #define arch_select_crc32_func arch_select_crc32_func -#endif #endif /* LIB_X86_CRC32_IMPL_H */ diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 1d578237..16e4ebf5 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -32,11 +32,20 @@ * SUFFIX: * Name suffix to append to all instantiated functions. * ATTRIBUTES: - * Target function attributes to use. - * FOLD_PARTIAL_VECS: + * Target function attributes to use. Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && FOLD_LESSTHAN16BYTES=0: at least pclmul + * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 + * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 + * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl + * VL=64: at least vpclmulqdq,pclmul,avx512f,avx512vl + * VL: + * Vector length in bytes. Supported values are 16, 32, and 64. + * FOLD_LESSTHAN16BYTES: * Use vector instructions to handle any partial blocks at the beginning * and end, instead of falling back to scalar instructions for those parts. - * Requires SSSE3 and SSE4.1 intrinsics. + * USE_TERNARYLOGIC: + * Use the vpternlog instruction to do three-argument XORs. * * The overall algorithm used is CRC folding with carryless multiplication * instructions. Note that the x86 crc32 instruction cannot be used, as it is @@ -46,6 +55,11 @@ * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + * + * The original pclmulqdq instruction does one 64x64 to 128-bit carryless + * multiplication. 
The VPCLMULQDQ feature added instructions that do two + * parallel 64x64 to 128-bit carryless multiplications in combination with AVX + * or AVX512VL, or four in combination with AVX512F. */ #include @@ -57,41 +71,110 @@ # include # include # include +# include +# include +# include +# include +# include #endif -#undef fold_vec +#undef fold_vec128 static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __m128i /* __v2di */ multipliers) +ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) { - /* - * The immediate constant for PCLMULQDQ specifies which 64-bit halves of - * the 128-bit vectors to multiply: - * - * 0x00 means low halves (higher degree polynomial terms for us) - * 0x11 means high halves (lower degree polynomial terms for us) - */ dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); return dst; } -#define fold_vec ADD_SUFFIX(fold_vec) +#define fold_vec128 ADD_SUFFIX(fold_vec128) -#if FOLD_PARTIAL_VECS +#if VL >= 32 +#undef fold_vec256 +static forceinline ATTRIBUTES __m256i +ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i multipliers) +{ +#if USE_TERNARYLOGIC + return _mm256_ternarylogic_epi32( + _mm256_clmulepi64_epi128(src, multipliers, 0x00), + _mm256_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +#else + return _mm256_xor_si256( + _mm256_xor_si256(dst, + _mm256_clmulepi64_epi128(src, multipliers, 0x00)), + _mm256_clmulepi64_epi128(src, multipliers, 0x11)); +#endif +} +#define fold_vec256 ADD_SUFFIX(fold_vec256) +#endif /* VL >= 32 */ + +#if VL >= 64 +#undef fold_vec512 +static forceinline ATTRIBUTES __m512i +ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) +{ + return _mm512_ternarylogic_epi32( + _mm512_clmulepi64_epi128(src, multipliers, 0x00), + _mm512_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +} +#define fold_vec512 ADD_SUFFIX(fold_vec512) +#endif /* VL >= 64 */ + +#if VL == 16 +# define vec_t __m128i +# define fold_vec fold_vec128 +# define VLOAD_UNALIGNED(p) _mm_loadu_si128((const void *)(p)) +# define VXOR(a, b) _mm_xor_si128((a), (b)) +# define M128I_TO_VEC(a) a +# define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG) +# define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG) +#elif VL == 32 +# define vec_t __m256i +# define fold_vec fold_vec256 +# define VLOAD_UNALIGNED(p) _mm256_loadu_si256((const void *)(p)) +# define VXOR(a, b) _mm256_xor_si256((a), (b)) +# define M128I_TO_VEC(a) _mm256_castsi128_si256(a) +# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) +# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG) +#elif VL == 64 +# define vec_t __m512i +# define fold_vec fold_vec512 +# define VLOAD_UNALIGNED(p) _mm512_loadu_si512((const void *)(p)) +# define VXOR(a, b) _mm512_xor_si512((a), (b)) +# define M128I_TO_VEC(a) _mm512_castsi128_si512(a) +# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) +# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG) +# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_1V MULTS(CRC32_X479_MODG, 
CRC32_X543_MODG) +#else +# error "unsupported vector length" +#endif + +#if FOLD_LESSTHAN16BYTES /* - * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the - * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the - * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, - * respectively. Then fold x0 into x1 and return the result. Assumes that - * 'p + len - 16' is in-bounds. + * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to + * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and + * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, + * respectively. Then fold x0 into x1 and return the result. + * Assumes that 'p + len - 16' is in-bounds. */ -#undef fold_partial_vec +#undef fold_lessthan16bytes static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, - __m128i /* __v2du */ multipliers_1) +ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, + __m128i /* __v2du */ multipliers_128b) { /* - * pshufb(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. - * pshufb(v, shift_tab[len+16..len+31]) right shifts v by len bytes. + * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. + * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. */ static const u8 shift_tab[48] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, @@ -105,41 +188,40 @@ ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]); __m128i x0, x1; - /* x0 = v left-shifted by '16 - len' bytes */ - x0 = _mm_shuffle_epi8(v, lshift); + /* x0 = x left-shifted by '16 - len' bytes */ + x0 = _mm_shuffle_epi8(x, lshift); /* - * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' + * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len' * bytes) followed by the remaining data. 
*/ - x1 = _mm_blendv_epi8(_mm_shuffle_epi8(v, rshift), + x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift), _mm_loadu_si128((const void *)(p + len - 16)), /* msb 0/1 of each byte selects byte from arg1/2 */ rshift); - return fold_vec(x0, x1, multipliers_1); + return fold_vec128(x0, x1, multipliers_128b); } -#define fold_partial_vec ADD_SUFFIX(fold_partial_vec) -#endif /* FOLD_PARTIAL_VECS */ +#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes) +#endif /* FOLD_LESSTHAN16BYTES */ -static u32 ATTRIBUTES MAYBE_UNUSED +static u32 ATTRIBUTES ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) { - const __m128i /* __v2du */ multipliers_8 = - _mm_set_epi64x(CRC32_8VECS_MULT_2, CRC32_8VECS_MULT_1); - const __m128i /* __v2du */ multipliers_4 = - _mm_set_epi64x(CRC32_4VECS_MULT_2, CRC32_4VECS_MULT_1); - const __m128i /* __v2du */ multipliers_2 = - _mm_set_epi64x(CRC32_2VECS_MULT_2, CRC32_2VECS_MULT_1); - const __m128i /* __v2du */ multipliers_1 = - _mm_set_epi64x(CRC32_1VECS_MULT_2, CRC32_1VECS_MULT_1); + const vec_t multipliers_8v = MULTS_8V; /* 8 vecs */ + const vec_t multipliers_4v = MULTS_4V; /* 4 vecs */ + const vec_t multipliers_2v = MULTS_2V; /* 2 vecs */ + const vec_t multipliers_1v = MULTS_1V; /* 1 vecs */ + const __m128i /* __v2du */ multipliers_128b = + _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); const __m128i /* __v2du */ final_multiplier = - _mm_set_epi64x(0, CRC32_FINAL_MULT); + _mm_set_epi64x(0, CRC32_X63_MODG); const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); const __m128i /* __v2du */ barrett_reduction_constants = _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1); - __m128i v0, v1, v2, v3, v4, v5, v6, v7; + vec_t v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1; /* * There are two overall code paths. The first path supports all @@ -148,83 +230,102 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * lengths, aligns the pointer in order to do aligned loads, and does up * to 8-way folds. The length check below decides which path to take. 
*/ - if (len < 1024) { - if (len < 16) + if (len < 64*VL) { + if (len < VL) return crc32_slice1(crc, p, len); - v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), - _mm_cvtsi32_si128(crc)); - p += 16; + v0 = VXOR(VLOAD_UNALIGNED(p), + M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + p += VL; - if (len >= 64) { - v1 = _mm_loadu_si128((const void *)(p + 0)); - v2 = _mm_loadu_si128((const void *)(p + 16)); - v3 = _mm_loadu_si128((const void *)(p + 32)); - p += 48; - while (len >= 64 + 64) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)(p + 0)), - multipliers_4); - v1 = fold_vec(v1, _mm_loadu_si128((const void *)(p + 16)), - multipliers_4); - v2 = fold_vec(v2, _mm_loadu_si128((const void *)(p + 32)), - multipliers_4); - v3 = fold_vec(v3, _mm_loadu_si128((const void *)(p + 48)), - multipliers_4); - p += 64; - len -= 64; + if (len >= 4*VL) { + v1 = VLOAD_UNALIGNED(p + 0*VL); + v2 = VLOAD_UNALIGNED(p + 1*VL); + v3 = VLOAD_UNALIGNED(p + 2*VL); + p += 3*VL; + while (len >= 8*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_4v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_4v); + v2 = fold_vec(v2, VLOAD_UNALIGNED(p + 2*VL), + multipliers_4v); + v3 = fold_vec(v3, VLOAD_UNALIGNED(p + 3*VL), + multipliers_4v); + p += 4*VL; + len -= 4*VL; } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); - if (len & 32) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)(p + 0)), - multipliers_2); - v1 = fold_vec(v1, _mm_loadu_si128((const void *)(p + 16)), - multipliers_2); - p += 32; + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_2v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_2v); + p += 2*VL; } - v0 = fold_vec(v0, v1, multipliers_1); - if (len & 16) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)p), - multipliers_1); - p += 16; + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; } } else { - if (len >= 32) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)p), - multipliers_1); - p += 16; - if (len >= 48) { - v0 = fold_vec(v0, _mm_loadu_si128((const void *)p), - multipliers_1); - p += 16; + if (len >= 2*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + if (len >= 3*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; } } } } else { - const size_t align = -(uintptr_t)p & 15; - const __m128i *vp; + size_t align = -(uintptr_t)p & (VL-1); + const vec_t *vp; - #if FOLD_PARTIAL_VECS - v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), - _mm_cvtsi32_si128(crc)); - p += 16; - /* Align p to the next 16-byte boundary. */ - if (align) { - v0 = fold_partial_vec(v0, p, align, multipliers_1); - p += align; + /* Align p to the next VL-byte boundary. */ + if (align == 0) { + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + } else { len -= align; - } - vp = (const __m128i *)p; - #else - /* Align p to the next 16-byte boundary. 
*/ - if (align) { + #if FOLD_LESSTHAN16BYTES + x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); + p += 16; + if (align & 15) { + x0 = fold_lessthan16bytes(x0, p, align & 15, + multipliers_128b); + p += align & 15; + align &= ~15; + } + while (align >= 16) { + x0 = fold_vec128(x0, *(const __m128i *)p, + multipliers_128b); + p += 16; + align -= 16; + } + v0 = M128I_TO_VEC(x0); + # if VL == 32 + v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1); + p += 16; + # elif VL == 64 + v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1); + p += 16; + v0 = _mm512_inserti64x4(v0, *(const __m256i *)p, 1); + p += 32; + # endif + vp = (const vec_t *)p; + #else crc = crc32_slice1(crc, p, align); p += align; - len -= align; + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + #endif } - vp = (const __m128i *)p; - v0 = _mm_xor_si128(*vp++, _mm_cvtsi32_si128(crc)); - #endif v1 = *vp++; v2 = *vp++; v3 = *vp++; @@ -233,50 +334,85 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v6 = *vp++; v7 = *vp++; do { - v0 = fold_vec(v0, *vp++, multipliers_8); - v1 = fold_vec(v1, *vp++, multipliers_8); - v2 = fold_vec(v2, *vp++, multipliers_8); - v3 = fold_vec(v3, *vp++, multipliers_8); - v4 = fold_vec(v4, *vp++, multipliers_8); - v5 = fold_vec(v5, *vp++, multipliers_8); - v6 = fold_vec(v6, *vp++, multipliers_8); - v7 = fold_vec(v7, *vp++, multipliers_8); - len -= 128; - } while (len >= 128 + 128); + v0 = fold_vec(v0, *vp++, multipliers_8v); + v1 = fold_vec(v1, *vp++, multipliers_8v); + v2 = fold_vec(v2, *vp++, multipliers_8v); + v3 = fold_vec(v3, *vp++, multipliers_8v); + v4 = fold_vec(v4, *vp++, multipliers_8v); + v5 = fold_vec(v5, *vp++, multipliers_8v); + v6 = fold_vec(v6, *vp++, multipliers_8v); + v7 = fold_vec(v7, *vp++, multipliers_8v); + len -= 8*VL; + } while (len >= 16*VL); - v0 = fold_vec(v0, v4, multipliers_4); - v1 = fold_vec(v1, v5, multipliers_4); - v2 = fold_vec(v2, v6, multipliers_4); - v3 = fold_vec(v3, v7, multipliers_4); - if (len & 64) { - v0 = fold_vec(v0, *vp++, multipliers_4); - v1 = fold_vec(v1, *vp++, multipliers_4); - v2 = fold_vec(v2, *vp++, multipliers_4); - v3 = fold_vec(v3, *vp++, multipliers_4); + /* + * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) + * and fold in any VL-byte data segments that remain. + */ + v0 = fold_vec(v0, v4, multipliers_4v); + v1 = fold_vec(v1, v5, multipliers_4v); + v2 = fold_vec(v2, v6, multipliers_4v); + v3 = fold_vec(v3, v7, multipliers_4v); + if (len & (4*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_4v); + v1 = fold_vec(v1, *vp++, multipliers_4v); + v2 = fold_vec(v2, *vp++, multipliers_4v); + v3 = fold_vec(v3, *vp++, multipliers_4v); + } + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_2v); + v1 = fold_vec(v1, *vp++, multipliers_2v); } + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) + v0 = fold_vec(v0, *vp++, multipliers_1v); + p = (const u8 *)vp; + } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); + /* + * Reduce v0 (length VL bytes) to x0 (length 16 bytes) + * and fold in any 16-byte data segments that remain. 
+ */ +#if VL == 16 + x0 = v0; +#else + { +# if VL == 32 + __m256i y0 = v0; +# else + const __m256i multipliers_256b = + _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG, + CRC32_X223_MODG, CRC32_X287_MODG); + __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0), + _mm512_extracti64x4_epi64(v0, 1), + multipliers_256b); if (len & 32) { - v0 = fold_vec(v0, *vp++, multipliers_2); - v1 = fold_vec(v1, *vp++, multipliers_2); + y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p), + multipliers_256b); + p += 32; } - - v0 = fold_vec(v0, v1, multipliers_1); - if (len & 16) - v0 = fold_vec(v0, *vp++, multipliers_1); - - p = (const u8 *)vp; +# endif + x0 = fold_vec128(_mm256_extracti128_si256(y0, 0), + _mm256_extracti128_si256(y0, 1), + multipliers_128b); } + if (len & 16) { + x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p), + multipliers_128b); + p += 16; + } +#endif len &= 15; /* - * If fold_partial_vec() is available, handle any remaining partial - * block now before reducing to 32 bits. + * If fold_lessthan16bytes() is available, handle any remainder + * of 1 to 15 bytes now, before reducing to 32 bits. */ -#if FOLD_PARTIAL_VECS +#if FOLD_LESSTHAN16BYTES if (len) - v0 = fold_partial_vec(v0, p, len, multipliers_1); + x0 = fold_lessthan16bytes(x0, p, len, multipliers_128b); #endif /* @@ -284,12 +420,12 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * which is equivalent to multiplying by x^32. This is needed because * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). */ - v0 = _mm_xor_si128(_mm_srli_si128(v0, 8), - _mm_clmulepi64_si128(v0, multipliers_1, 0x10)); + x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), + _mm_clmulepi64_si128(x0, multipliers_128b, 0x10)); /* Fold 96 => 64 bits. */ - v0 = _mm_xor_si128(_mm_srli_si128(v0, 4), - _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), + _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00)); /* @@ -334,21 +470,34 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * R(x) = B(x) + G(x)*floor ( ------------------------- ) * \ x^32 / */ - v1 = _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00); - v1 = _mm_clmulepi64_si128(_mm_and_si128(v1, mask32), + x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), barrett_reduction_constants, 0x10); - v0 = _mm_xor_si128(v0, v1); -#if FOLD_PARTIAL_VECS - crc = _mm_extract_epi32(v0, 1); + x0 = _mm_xor_si128(x0, x1); +#if FOLD_LESSTHAN16BYTES + crc = _mm_extract_epi32(x0, 1); #else - crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(v0, 0x01)); + crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01)); /* Process up to 15 bytes left over at the end. */ crc = crc32_slice1(crc, p, len); #endif return crc; } +#undef vec_t +#undef fold_vec +#undef VLOAD_UNALIGNED +#undef VXOR +#undef M128I_TO_VEC +#undef MULTS +#undef MULTS_8V +#undef MULTS_4V +#undef MULTS_2V +#undef MULTS_1V + #undef SUFFIX #undef ATTRIBUTES -#undef FOLD_PARTIAL_VECS +#undef VL +#undef FOLD_LESSTHAN16BYTES +#undef USE_TERNARYLOGIC diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 83667608..b359fba3 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -16,6 +16,13 @@ have_cpu_feature() { grep -q "^$tag"$'[ \t]'"*:.*\<$feature\>" /proc/cpuinfo } +have_cpu_features() { + local feature + for feature; do + have_cpu_feature "$feature" || return 1 + done +} + make_and_test() { # Build the checksum program and tests. 
Set the special test support
 	# flag to get support for LIBDEFLATE_DISABLE_CPU_FEATURES.
@@ -37,7 +44,7 @@ __do_benchmark() {
 	speed=$(./build/programs/checksum "${CKSUM_FLAGS[@]}" \
 		"${flags[@]}" -t "$FILE" | \
 		grep -o '[0-9]\+ MB/s' | grep -o '[0-9]\+')
-	printf "%-45s%-10s\n" "$CKSUM_NAME ($impl)" "$speed"
+	printf "%-60s%-10s\n" "$CKSUM_NAME ($impl)" "$speed"
 }
 
 do_benchmark() {
@@ -95,8 +102,8 @@ else
 fi
 
 cat << EOF
-Method                                 Speed (MB/s)
-------                                 ------------
+Method                                                Speed (MB/s)
+------                                                ------------
 EOF
 
 # CRC-32
@@ -107,13 +114,25 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES=""
 {
 case $ARCH in
 i386|x86_64)
-	if have_cpu_feature pclmulqdq && have_cpu_feature avx; then
-		do_benchmark "PCLMUL/AVX"
+	if have_cpu_features vpclmulqdq avx512f avx512vl; then
+		do_benchmark "VPCLMULQDQ/AVX512F/AVX512VL"
+		disable_cpu_feature "avx512f" "-mno-avx512f"
+	fi
+	if have_cpu_features vpclmulqdq avx512vl; then
+		do_benchmark "VPCLMULQDQ/AVX512VL"
+		disable_cpu_feature "avx512vl" "-mno-avx512vl"
+	fi
+	if have_cpu_features vpclmulqdq avx2; then
+		do_benchmark "VPCLMULQDQ/AVX2"
+		disable_cpu_feature "vpclmulqdq" "-mno-vpclmulqdq"
+	fi
+	if have_cpu_features pclmulqdq avx; then
+		do_benchmark "PCLMULQDQ/AVX"
 		disable_cpu_feature "avx" "-mno-avx"
 	fi
 	if have_cpu_feature pclmulqdq; then
-		do_benchmark "PCLMUL"
-		disable_cpu_feature "pclmul" "-mno-pclmul"
+		do_benchmark "PCLMULQDQ"
+		disable_cpu_feature "pclmulqdq" "-mno-pclmul"
 	fi
 	;;
 arm*|aarch*)
diff --git a/scripts/gen_crc32_multipliers.c b/scripts/gen_crc32_multipliers.c
index 42470c13..1073bfc8 100644
--- a/scripts/gen_crc32_multipliers.c
+++ b/scripts/gen_crc32_multipliers.c
@@ -97,37 +97,34 @@ gen_vec_folding_constants(void)
 	/*
 	 * Compute the multipliers needed for CRC-32 folding with carryless
 	 * multiplication instructions that operate on the 64-bit halves of
-	 * 128-bit vectors.  Using the terminology from earlier, for each 64-bit
+	 * 128-bit segments.  Using the terminology from earlier, for each 64-bit
 	 * fold len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial
 	 * multiplied by a 32-bit one produces a 95-bit one.  When A(x) is the
-	 * low order polynomial half of a 128-bit vector (high order physical
+	 * low order polynomial half of a 128-bit segment (high order physical
 	 * half), the separation between the message parts is the total length
-	 * of the 128-bit vectors separating the values.  When A(x) is the high
+	 * of the 128-bit segments separating the values.  When A(x) is the high
 	 * order polynomial half, the separation is 64 bits greater.
 	 */
-	for (int num_vecs = 1; num_vecs <= 12; num_vecs++) {
-		const int sep_lo = 128 * (num_vecs - 1);
+	for (int i = 1; i <= 32; i++) {
+		const int sep_lo = 128 * (i - 1);
 		const int sep_hi = sep_lo + 64;
 		const int len_B = 95;
 		int D;
 
 		/* A(x) = high 64 polynomial bits (low 64 physical bits) */
 		D = sep_hi + len_B;
-		printf("#define CRC32_%dVECS_MULT_1 0x%08"PRIx32" /* x^%d mod G(x) */\n",
-		       num_vecs, compute_xD_modG(D), D);
+		printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
+		       D, compute_xD_modG(D), D);
 
 		/* A(x) = low 64 polynomial bits (high 64 physical bits) */
 		D = sep_lo + len_B;
-		printf("#define CRC32_%dVECS_MULT_2 0x%08"PRIx32" /* x^%d mod G(x) */\n",
-		       num_vecs, compute_xD_modG(D), D);
-
-		printf("#define CRC32_%dVECS_MULTS { CRC32_%dVECS_MULT_1, CRC32_%dVECS_MULT_2 }\n",
-		       num_vecs, num_vecs, num_vecs);
+		printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
+		       D, compute_xD_modG(D), D);
 		printf("\n");
 	}
 
 	/* Multiplier for final 96 => 64 bit fold */
-	printf("#define CRC32_FINAL_MULT 0x%08"PRIx32" /* x^63 mod G(x) */\n",
+	printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
 	       compute_xD_modG(63));
 
 	/*
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 44d3b9e2..ff15cc19 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -142,7 +142,8 @@ build_and_run_tests()
 	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
 		case "$ARCH" in
 		i386|x86_64)
-			features+=(avx2 avx bmi2 pclmul sse2)
+			features+=(vpclmulqdq avx512vl avx512f
+				   avx2 avx bmi2 pclmulqdq sse2)
 			;;
 		arm*|aarch*)
 			features+=(dotprod sha3 crc32 pmull neon)

From 513cec0f5597de958c319432a7da510d1b7d7d4c Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 18 Feb 2024 15:15:46 -0800
Subject: [PATCH 2/3] test_util: fix timer_frequency() in direct compilation case

---
 programs/test_util.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/programs/test_util.c b/programs/test_util.c
index 77517e1a..11cd4875 100644
--- a/programs/test_util.c
+++ b/programs/test_util.c
@@ -174,7 +174,9 @@ timer_frequency(void)
 
 	QueryPerformanceFrequency(&freq);
 	return freq.QuadPart;
-#elif defined(HAVE_CLOCK_GETTIME)
+#elif defined(HAVE_CLOCK_GETTIME) || \
+	/* fallback detection method for direct compilation */ \
+	(!defined(HAVE_CONFIG_H) && defined(CLOCK_MONOTONIC))
 	return 1000000000;
 #else
 	return 1000000;

From 6f4b83f523d3aaadd43b9736e87b94e97c7c3837 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 18 Feb 2024 15:36:42 -0800
Subject: [PATCH 3/3] ci.yml: upgrade to microsoft/setup-msbuild@v2

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f758e97a..f738eed4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -117,7 +117,7 @@ jobs:
     runs-on: ${{matrix.os}}
     steps:
     - uses: actions/checkout@v4
-    - uses: microsoft/setup-msbuild@v1.1
+    - uses: microsoft/setup-msbuild@v2
     - run: vcpkg install zlib:${{matrix.vcpkg}}
     - run: >
         echo C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\bin
@@ -145,7 +145,7 @@ jobs:
     runs-on: windows-latest
     steps:
    - uses: actions/checkout@v4
-    - uses: microsoft/setup-msbuild@v1.1
+    - uses: microsoft/setup-msbuild@v2
    # Note: as per the CMake documentation, DESTDIR is unsupported on Windows.
    - run: >
        cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}}
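
For context, the runtime dispatch added in patch 1 (arch_select_crc32_func()) keeps the new VPCLMULQDQ code paths invisible to callers, who continue to use libdeflate's existing public CRC-32 API.  The following is a minimal usage sketch, not part of the patch series; it assumes the public libdeflate_crc32() function declared in libdeflate.h, and the LIBDEFLATE_DISABLE_CPU_FEATURES variable mentioned in the comment only takes effect in builds made with the special test-support flag used by scripts/checksum_benchmarks.sh above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <libdeflate.h>

int
main(void)
{
	static const char data[] = "hello, world";
	uint32_t crc;

	/*
	 * libdeflate_crc32(0, buf, len) computes the CRC-32 of buf.  The
	 * fastest implementation supported by the CPU (e.g. one of the new
	 * VPCLMULQDQ ones) is selected internally at runtime.  In a
	 * test-support build, setting e.g.
	 * LIBDEFLATE_DISABLE_CPU_FEATURES=avx512f,vpclmulqdq in the
	 * environment forces the narrower code paths for comparison, which is
	 * how the benchmark script isolates each implementation.
	 */
	crc = libdeflate_crc32(0, data, strlen(data));
	printf("crc32 = 0x%08x\n", (unsigned)crc);
	return 0;
}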