Skip to content

Commit

Permalink
lib/x86/adler32: add an AVX-VNNI implementation
Browse files Browse the repository at this point in the history
Also revise the AVX512VNNI implementation to be consistent.
  • Loading branch information
ebiggers committed Feb 24, 2024
1 parent a026a04 commit 7582012
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 36 deletions.
151 changes: 124 additions & 27 deletions lib/x86/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,24 +277,115 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
#endif /* HAVE_AVX2_INTRIN */

/*
* AVX512VNNI/AVX512BW implementation. Uses the dot product instruction
* vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
* (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
* AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI
* implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI.
* AVX-VNNI adds dot product instructions to CPUs without AVX-512.
*/
#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN
# define adler32_avx2_vnni adler32_avx2_vnni
# define FUNCNAME adler32_avx2_vnni
# define FUNCNAME_CHUNK adler32_avx2_vnni_chunk
# define IMPL_ALIGNMENT 32
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx2,avxvnni")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <tmmintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>
# include <avxintrin.h>
# include <avx2intrin.h>
# include <avxvnniintrin.h>
# endif
static forceinline ATTRIBUTES void
adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
u32 *s1, u32 *s2)
{
static const u8 _aligned_attribute(32) mults[2][32] = {
{ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, },
{ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, },
};
const __m256i /* __v32qu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]);
const __m256i /* __v32qu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]);
const __m256i zeroes = _mm256_setzero_si256();
__m256i /* __v8su */ v_s1_a = zeroes;
__m256i /* __v8su */ v_s1_b = zeroes;
__m256i /* __v8su */ v_s1_sums_a = zeroes;
__m256i /* __v8su */ v_s1_sums_b = zeroes;
__m256i /* __v8su */ v_s2_a = zeroes;
__m256i /* __v8su */ v_s2_b = zeroes;
__m256i /* __v8su */ v_s2_c = zeroes;
__m256i /* __v8su */ v_s2_d = zeroes;

do {
__m256i bytes_a = *p++;
__m256i bytes_b = *p++;
__m256i bytes_c = *p++;
__m256i bytes_d = *p++;

v_s2_a = _mm256_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
v_s2_b = _mm256_dpbusd_epi32(v_s2_b, bytes_b, mults_b);
v_s2_c = _mm256_dpbusd_epi32(v_s2_c, bytes_c, mults_a);
v_s2_d = _mm256_dpbusd_epi32(v_s2_d, bytes_d, mults_b);
v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_a);
v_s1_sums_b = _mm256_add_epi32(v_s1_sums_b, v_s1_b);
v_s1_a = _mm256_add_epi32(v_s1_a,
_mm256_add_epi32(_mm256_sad_epu8(bytes_a, zeroes),
_mm256_sad_epu8(bytes_b, zeroes)));
v_s1_b = _mm256_add_epi32(v_s1_b,
_mm256_add_epi32(_mm256_sad_epu8(bytes_c, zeroes),
_mm256_sad_epu8(bytes_d, zeroes)));
#if GCC_PREREQ(1, 0)
__asm__("" : "+x" (v_s1_a), "+x" (v_s1_b), "+x" (v_s1_sums_a),
"+x" (v_s1_sums_b), "+x" (v_s2_a), "+x" (v_s2_b),
"+x" (v_s2_c), "+x" (v_s2_d));
#endif
} while (p != end);

v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_sums_b);
v_s1_sums_a = _mm256_slli_epi32(v_s1_sums_a, 7);
v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, _mm256_slli_epi32(v_s1_a, 6));
v_s1_a = _mm256_add_epi32(v_s1_a, v_s1_b);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_b);
v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a);
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */

/*
* AVX512BW/AVX512VNNI implementation. Uses the vpsadbw (sum of absolute
* differences) instruction from AVX512BW and the vpdpbusd (dot product)
* instruction from AVX512VNNI. Note that vpdpbusd with all-ones could be used
* instead of vpsadbw, but vpsadbw is faster.
*
* We currently don't include an implementation using AVX512VNNI or AVX512BW
* We currently don't include an implementation using AVX512BW or AVX512VNNI
* alone, since CPUs with good AVX-512 implementations tend to have both.
*/
#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
# define adler32_avx512vnni_avx512bw adler32_avx512vnni_avx512bw
# define FUNCNAME adler32_avx512vnni_avx512bw
# define FUNCNAME_CHUNK adler32_avx512vnni_avx512bw_chunk
#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
# define adler32_avx512_vnni adler32_avx512_vnni
# define FUNCNAME adler32_avx512_vnni
# define FUNCNAME_CHUNK adler32_avx512_vnni_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN (128 * (0xFFFF / 0xFF))
# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx512vnni,avx512bw")
# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
# endif
# include <immintrin.h>
/*
Expand All @@ -312,8 +403,8 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include <avx512vnniintrin.h>
# endif
static forceinline ATTRIBUTES void
adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
{
static const u8 _aligned_attribute(64) mults[64] = {
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
Expand All @@ -323,7 +414,8 @@ adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
};
const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
const __m512i zeroes = _mm512_setzero_si512();
__m512i /* __v16su */ v_s1 = zeroes;
__m512i /* __v16su */ v_s1_a = zeroes;
__m512i /* __v16su */ v_s1_b = zeroes;
__m512i /* __v16su */ v_s1_sums_a = zeroes;
__m512i /* __v16su */ v_s1_sums_b = zeroes;
__m512i /* __v16su */ v_s2_a = zeroes;
Expand All @@ -332,36 +424,41 @@ adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
do {
const __m512i bytes_a = *p++;
const __m512i bytes_b = *p++;
__m512i v_sad_a, v_sad_b;

v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b);
v_s1_a = _mm512_add_epi32(v_s1_a, _mm512_sad_epu8(bytes_a, zeroes));
v_s1_b = _mm512_add_epi32(v_s1_b, _mm512_sad_epu8(bytes_b, zeroes));
GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b,
v_s2_a, v_s2_b);
} while (p != end);

v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 7);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, _mm512_slli_epi32(v_s1_a, 6));
v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */
#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */

static inline adler32_func_t
arch_select_adler32_func(void)
{
const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef adler32_avx512vnni_avx512bw
#ifdef adler32_avx512_vnni
if ((features & X86_CPU_FEATURE_ZMM) &&
HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
return adler32_avx512vnni_avx512bw;
HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features))
return adler32_avx512_vnni;
#endif
#ifdef adler32_avx2_vnni
if (HAVE_AVX2(features) && HAVE_AVXVNNI(features))
return adler32_avx2_vnni;
#endif
#ifdef adler32_avx2
if (HAVE_AVX2(features))
Expand Down
8 changes: 7 additions & 1 deletion lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
{X86_CPU_FEATURE_AVX512VNNI, "avx512_vnni"},
{X86_CPU_FEATURE_AVXVNNI, "avx_vnni"},
};

volatile u32 libdeflate_x86_cpu_features = 0;
Expand Down Expand Up @@ -182,6 +183,11 @@ void libdeflate_init_x86_cpu_features(void)
if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512VNNI;

/* EAX=7, ECX=1: Extended Features */
cpuid(7, 1, &a, &b, &c, &d);
if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6))
features |= X86_CPU_FEATURE_AVXVNNI;

out:
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
ARRAY_LEN(x86_cpu_feature_table));
Expand Down
21 changes: 18 additions & 3 deletions lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
#define X86_CPU_FEATURE_AVX512VL 0x00000100
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200
#define X86_CPU_FEATURE_AVX512VNNI 0x00000400
#define X86_CPU_FEATURE_AVXVNNI 0x00000800

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
Expand All @@ -67,6 +68,7 @@
#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))
#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI))
#define HAVE_AVXVNNI(features) (HAVE_AVXVNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVXVNNI))

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
Expand Down Expand Up @@ -173,7 +175,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_BMI2_INTRIN 0
#endif

/* AVX-512F */
/* AVX512F */
#ifdef __AVX512F__
# define HAVE_AVX512F_NATIVE 1
#else
Expand All @@ -186,7 +188,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVX512F_INTRIN 0
#endif

/* AVX-512BW */
/* AVX512BW */
#ifdef __AVX512BW__
# define HAVE_AVX512BW_NATIVE 1
#else
Expand All @@ -199,7 +201,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVX512BW_INTRIN 0
#endif

/* AVX-512VL */
/* AVX512VL */
#ifdef __AVX512VL__
# define HAVE_AVX512VL_NATIVE 1
#else
Expand Down Expand Up @@ -238,6 +240,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVX512VNNI_INTRIN 0
#endif

/* AVX-VNNI */
#ifdef __AVXVNNI__
# define HAVE_AVXVNNI_NATIVE 1
#else
# define HAVE_AVXVNNI_NATIVE 0
#endif
#if HAVE_AVXVNNI_NATIVE || GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || \
defined(_MSC_VER)
# define HAVE_AVXVNNI_INTRIN 1
#else
# define HAVE_AVXVNNI_INTRIN 0
#endif

#endif /* ARCH_X86_32 || ARCH_X86_64 */

#endif /* LIB_X86_CPU_FEATURES_H */
10 changes: 7 additions & 3 deletions scripts/checksum_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,15 @@ echo
{
case $ARCH in
i386|x86_64)
if have_cpu_features avx512_vnni avx512bw; then
do_benchmark "AVX512VNNI/AVX512BW"
disable_cpu_feature "avx512vnni" "-mno-avx512vnni"
if have_cpu_features avx512bw avx512_vnni; then
do_benchmark "AVX512BW/AVX512VNNI"
disable_cpu_feature "avx512_vnni" "-mno-avx512vnni"
disable_cpu_feature "avx512bw" "-mno-avx512bw"
fi
if have_cpu_features avx2 avx_vnni; then
do_benchmark "AVX2/AVX-VNNI"
disable_cpu_feature "avx_vnni" "-mno-avxvnni"
fi
if have_cpu_feature avx2; then
do_benchmark "AVX2"
disable_cpu_feature "avx2" "-mno-avx2"
Expand Down
5 changes: 3 additions & 2 deletions scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,9 @@ build_and_run_tests()
if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
case "$ARCH" in
i386|x86_64)
features+=(zmm avx512vnni vpclmulqdq avx512vl avx512bw
avx512f avx2 avx bmi2 pclmulqdq sse2)
features+=(zmm avx_vnni avx512_vnni vpclmulqdq
avx512vl avx512bw avx512f
avx2 avx bmi2 pclmulqdq sse2)
;;
arm*|aarch*)
features+=(dotprod sha3 crc32 pmull neon)
Expand Down

0 comments on commit 7582012

Please sign in to comment.