diff --git a/deps/base64/base64.gyp b/deps/base64/base64.gyp
index be68561708fef0..7bba55dbd9b498 100644
--- a/deps/base64/base64.gyp
+++ b/deps/base64/base64.gyp
@@ -46,6 +46,7 @@
         'HAVE_SSE42=1',
         'HAVE_AVX=1',
         'HAVE_AVX2=1',
+        'HAVE_AVX512=1',
       ],
       'dependencies': [
         'base64_ssse3',
@@ -53,6 +54,7 @@
         'base64_sse42',
         'base64_avx',
         'base64_avx2',
+        'base64_avx512',
       ],
     }, {
       'sources': [
@@ -61,6 +63,7 @@
         'base64/lib/arch/sse42/codec.c',
         'base64/lib/arch/avx/codec.c',
         'base64/lib/arch/avx2/codec.c',
+        'base64/lib/arch/avx512/codec.c',
       ],
     }],
   ],
@@ -162,6 +165,30 @@
     ],
   },
+  {
+    'target_name': 'base64_avx512',
+    'type': 'static_library',
+    'include_dirs': [ 'base64/include', 'base64/lib' ],
+    'sources': [ 'base64/lib/arch/avx512/codec.c' ],
+    'defines': [ 'BASE64_STATIC_DEFINE', 'HAVE_AVX512=1' ],
+    'conditions': [
+      [ 'OS!="win"', {
+        'cflags': [ '-mavx512vl', '-mavx512vbmi' ],
+        'xcode_settings': {
+          'OTHER_CFLAGS': [ '-mavx512vl', '-mavx512vbmi' ]
+        },
+      }, {
+        'msvs_settings': {
+          'VCCLCompilerTool': {
+            'AdditionalOptions': [
+              '/arch:AVX512'
+            ],
+          },
+        },
+      }],
+    ],
+  },
+
   {
     'target_name': 'base64_neon32',
     'type': 'static_library',
diff --git a/deps/base64/base64/CMakeLists.txt b/deps/base64/base64/CMakeLists.txt
index dcca17f6e27b48..56a22a7d7d3bee 100644
--- a/deps/base64/base64/CMakeLists.txt
+++ b/deps/base64/base64/CMakeLists.txt
@@ -17,7 +17,7 @@ if (POLICY CMP0127)
 	cmake_policy(SET CMP0127 NEW)
 endif()
 
-project(base64 LANGUAGES C VERSION 0.4.0)
+project(base64 LANGUAGES C VERSION 0.5.0)
 
 include(GNUInstallDirs)
 include(CMakeDependentOption)
@@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
 cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
+cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
+add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
 
 cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
 add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@@ -118,6 +120,7 @@ add_library(base64
   lib/arch/sse42/codec.c
   lib/arch/avx/codec.c
   lib/arch/avx2/codec.c
+  lib/arch/avx512/codec.c
 
   lib/arch/neon32/codec.c
   lib/arch/neon64/codec.c
@@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
 	configure_codec(SSE42 __SSSE4_2__)
 	configure_codec(AVX)
 	configure_codec(AVX2)
+	configure_codec(AVX512)
 
 elseif (_TARGET_ARCH STREQUAL "arm")
 	set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")
diff --git a/deps/base64/base64/LICENSE b/deps/base64/base64/LICENSE
index 9446393a82a847..109d6521b122c0 100644
--- a/deps/base64/base64/LICENSE
+++ b/deps/base64/base64/LICENSE
@@ -1,7 +1,7 @@
 Copyright (c) 2005-2007, Nick Galbreath
-Copyright (c) 2013-2019, Alfred Klomp
-Copyright (c) 2015-2017, Wojciech Mula
+Copyright (c) 2015-2018, Wojciech Muła
 Copyright (c) 2016-2017, Matthieu Darbois
+Copyright (c) 2013-2022, Alfred Klomp
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/deps/base64/base64/Makefile b/deps/base64/base64/Makefile
index 2bb01e204fcfac..bcb944551ae881 100644
--- a/deps/base64/base64/Makefile
+++ b/deps/base64/base64/Makefile
@@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
 OBJCOPY ?= objcopy
 
 OBJS = \
+  lib/arch/avx512/codec.o \
   lib/arch/avx2/codec.o \
  lib/arch/generic/codec.o \
  lib/arch/neon32/codec.o \
@@ -16,6 +17,7 @@ OBJS = \
  lib/codec_choose.o \
  lib/tables/tables.o
 
+HAVE_AVX512 = 0
 HAVE_AVX2 = 0
 HAVE_NEON32 = 0
 HAVE_NEON64 = 0
@@ -26,6 +28,9 @@ HAVE_AVX = 0
 # The user should supply compiler flags for the codecs they want to build.
 # Check which codecs we're going to include:
 
+ifdef AVX512_CFLAGS
+  HAVE_AVX512 = 1
+endif
 ifdef AVX2_CFLAGS
   HAVE_AVX2 = 1
 endif
@@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
 	$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
 
 lib/config.h:
-	@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
+	@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
+	@echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@
 	@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
 	@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
 	@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
@@ -75,6 +81,7 @@ lib/config.h:
 $(OBJS): lib/config.h
 $(OBJS): CFLAGS += -Ilib
 
+lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
 lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
 lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
 lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)
diff --git a/deps/base64/base64/README.md b/deps/base64/base64/README.md
index b953c324c9dc1e..ae0a914965e101 100644
--- a/deps/base64/base64/README.md
+++ b/deps/base64/base64/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
 
 This is an implementation of a base64 stream encoding/decoding library in C99
-with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
+with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
 [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
 to encode/decode simple length-delimited strings. This library aims to be:
 
@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
 time, which gives a speedup of four or more times compared to the "plain"
 bytewise codec.
 
+AVX512 support is currently limited to encoding, using the AVX512 VL and VBMI
+instructions. Decoding reuses the AVX2 implementation. These instructions are
+supported only by CPUs from Cannon Lake (manufactured in 2018) onwards.
+
 NEON support is hardcoded to on or off at compile time, because portable
 runtime feature detection is unavailable on ARM.
 
@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
 [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
 His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
 
+The AVX512 encoder is based on code from Wojciech Muła's
+[base64simd](https://github.com/WojciechMula/base64simd) library.
+
 The OpenMP implementation was added by Ferry Toth (@htot) from
 [Exalon Delft](http://www.exalondelft.nl).
 
@@ -76,8 +83,8 @@ ## Building
 To compile just the "plain" library without SIMD codecs, type:
 
 ```
 make lib/libbase64.o
 ```
 
-Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
-`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`, +`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. A typical build invocation on x86 looks like this: ```sh @@ -93,6 +100,15 @@ Example: AVX2_CFLAGS=-mavx2 make ``` +### AVX512 + +To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`. +Example: + +```sh +AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make +``` + The codec will only be used if runtime feature detection shows that the target machine supports AVX2. ### SSSE3 @@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way The following constants can be used: - `BASE64_FORCE_AVX2` +- `BASE64_FORCE_AVX512` - `BASE64_FORCE_NEON32` - `BASE64_FORCE_NEON64` - `BASE64_FORCE_PLAIN` @@ -434,7 +451,7 @@ x86 processors | i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* | | i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* | | i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* | -| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243 | 6233 | 4160 | 6344 | +| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243\* | 6233 | 4160\* | 6344 | | Xeon X5570 @ 2.93 GHz | 2161 | 1508 | 3160 | 3915 | - | - | - | - | | Pentium4 @ 3.4 GHz | 896 | 740 | - | - | - | - | - | - | | Atom N270 | 243 | 266 | 508 | 387 | - | - | - | - | diff --git a/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake b/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake index ba1f6e51815eec..485080905319a6 100644 --- a/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake +++ b/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake @@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 "-msse4.2") set(COMPILE_FLAGS_AVX "-mavx") set(COMPILE_FLAGS_AVX2 "-mavx2") + set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi") #arm set(COMPILE_FLAGS_NEON32 "-mfpu=neon") @@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 " ") set(COMPILE_FLAGS_AVX "/arch:AVX") set(COMPILE_FLAGS_AVX2 "/arch:AVX2") + set(COMPILE_FLAGS_AVX512 "/arch:AVX512") endif() endmacro(define_SIMD_compile_flags) diff --git a/deps/base64/base64/cmake/config.h.in b/deps/base64/base64/cmake/config.h.in index 8530d1e13d4801..c7faa94bc0903d 100644 --- a/deps/base64/base64/cmake/config.h.in +++ b/deps/base64/base64/cmake/config.h.in @@ -16,6 +16,9 @@ #cmakedefine01 BASE64_WITH_AVX2 #define HAVE_AVX2 BASE64_WITH_AVX2 +#cmakedefine01 BASE64_WITH_AVX512 +#define HAVE_AVX512 BASE64_WITH_AVX512 + #cmakedefine01 BASE64_WITH_NEON32 #define HAVE_NEON32 BASE64_WITH_NEON32 diff --git a/deps/base64/base64/include/libbase64.h b/deps/base64/base64/include/libbase64.h index d470a82f1028dc..c5908973c5e73c 100644 --- a/deps/base64/base64/include/libbase64.h +++ b/deps/base64/base64/include/libbase64.h @@ -53,6 +53,7 @@ extern "C" { #define BASE64_FORCE_SSE41 (1 << 5) #define BASE64_FORCE_SSE42 (1 << 6) #define BASE64_FORCE_AVX (1 << 7) +#define BASE64_FORCE_AVX512 (1 << 8) struct base64_state { int eof; diff --git a/deps/base64/base64/lib/arch/avx/codec.c b/deps/base64/base64/lib/arch/avx/codec.c index a7a963d8358918..83cf3098f3f554 100644 --- 
a/deps/base64/base64/lib/arch/avx/codec.c +++ b/deps/base64/base64/lib/arch/avx/codec.c @@ -11,11 +11,25 @@ #if HAVE_AVX #include +// Only enable inline assembly on supported compilers. +#ifndef BASE64_AVX_USE_ASM +# if defined(__GNUC__) || defined(__clang__) +# define BASE64_AVX_USE_ASM 1 +# else +# define BASE64_AVX_USE_ASM 0 +# endif +#endif + #include "../ssse3/dec_reshuffle.c" #include "../ssse3/dec_loop.c" -#include "../ssse3/enc_translate.c" -#include "../ssse3/enc_reshuffle.c" -#include "../ssse3/enc_loop.c" + +#if BASE64_AVX_USE_ASM +# include "enc_loop_asm.c" +#else +# include "../ssse3/enc_translate.c" +# include "../ssse3/enc_reshuffle.c" +# include "../ssse3/enc_loop.c" +#endif #endif // HAVE_AVX @@ -23,7 +37,17 @@ BASE64_ENC_FUNCTION(avx) { #if HAVE_AVX #include "../generic/enc_head.c" + + // For supported compilers, use a hand-optimized inline assembly + // encoder. Otherwise fall back on the SSSE3 encoder, but compiled with + // AVX flags to generate better optimized AVX code. + +#if BASE64_AVX_USE_ASM + enc_loop_avx(&s, &slen, &o, &olen); +#else enc_loop_ssse3(&s, &slen, &o, &olen); +#endif + #include "../generic/enc_tail.c" #else BASE64_ENC_STUB diff --git a/deps/base64/base64/lib/arch/avx/enc_loop_asm.c b/deps/base64/base64/lib/arch/avx/enc_loop_asm.c new file mode 100644 index 00000000000000..27f1e88e12f590 --- /dev/null +++ b/deps/base64/base64/lib/arch/avx/enc_loop_asm.c @@ -0,0 +1,259 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads register R0 from memory. The +// offset at which the register is loaded is set by the given round. +#define LOAD(R0, ROUND) \ + "vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t" + +// Generate a block of inline assembly that deinterleaves and shuffles register +// R0 using preloaded constants. Outputs in R0 and R1. +#define SHUF(R0, R1, R2) \ + "vpshufb %[lut0], %["R0"], %["R1"] \n\t" \ + "vpand %["R1"], %[msk0], %["R2"] \n\t" \ + "vpand %["R1"], %[msk2], %["R1"] \n\t" \ + "vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \ + "vpmullw %["R1"], %[msk3], %["R1"] \n\t" \ + "vpor %["R1"], %["R2"], %["R1"] \n\t" + +// Generate a block of inline assembly that takes R0 and R1 and translates +// their contents to the base64 alphabet, using preloaded constants. +#define TRAN(R0, R1, R2) \ + "vpsubusb %[n51], %["R1"], %["R0"] \n\t" \ + "vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \ + "vpsubb %["R2"], %["R0"], %["R0"] \n\t" \ + "vpshufb %["R0"], %[lut1], %["R2"] \n\t" \ + "vpaddb %["R1"], %["R2"], %["R0"] \n\t" + +// Generate a block of inline assembly that stores the given register R0 at an +// offset set by the given round. +#define STOR(R0, ROUND) \ + "vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t" + +// Generate a block of inline assembly that generates a single self-contained +// encoder round: fetch the data, process it, and store the result. Then update +// the source and destination pointers. +#define ROUND() \ + LOAD("a", 0) \ + SHUF("a", "b", "c") \ + TRAN("a", "b", "c") \ + STOR("a", 0) \ + "add $12, %[src] \n\t" \ + "add $16, %[dst] \n\t" + +// Define a macro that initiates a three-way interleaved encoding round by +// preloading registers a, b and c from memory. 
+// The register graph shows which registers are in use during each step, and +// is a visual aid for choosing registers for that step. Symbol index: +// +// + indicates that a register is loaded by that step. +// | indicates that a register is in use and must not be touched. +// - indicates that a register is decommissioned by that step. +// x indicates that a register is used as a temporary by that step. +// V indicates that a register is an input or output to the macro. +// +#define ROUND_3_INIT() /* a b c d e f */ \ + LOAD("a", 0) /* + */ \ + SHUF("a", "d", "e") /* | + x */ \ + LOAD("b", 1) /* | + | */ \ + TRAN("a", "d", "e") /* | | - x */ \ + LOAD("c", 2) /* V V V */ + +// Define a macro that translates, shuffles and stores the input registers A, B +// and C, and preloads registers D, E and F for the next round. +// This macro can be arbitrarily daisy-chained by feeding output registers D, E +// and F back into the next round as input registers A, B and C. The macro +// carefully interleaves memory operations with data operations for optimal +// pipelined performance. + +#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + LOAD(D, (ROUND + 3)) /* V V V + */ \ + SHUF(B, E, F) /* | | | | + x */ \ + STOR(A, (ROUND + 0)) /* - | | | | */ \ + TRAN(B, E, F) /* | | | - x */ \ + LOAD(E, (ROUND + 4)) /* | | | + */ \ + SHUF(C, A, F) /* + | | | | x */ \ + STOR(B, (ROUND + 1)) /* | - | | | */ \ + TRAN(C, A, F) /* - | | | x */ \ + LOAD(F, (ROUND + 5)) /* | | | + */ \ + SHUF(D, A, B) /* + + | | | | */ \ + STOR(C, (ROUND + 2)) /* | | - | | | */ \ + TRAN(D, A, B) /* - - V V V */ + +// Define a macro that terminates a ROUND_3 macro by taking pre-loaded +// registers D, E and F, and translating, shuffling and storing them. +#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + SHUF(E, A, B) /* + x V V V */ \ + STOR(D, (ROUND + 3)) /* | - | | */ \ + TRAN(E, A, B) /* - x | | */ \ + SHUF(F, C, D) /* + x | | */ \ + STOR(E, (ROUND + 4)) /* | - | */ \ + TRAN(F, C, D) /* - x | */ \ + STOR(F, (ROUND + 5)) /* - */ + +// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f. +#define ROUND_3_A(ROUND) \ + ROUND_3(ROUND, "a", "b", "c", "d", "e", "f") + +// Define a type B round. Inputs and outputs are swapped with regard to type A. +#define ROUND_3_B(ROUND) \ + ROUND_3(ROUND, "d", "e", "f", "a", "b", "c") + +// Terminating macro for a type A round. +#define ROUND_3_A_LAST(ROUND) \ + ROUND_3_A(ROUND) \ + ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f") + +// Terminating macro for a type B round. +#define ROUND_3_B_LAST(ROUND) \ + ROUND_3_B(ROUND) \ + ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c") + +// Suppress clang's warning that the literal string in the asm statement is +// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 +// compilers). It may be true, but the goal here is not C99 portability. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" + +static inline void +enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + // For a clearer explanation of the algorithm used by this function, + // please refer to the plain (not inline assembly) implementation. This + // function follows the same basic logic. + + if (*slen < 16) { + return; + } + + // Process blocks of 12 bytes at a time. + size_t rounds = *slen / 12; + + *slen -= rounds * 12; // 12 bytes consumed per round + *olen += rounds * 16; // 16 bytes produced per round + + // Number of times to go through the 36x loop. 
+ size_t loops = rounds / 36; + + // Number of rounds remaining after the 36x loop. + rounds %= 36; + + // Lookup tables. + const __m128i lut0 = _mm_set_epi8( + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + + const __m128i lut1 = _mm_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Temporary registers. + __m128i a, b, c, d, e, f; + + __asm__ volatile ( + + // If there are 36 rounds or more, enter a 36x unrolled loop of + // interleaved encoding rounds. The rounds interleave memory + // operations (load/store) with data operations (table lookups, + // etc) to maximize pipeline throughput. + " test %[loops], %[loops] \n\t" + " jz 18f \n\t" + " jmp 36f \n\t" + " \n\t" + ".balign 64 \n\t" + "36: " ROUND_3_INIT() + " " ROUND_3_A( 0) + " " ROUND_3_B( 3) + " " ROUND_3_A( 6) + " " ROUND_3_B( 9) + " " ROUND_3_A(12) + " " ROUND_3_B(15) + " " ROUND_3_A(18) + " " ROUND_3_B(21) + " " ROUND_3_A(24) + " " ROUND_3_B(27) + " " ROUND_3_A_LAST(30) + " add $(12 * 36), %[src] \n\t" + " add $(16 * 36), %[dst] \n\t" + " dec %[loops] \n\t" + " jnz 36b \n\t" + + // Enter an 18x unrolled loop for rounds of 18 or more. + "18: cmp $18, %[rounds] \n\t" + " jl 9f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B(3) + " " ROUND_3_A(6) + " " ROUND_3_B(9) + " " ROUND_3_A_LAST(12) + " sub $18, %[rounds] \n\t" + " add $(12 * 18), %[src] \n\t" + " add $(16 * 18), %[dst] \n\t" + + // Enter a 9x unrolled loop for rounds of 9 or more. + "9: cmp $9, %[rounds] \n\t" + " jl 6f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B_LAST(3) + " sub $9, %[rounds] \n\t" + " add $(12 * 9), %[src] \n\t" + " add $(16 * 9), %[dst] \n\t" + + // Enter a 6x unrolled loop for rounds of 6 or more. + "6: cmp $6, %[rounds] \n\t" + " jl 55f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A_LAST(0) + " sub $6, %[rounds] \n\t" + " add $(12 * 6), %[src] \n\t" + " add $(16 * 6), %[dst] \n\t" + + // Dispatch the remaining rounds 0..5. + "55: cmp $3, %[rounds] \n\t" + " jg 45f \n\t" + " je 3f \n\t" + " cmp $1, %[rounds] \n\t" + " jg 2f \n\t" + " je 1f \n\t" + " jmp 0f \n\t" + + "45: cmp $4, %[rounds] \n\t" + " je 4f \n\t" + + // Block of non-interlaced encoding rounds, which can each + // individually be jumped to. Rounds fall through to the next. + "5: " ROUND() + "4: " ROUND() + "3: " ROUND() + "2: " ROUND() + "1: " ROUND() + "0: \n\t" + + // Outputs (modified). + : [rounds] "+r" (rounds), + [loops] "+r" (loops), + [src] "+r" (*s), + [dst] "+r" (*o), + [a] "=&x" (a), + [b] "=&x" (b), + [c] "=&x" (c), + [d] "=&x" (d), + [e] "=&x" (e), + [f] "=&x" (f) + + // Inputs (not modified). + : [lut0] "x" (lut0), + [lut1] "x" (lut1), + [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)), + [msk1] "x" (_mm_set1_epi32(0x04000040)), + [msk2] "x" (_mm_set1_epi32(0x003F03F0)), + [msk3] "x" (_mm_set1_epi32(0x01000010)), + [n51] "x" (_mm_set1_epi8(51)), + [n25] "x" (_mm_set1_epi8(25)) + ); +} + +#pragma GCC diagnostic pop diff --git a/deps/base64/base64/lib/arch/avx2/codec.c b/deps/base64/base64/lib/arch/avx2/codec.c index 0498548b80d286..150e530a3dd553 100644 --- a/deps/base64/base64/lib/arch/avx2/codec.c +++ b/deps/base64/base64/lib/arch/avx2/codec.c @@ -11,11 +11,25 @@ #if HAVE_AVX2 #include +// Only enable inline assembly on supported compilers. 
+#ifndef BASE64_AVX2_USE_ASM +# if defined(__GNUC__) || defined(__clang__) +# define BASE64_AVX2_USE_ASM 1 +# else +# define BASE64_AVX2_USE_ASM 0 +# endif +#endif + #include "dec_reshuffle.c" #include "dec_loop.c" -#include "enc_translate.c" -#include "enc_reshuffle.c" -#include "enc_loop.c" + +#if BASE64_AVX2_USE_ASM +# include "enc_loop_asm.c" +#else +# include "enc_translate.c" +# include "enc_reshuffle.c" +# include "enc_loop.c" +#endif #endif // HAVE_AVX2 diff --git a/deps/base64/base64/lib/arch/avx2/enc_loop_asm.c b/deps/base64/base64/lib/arch/avx2/enc_loop_asm.c new file mode 100644 index 00000000000000..2338b1c1c40a23 --- /dev/null +++ b/deps/base64/base64/lib/arch/avx2/enc_loop_asm.c @@ -0,0 +1,288 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads register R0 from memory. The +// offset at which the register is loaded is set by the given round and a +// constant offset. +#define LOAD(R0, ROUND, OFFSET) \ + "vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t" + +// Generate a block of inline assembly that deinterleaves and shuffles register +// R0 using preloaded constants. Outputs in R0 and R1. +#define SHUF(R0, R1, R2) \ + "vpshufb %[lut0], %["R0"], %["R1"] \n\t" \ + "vpand %["R1"], %[msk0], %["R2"] \n\t" \ + "vpand %["R1"], %[msk2], %["R1"] \n\t" \ + "vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \ + "vpmullw %["R1"], %[msk3], %["R1"] \n\t" \ + "vpor %["R1"], %["R2"], %["R1"] \n\t" + +// Generate a block of inline assembly that takes R0 and R1 and translates +// their contents to the base64 alphabet, using preloaded constants. +#define TRAN(R0, R1, R2) \ + "vpsubusb %[n51], %["R1"], %["R0"] \n\t" \ + "vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \ + "vpsubb %["R2"], %["R0"], %["R0"] \n\t" \ + "vpshufb %["R0"], %[lut1], %["R2"] \n\t" \ + "vpaddb %["R1"], %["R2"], %["R0"] \n\t" + +// Generate a block of inline assembly that stores the given register R0 at an +// offset set by the given round. +#define STOR(R0, ROUND) \ + "vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t" + +// Generate a block of inline assembly that generates a single self-contained +// encoder round: fetch the data, process it, and store the result. Then update +// the source and destination pointers. +#define ROUND() \ + LOAD("a", 0, -4) \ + SHUF("a", "b", "c") \ + TRAN("a", "b", "c") \ + STOR("a", 0) \ + "add $24, %[src] \n\t" \ + "add $32, %[dst] \n\t" + +// Define a macro that initiates a three-way interleaved encoding round by +// preloading registers a, b and c from memory. +// The register graph shows which registers are in use during each step, and +// is a visual aid for choosing registers for that step. Symbol index: +// +// + indicates that a register is loaded by that step. +// | indicates that a register is in use and must not be touched. +// - indicates that a register is decommissioned by that step. +// x indicates that a register is used as a temporary by that step. +// V indicates that a register is an input or output to the macro. 
+// +#define ROUND_3_INIT() /* a b c d e f */ \ + LOAD("a", 0, -4) /* + */ \ + SHUF("a", "d", "e") /* | + x */ \ + LOAD("b", 1, -4) /* | + | */ \ + TRAN("a", "d", "e") /* | | - x */ \ + LOAD("c", 2, -4) /* V V V */ + +// Define a macro that translates, shuffles and stores the input registers A, B +// and C, and preloads registers D, E and F for the next round. +// This macro can be arbitrarily daisy-chained by feeding output registers D, E +// and F back into the next round as input registers A, B and C. The macro +// carefully interleaves memory operations with data operations for optimal +// pipelined performance. + +#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + LOAD(D, (ROUND + 3), -4) /* V V V + */ \ + SHUF(B, E, F) /* | | | | + x */ \ + STOR(A, (ROUND + 0)) /* - | | | | */ \ + TRAN(B, E, F) /* | | | - x */ \ + LOAD(E, (ROUND + 4), -4) /* | | | + */ \ + SHUF(C, A, F) /* + | | | | x */ \ + STOR(B, (ROUND + 1)) /* | - | | | */ \ + TRAN(C, A, F) /* - | | | x */ \ + LOAD(F, (ROUND + 5), -4) /* | | | + */ \ + SHUF(D, A, B) /* + + | | | | */ \ + STOR(C, (ROUND + 2)) /* | | - | | | */ \ + TRAN(D, A, B) /* - - V V V */ + +// Define a macro that terminates a ROUND_3 macro by taking pre-loaded +// registers D, E and F, and translating, shuffling and storing them. +#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + SHUF(E, A, B) /* + x V V V */ \ + STOR(D, (ROUND + 3)) /* | - | | */ \ + TRAN(E, A, B) /* - x | | */ \ + SHUF(F, C, D) /* + x | | */ \ + STOR(E, (ROUND + 4)) /* | - | */ \ + TRAN(F, C, D) /* - x | */ \ + STOR(F, (ROUND + 5)) /* - */ + +// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f. +#define ROUND_3_A(ROUND) \ + ROUND_3(ROUND, "a", "b", "c", "d", "e", "f") + +// Define a type B round. Inputs and outputs are swapped with regard to type A. +#define ROUND_3_B(ROUND) \ + ROUND_3(ROUND, "d", "e", "f", "a", "b", "c") + +// Terminating macro for a type A round. +#define ROUND_3_A_LAST(ROUND) \ + ROUND_3_A(ROUND) \ + ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f") + +// Terminating macro for a type B round. +#define ROUND_3_B_LAST(ROUND) \ + ROUND_3_B(ROUND) \ + ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c") + +// Suppress clang's warning that the literal string in the asm statement is +// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 +// compilers). It may be true, but the goal here is not C99 portability. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" + +static inline void +enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + // For a clearer explanation of the algorithm used by this function, + // please refer to the plain (not inline assembly) implementation. This + // function follows the same basic logic. + + if (*slen < 32) { + return; + } + + // Process blocks of 24 bytes at a time. Because blocks are loaded 32 + // bytes at a time an offset of -4, ensure that there will be at least + // 4 remaining bytes after the last round, so that the final read will + // not pass beyond the bounds of the input buffer. + size_t rounds = (*slen - 4) / 24; + + *slen -= rounds * 24; // 24 bytes consumed per round + *olen += rounds * 32; // 32 bytes produced per round + + // Pre-decrement the number of rounds to get the number of rounds + // *after* the first round, which is handled as a special case. + rounds--; + + // Number of times to go through the 36x loop. + size_t loops = rounds / 36; + + // Number of rounds remaining after the 36x loop. 
+ rounds %= 36; + + // Lookup tables. + const __m256i lut0 = _mm256_set_epi8( + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + 14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5); + + const __m256i lut1 = _mm256_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Temporary registers. + __m256i a, b, c, d, e; + + // Temporary register f doubles as the shift mask for the first round. + __m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6); + + __asm__ volatile ( + + // The first loop iteration requires special handling to ensure + // that the read, which is normally done at an offset of -4, + // does not underflow the buffer. Load the buffer at an offset + // of 0 and permute the input to achieve the same effect. + LOAD("a", 0, 0) + "vpermd %[a], %[f], %[a] \n\t" + + // Perform the standard shuffling and translation steps. + SHUF("a", "b", "c") + TRAN("a", "b", "c") + + // Store the result and increment the source and dest pointers. + "vmovdqu %[a], (%[dst]) \n\t" + "add $24, %[src] \n\t" + "add $32, %[dst] \n\t" + + // If there are 36 rounds or more, enter a 36x unrolled loop of + // interleaved encoding rounds. The rounds interleave memory + // operations (load/store) with data operations (table lookups, + // etc) to maximize pipeline throughput. + " test %[loops], %[loops] \n\t" + " jz 18f \n\t" + " jmp 36f \n\t" + " \n\t" + ".balign 64 \n\t" + "36: " ROUND_3_INIT() + " " ROUND_3_A( 0) + " " ROUND_3_B( 3) + " " ROUND_3_A( 6) + " " ROUND_3_B( 9) + " " ROUND_3_A(12) + " " ROUND_3_B(15) + " " ROUND_3_A(18) + " " ROUND_3_B(21) + " " ROUND_3_A(24) + " " ROUND_3_B(27) + " " ROUND_3_A_LAST(30) + " add $(24 * 36), %[src] \n\t" + " add $(32 * 36), %[dst] \n\t" + " dec %[loops] \n\t" + " jnz 36b \n\t" + + // Enter an 18x unrolled loop for rounds of 18 or more. + "18: cmp $18, %[rounds] \n\t" + " jl 9f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B(3) + " " ROUND_3_A(6) + " " ROUND_3_B(9) + " " ROUND_3_A_LAST(12) + " sub $18, %[rounds] \n\t" + " add $(24 * 18), %[src] \n\t" + " add $(32 * 18), %[dst] \n\t" + + // Enter a 9x unrolled loop for rounds of 9 or more. + "9: cmp $9, %[rounds] \n\t" + " jl 6f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B_LAST(3) + " sub $9, %[rounds] \n\t" + " add $(24 * 9), %[src] \n\t" + " add $(32 * 9), %[dst] \n\t" + + // Enter a 6x unrolled loop for rounds of 6 or more. + "6: cmp $6, %[rounds] \n\t" + " jl 55f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A_LAST(0) + " sub $6, %[rounds] \n\t" + " add $(24 * 6), %[src] \n\t" + " add $(32 * 6), %[dst] \n\t" + + // Dispatch the remaining rounds 0..5. + "55: cmp $3, %[rounds] \n\t" + " jg 45f \n\t" + " je 3f \n\t" + " cmp $1, %[rounds] \n\t" + " jg 2f \n\t" + " je 1f \n\t" + " jmp 0f \n\t" + + "45: cmp $4, %[rounds] \n\t" + " je 4f \n\t" + + // Block of non-interlaced encoding rounds, which can each + // individually be jumped to. Rounds fall through to the next. + "5: " ROUND() + "4: " ROUND() + "3: " ROUND() + "2: " ROUND() + "1: " ROUND() + "0: \n\t" + + // Outputs (modified). + : [rounds] "+r" (rounds), + [loops] "+r" (loops), + [src] "+r" (*s), + [dst] "+r" (*o), + [a] "=&x" (a), + [b] "=&x" (b), + [c] "=&x" (c), + [d] "=&x" (d), + [e] "=&x" (e), + [f] "+x" (f) + + // Inputs (not modified). 
+	: [lut0] "x" (lut0),
+	  [lut1] "x" (lut1),
+	  [msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
+	  [msk1] "x" (_mm256_set1_epi32(0x04000040)),
+	  [msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
+	  [msk3] "x" (_mm256_set1_epi32(0x01000010)),
+	  [n51] "x" (_mm256_set1_epi8(51)),
+	  [n25] "x" (_mm256_set1_epi8(25))
+	);
+}
+
+#pragma GCC diagnostic pop
diff --git a/deps/base64/base64/lib/arch/avx512/codec.c b/deps/base64/base64/lib/arch/avx512/codec.c
new file mode 100644
index 00000000000000..664120853d4316
--- /dev/null
+++ b/deps/base64/base64/lib/arch/avx512/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX512
+#include <immintrin.h>
+
+#include "../avx2/dec_reshuffle.c"
+#include "../avx2/dec_loop.c"
+#include "enc_reshuffle_translate.c"
+#include "enc_loop.c"
+
+#endif	// HAVE_AVX512
+
+BASE64_ENC_FUNCTION(avx512)
+{
+#if HAVE_AVX512
+	#include "../generic/enc_head.c"
+	enc_loop_avx512(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+// Reuse the AVX2 decoder; a dedicated AVX512 decoder is not implemented at present.
+BASE64_DEC_FUNCTION(avx512)
+{
+#if HAVE_AVX512
+	#include "../generic/dec_head.c"
+	dec_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
diff --git a/deps/base64/base64/lib/arch/avx512/enc_loop.c b/deps/base64/base64/lib/arch/avx512/enc_loop.c
new file mode 100644
index 00000000000000..4c71e160ae820a
--- /dev/null
+++ b/deps/base64/base64/lib/arch/avx512/enc_loop.c
@@ -0,0 +1,61 @@
+static inline void
+enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input.
+	__m512i src = _mm512_loadu_si512((__m512i *) *s);
+
+	// Reshuffle, translate, store.
+	src = enc_reshuffle_translate(src);
+	_mm512_storeu_si512((__m512i *) *o, src);
+
+	*s += 48;
+	*o += 64;
+}
+
+static inline void
+enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 64) {
+		return;
+	}
+
+	// Process blocks of 48 bytes at a time. Because blocks are loaded 64
+	// bytes at a time, ensure that there will be at least 24 remaining
+	// bytes after the last round, so that the final read will not pass
+	// beyond the bounds of the input buffer.
+	size_t rounds = (*slen - 24) / 48;
+
+	*slen -= rounds * 48;	// 48 bytes consumed per round
+	*olen += rounds * 64;	// 64 bytes produced per round
+
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_avx512_inner(s, o);
+		break;
+	}
+}
diff --git a/deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c b/deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c
new file mode 100644
index 00000000000000..5c332bb24ca4e3
--- /dev/null
+++ b/deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c
@@ -0,0 +1,50 @@
+// The AVX512 algorithm is based on permutevar and multishift. The code is based on
+// https://github.com/WojciechMula/base64simd, which is under the BSD-2 license.
+
+static inline __m512i
+enc_reshuffle_translate (const __m512i input)
+{
+	// 32-bit input
+	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
+	//   b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
+	// output order [1, 2, 0, 1]
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//   a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
+
+	const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
+	                                                0x04050304,
+	                                                0x07080607,
+	                                                0x0a0b090a,
+	                                                0x0d0e0c0d,
+	                                                0x10110f10,
+	                                                0x13141213,
+	                                                0x16171516,
+	                                                0x191a1819,
+	                                                0x1c1d1b1c,
+	                                                0x1f201e1f,
+	                                                0x22232122,
+	                                                0x25262425,
+	                                                0x28292728,
+	                                                0x2b2c2a2b,
+	                                                0x2e2f2d2e);
+
+	// Reorder bytes
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//   a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
+	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
+
+	// After the multishift, a single 32-bit lane has the following layout:
+	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
+	//   a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
+	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
+
+	// 48, 54, 36, 42, 16, 22, 4, 10
+	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
+	__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
+
+	// Translate immediately after reshuffling.
+	const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
+
+	// Translate the 6-bit values to ASCII.
+	return _mm512_permutexvar_epi8(shuffled_in, lookup);
+}
diff --git a/deps/base64/base64/lib/codec_choose.c b/deps/base64/base64/lib/codec_choose.c
index 6a07d6a74cc24f..ae4e5b5a257a88 100644
--- a/deps/base64/base64/lib/codec_choose.c
+++ b/deps/base64/base64/lib/codec_choose.c
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 
 #include "../include/libbase64.h"
 #include "codecs.h"
@@ -10,7 +11,7 @@
 
 #if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
 #define BASE64_X86
-	#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
+	#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
 		#define BASE64_X86_SIMD
 	#endif
 #endif
@@ -31,7 +32,7 @@
 		__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
 #else
 	#include <cpuid.h>
-	#if HAVE_AVX2 || HAVE_AVX
+	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
 		#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
 		static inline uint64_t _xgetbv (uint32_t index)
 		{
@@ -45,6 +46,12 @@
 	#endif
 #endif
 
+#ifndef bit_AVX512vl
+#define bit_AVX512vl (1 << 31)
+#endif
+#ifndef bit_AVX512vbmi
+#define bit_AVX512vbmi (1 << 1)
+#endif
 #ifndef bit_AVX2
 #define bit_AVX2 (1 << 5)
 #endif
@@ -75,6 +82,7 @@
 	BASE64_ENC_FUNCTION(arch);	\
 	BASE64_DEC_FUNCTION(arch);	\
 
+BASE64_CODEC_FUNCS(avx512)
 BASE64_CODEC_FUNCS(avx2)
 BASE64_CODEC_FUNCS(neon32)
 BASE64_CODEC_FUNCS(neon64)
@@ -91,9 +99,10 @@ codec_choose_forced (struct codec *codec, int flags)
 	// always allow it, even if the codec is a no-op.
 	// For testing purposes.
- if (!(flags & 0xFF)) { + if (!(flags & 0xFFFF)) { return false; } + if (flags & BASE64_FORCE_AVX2) { codec->enc = base64_stream_encode_avx2; codec->dec = base64_stream_decode_avx2; @@ -134,6 +143,11 @@ codec_choose_forced (struct codec *codec, int flags) codec->dec = base64_stream_decode_avx; return true; } + if (flags & BASE64_FORCE_AVX512) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } return false; } @@ -178,8 +192,8 @@ codec_choose_x86 (struct codec *codec) max_level = __get_cpuid_max(0, NULL); #endif - #if HAVE_AVX2 || HAVE_AVX - // Check for AVX/AVX2 support: + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX + // Check for AVX/AVX2/AVX512 support: // Checking for AVX requires 3 things: // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions // (allowing saving YMM registers on context switch) @@ -195,6 +209,16 @@ codec_choose_x86 (struct codec *codec) uint64_t xcr_mask; xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { + #if HAVE_AVX512 + if (max_level >= 7) { + __cpuid_count(7, 0, eax, ebx, ecx, edx); + if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } + } + #endif #if HAVE_AVX2 if (max_level >= 7) { __cpuid_count(7, 0, eax, ebx, ecx, edx); diff --git a/deps/base64/base64/lib/lib.c b/deps/base64/base64/lib/lib.c index 4703512b87ab46..053931a9918b2b 100644 --- a/deps/base64/base64/lib/lib.c +++ b/deps/base64/base64/lib/lib.c @@ -68,7 +68,7 @@ void base64_stream_decode_init (struct base64_state *state, int flags) { // If any of the codec flags are set, redo choice: - if (codec.dec == NULL || flags & 0xFF) { + if (codec.dec == NULL || flags & 0xFFFF) { codec_choose(&codec, flags); } state->eof = 0; diff --git a/deps/base64/base64/test/ci/test.sh b/deps/base64/base64/test/ci/test.sh index 066a49f400b95c..fb188418cca0a9 100755 --- a/deps/base64/base64/test/ci/test.sh +++ b/deps/base64/base64/test/ci/test.sh @@ -7,9 +7,11 @@ if [ "${MACHINE}" == "x86_64" ]; then export SSE41_CFLAGS=-msse4.1 export SSE42_CFLAGS=-msse4.2 export AVX_CFLAGS=-mavx - # no AVX2 on GHA macOS + # no AVX2 or AVX512 on GHA macOS if [ "$(uname -s)" != "Darwin" ]; then export AVX2_CFLAGS=-mavx2 + # Temporarily disable AVX512; it is not available in CI yet. + # export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" fi elif [ "${MACHINE}" == "aarch64" ]; then export NEON64_CFLAGS="-march=armv8-a" diff --git a/deps/base64/base64/test/codec_supported.c b/deps/base64/base64/test/codec_supported.c index a027b9943bf8ed..f68c766875abcd 100644 --- a/deps/base64/base64/test/codec_supported.c +++ b/deps/base64/base64/test/codec_supported.c @@ -11,6 +11,7 @@ static char *_codecs[] = , "SSE41" , "SSE42" , "AVX" +, "AVX512" , NULL } ; diff --git a/deps/base64/base64/test/test_base64.c b/deps/base64/base64/test/test_base64.c index bec52d146c824a..ee8d706860ac98 100644 --- a/deps/base64/base64/test/test_base64.c +++ b/deps/base64/base64/test/test_base64.c @@ -198,6 +198,11 @@ test_streaming (int flags) while (base64_stream_decode(&state, &ref[inpos], (inpos + bs > reflen) ? reflen - inpos : bs, &enc[enclen], &partlen)) { enclen += partlen; inpos += bs; + + // Has the entire buffer been consumed? + if (inpos >= 400) { + break; + } } if (enclen != 256) { printf("FAIL: stream decoding gave incorrect size: "