Skip to content

Commit

Permalink
deps: update zlib to 1.3.0.1-motley-24c07df
Browse files Browse the repository at this point in the history
PR-URL: #52199
Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
  • Loading branch information
nodejs-github-bot authored and richardlau committed May 16, 2024
1 parent 755399d commit 1152d7f
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 25 deletions.
46 changes: 32 additions & 14 deletions deps/zlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)

SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto")
endif()

if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
add_definitions(-DRISCV_RVV)
add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
add_definitions(-DADLER32_SIMD_RVV)
#TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
# Required by CPU features detection code.
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")
endif()

endif()

#
Expand Down Expand Up @@ -180,20 +190,28 @@ set(ZLIB_SRCS
# Update list of source files if optimizations were enabled
#============================================================================
if (ENABLE_SIMD_OPTIMIZATIONS)
list(REMOVE_ITEM ZLIB_SRCS inflate.c)

list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)

list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
message("RISCVV: Add optimizations.")
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
else()
list(REMOVE_ITEM ZLIB_SRCS inflate.c)

list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)

list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
endif()
endif()

# parse the full version number from zlib.h and include in ZLIB_FULL_VERSION
Expand Down
13 changes: 9 additions & 4 deletions deps/zlib/adler32.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,24 @@
#endif

#include "cpu_features.h"
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV)
#include "adler32_simd.h"
#endif

/* ========================================================================= */
uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
unsigned long sum2;
unsigned n;

/* TODO(cavalcantii): verify if this lengths are optimal for current CPUs. */
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
|| defined(ADLER32_SIMD_RVV)
#if defined(ADLER32_SIMD_SSSE3)
if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
return adler32_simd_(adler, buf, len);
#elif defined(ADLER32_SIMD_NEON)
if (buf != Z_NULL && len >= 64)
#elif defined(ADLER32_SIMD_RVV)
if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv)
#endif
return adler32_simd_(adler, buf, len);
#endif

Expand All @@ -90,7 +94,8 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
return adler | (sum2 << 16);
}

#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
|| defined(RISCV_RVV)
/*
* Use SIMD to compute the adler32. Since this function can be
* freely used, check CPU features here. zlib convention is to
Expand Down
104 changes: 104 additions & 0 deletions deps/zlib/adler32_simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "adler32_simd.h"

Expand Down Expand Up @@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
return s1 | (s2 << 16);
}

#elif defined(ADLER32_SIMD_RVV)
#include <riscv_vector.h>
/* adler32_rvv.c - RVV version of Adler-32
* RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
* on https://github.com/zlib-ng/zlib-ng/pull/1532
* Port from Simon Hosie's fork:
* https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
*/

uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;

size_t left = len;
size_t vl = __riscv_vsetvlmax_e8m1();
vl = vl > 256 ? 256 : vl;
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint16m2_t v_buf16_accu;

/*
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
* accumulators to boost performance.
*
* The block_size is the largest multiple of vl that <= 256, because overflow would occur when
* vl > 256 (255 * 256 <= UINT16_MAX).
*
* We accumulate 8-bit data into a 16-bit accumulator and then
* move the data into the 32-bit accumulator at the last iteration.
*/
size_t block_size = (256 / vl) * vl;
size_t nmax_limit = (NMAX / block_size);
size_t cnt = 0;
while (left >= block_size) {
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t subprob = block_size;
while (subprob > 0) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
buf += vl;
subprob -= vl;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
left -= block_size;
/* do modulo once each block of NMAX size */
if (++cnt >= nmax_limit) {
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
cnt = 0;
}
}
/* the left len <= 256 now, we can use 16-bit accum safely */
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t res = left;
while (left >= vl) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
buf += vl;
left -= vl;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

sum2 += (sum2_sum + adler * (len - left));

vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

adler += adler_sum;

while (left--) {
adler += *buf++;
sum2 += adler;
}

sum2 %= BASE;
adler %= BASE;

return adler | (sum2 << 16);
}

#endif /* ADLER32_SIMD_SSSE3 */
32 changes: 28 additions & 4 deletions deps/zlib/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,13 @@ int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;

int ZLIB_INTERNAL riscv_cpu_enable_rvv = 0;
int ZLIB_INTERNAL riscv_cpu_enable_vclmul = 0;

#ifndef CPU_NO_SIMD

#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
#include <pthread.h>
#endif

Expand All @@ -62,7 +66,10 @@ int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
static void _cpu_check_features(void);
#endif

#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS)
#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || \
defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS) || \
defined(RISCV_RVV)
#if !defined(ARMV8_OS_MACOS)
// _cpu_check_features() doesn't need to do anything on mac/arm since all
// features are known at build time, so don't call it.
Expand Down Expand Up @@ -184,6 +191,23 @@ static void _cpu_check_features(void)
x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
#endif
}
#endif // x86 & NO_SIMD

#elif defined(RISCV_RVV)
#include <sys/auxv.h>

#ifndef ZLIB_HWCAP_RVV
#define ZLIB_HWCAP_RVV (1 << ('v' - 'a'))
#endif
#endif
#endif

/* TODO(cavalcantii)
* - add support for Android@RISCV i.e. __riscv_hwprobe().
* - detect vclmul (crypto extensions).
*/
static void _cpu_check_features(void)
{
unsigned long features = getauxval(AT_HWCAP);
riscv_cpu_enable_rvv = !!(features & ZLIB_HWCAP_RVV);
}
#endif // ARM | x86 | RISCV
#endif // NO SIMD CPU
3 changes: 3 additions & 0 deletions deps/zlib/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3;
extern int x86_cpu_enable_simd;
extern int x86_cpu_enable_avx512;

extern int riscv_cpu_enable_rvv;
extern int riscv_cpu_enable_vclmul;

void cpu_check_features(void);
6 changes: 4 additions & 2 deletions deps/zlib/crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,8 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
* place to cache CPU features if needed for those later, more
* interesting crc32() calls.
*/
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
|| defined(RISCV_RVV)
/*
* Since this routine can be freely used, check CPU features here.
*/
Expand Down Expand Up @@ -1085,7 +1086,8 @@ unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
/* Some bots compile with optimizations disabled, others will emulate
* ARM on x86 and other weird combinations.
*/
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
|| defined(RISCV_RVV)
/* We got to verify CPU features, so exploit the common usage pattern
* of calling this function with Z_NULL for an initial valid crc value.
* This allows to cache the result of the feature check and avoid extraneous
Expand Down
3 changes: 2 additions & 1 deletion deps/zlib/deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,8 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
// for all wrapper formats (e.g. RAW, ZLIB, GZIP).
// Feature detection is not triggered while using RAW mode (i.e. we never
// call crc32() with a NULL buffer).
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) \
|| defined(RISCV_RVV)
cpu_check_features();
#endif

Expand Down

0 comments on commit 1152d7f

Please sign in to comment.