Skip to content

Commit

Permalink
deps: update zlib to 1.3.0.1-motley-7d77fb7
Browse files Browse the repository at this point in the history
PR-URL: #52516
Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com>
Reviewed-By: Mohammed Keyvanzadeh <mohammadkeyvanzade94@gmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
  • Loading branch information
nodejs-github-bot authored and marco-ippolito committed May 3, 2024
1 parent 6fdd748 commit 9c330b6
Show file tree
Hide file tree
Showing 10 changed files with 1,032 additions and 99 deletions.
30 changes: 30 additions & 0 deletions deps/zlib/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,36 @@ executable("zlib_bench") {
configs += [ "//build/config/compiler:no_chromium_code" ]
}

executable("minigzip") {
include_dirs = [ "." ]

sources = [ "test/minigzip.c" ]
if (!is_debug) {
configs -= [ "//build/config/compiler:default_optimization" ]
configs += [ "//build/config/compiler:optimize_speed" ]
}

deps = [ ":zlib" ]

configs -= [ "//build/config/compiler:chromium_code" ]
configs += [ "//build/config/compiler:no_chromium_code" ]
}

executable("zpipe") {
include_dirs = [ "." ]

sources = [ "examples/zpipe.c" ]
if (!is_debug) {
configs -= [ "//build/config/compiler:default_optimization" ]
configs += [ "//build/config/compiler:optimize_speed" ]
}

deps = [ ":zlib" ]

configs -= [ "//build/config/compiler:chromium_code" ]
configs += [ "//build/config/compiler:no_chromium_code" ]
}

if (!is_win || target_os != "winuwp") {
executable("minizip_bin") {
include_dirs = [ "." ]
Expand Down
37 changes: 33 additions & 4 deletions deps/zlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF)
option(USE_ZLIB_RABIN_KARP_HASH "Enable bitstream compatibility with canonical zlib" OFF)
option(BUILD_UNITTESTS "Enable standalone unit tests build" OFF)
option(BUILD_MINIZIP_BIN "Enable building minzip_bin tool" OFF)
option(BUILD_ZPIPE "Enable building zpipe tool" OFF)
option(BUILD_MINIGZIP "Enable building minigzip tool" OFF)

if (USE_ZLIB_RABIN_KARP_HASH)
add_definitions(-DUSE_ZLIB_RABIN_KARP_ROLLING_HASH)
Expand Down Expand Up @@ -79,9 +81,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
add_definitions(-DRISCV_RVV)
add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
add_definitions(-DADLER32_SIMD_RVV)
#TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
# Required by CPU features detection code.
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")

# TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
# chunk_copy is required for READ64 and unconditional decode of literals.
add_definitions(-DINFLATE_CHUNK_GENERIC)
add_definitions(-DINFLATE_CHUNK_READ_64LE)

# Tested with clang-17, unaligned loads are required by read64 & chunk_copy.
# TODO(cavalcantii): replace internal clang flags for -munaligned-access
# when we have a newer compiler available.
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv -Xclang -target-feature -Xclang +unaligned-scalar-mem")
endif()

endif()
Expand Down Expand Up @@ -192,9 +201,14 @@ set(ZLIB_SRCS
if (ENABLE_SIMD_OPTIMIZATIONS)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
message("RISCVV: Add optimizations.")
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)

list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
else()
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
Expand Down Expand Up @@ -339,7 +353,7 @@ if (BUILD_UNITTESTS)
endif()

#============================================================================
# Minigzip tool
# Minizip tool
#============================================================================
# TODO(cavalcantii): get it working on Windows.
if (BUILD_MINIZIP_BIN)
Expand All @@ -349,3 +363,18 @@ if (BUILD_MINIZIP_BIN)
)
target_link_libraries(minizip_bin zlib)
endif()

#============================================================================
# zpipe tool
#============================================================================
if (BUILD_ZPIPE)
add_executable(zpipe examples/zpipe.c)
target_link_libraries(zpipe zlib)
endif()
#============================================================================
# MiniGzip tool
#============================================================================
if (BUILD_MINIGZIP)
add_executable(minigzip_bin test/minigzip.c)
target_link_libraries(minigzip_bin zlib)
endif()
166 changes: 76 additions & 90 deletions deps/zlib/adler32_simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "adler32_simd.h"

Expand Down Expand Up @@ -368,103 +365,92 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */

#elif defined(ADLER32_SIMD_RVV)
#include <riscv_vector.h>
/* adler32_rvv.c - RVV version of Adler-32
* RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
* on https://github.com/zlib-ng/zlib-ng/pull/1532
* Port from Simon Hosie's fork:
* https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1

/*
* Patch by Simon Hosie, from:
* https://github.com/cloudflare/zlib/pull/55
*/

uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;

size_t left = len;
size_t vl = __riscv_vsetvlmax_e8m1();
vl = vl > 256 ? 256 : vl;
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint16m2_t v_buf16_accu;

/*
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
* accumulators to boost performance.
*
* The block_size is the largest multiple of vl that <= 256, because overflow would occur when
* vl > 256 (255 * 256 <= UINT16_MAX).
*
* We accumulate 8-bit data into a 16-bit accumulator and then
* move the data into the 32-bit accumulator at the last iteration.
size_t vl = __riscv_vsetvlmax_e8m2();
const vuint16m4_t zero16 = __riscv_vmv_v_x_u16m4(0, vl);
vuint16m4_t a_sum = zero16;
vuint32m8_t b_sum = __riscv_vmv_v_x_u32m8(0, vl);

/* Deal with the part which is not a multiple of vl first; because it's
* easier to zero-stuff the beginning of the checksum than it is to tweak the
* multipliers and sums for odd lengths afterwards.
*/
size_t head = len & (vl - 1);
if (head > 0) {
vuint8m2_t zero8 = __riscv_vmv_v_x_u8m2(0, vl);
vuint8m2_t in = __riscv_vle8_v_u8m2(buf, vl);
in = __riscv_vslideup(zero8, in, vl - head, vl);
vuint16m4_t in16 = __riscv_vwcvtu_x(in, vl);
a_sum = in16;
buf += head;
}

/* We have a 32-bit accumulator, and in each iteration we add 22-times a
* 16-bit value, plus another 16-bit value. We periodically subtract up to
* 65535 times BASE to avoid overflow. b_overflow estimates how often we
* need to do this subtraction.
*/
const int b_overflow = BASE / 23;
int fixup = b_overflow;
ssize_t iters = (len - head) / vl;
while (iters > 0) {
const vuint16m4_t a_overflow = __riscv_vrsub(a_sum, BASE, vl);
int batch = iters < 22 ? iters : 22;
iters -= batch;
b_sum = __riscv_vwmaccu(b_sum, batch, a_sum, vl);
vuint16m4_t a_batch = zero16, b_batch = zero16;

/* Do a short batch, where neither a_sum nor b_sum can overflow a 16-bit
* register. Then add them back into the main accumulators.
*/
size_t block_size = (256 / vl) * vl;
size_t nmax_limit = (NMAX / block_size);
size_t cnt = 0;
while (left >= block_size) {
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t subprob = block_size;
while (subprob > 0) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
buf += vl;
subprob -= vl;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
left -= block_size;
/* do modulo once each block of NMAX size */
if (++cnt >= nmax_limit) {
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
cnt = 0;
}
while (batch-- > 0) {
vuint8m2_t in8 = __riscv_vle8_v_u8m2(buf, vl);
buf += vl;
b_batch = __riscv_vadd(b_batch, a_batch, vl);
a_batch = __riscv_vwaddu_wv(a_batch, in8, vl);
}
/* the left len <= 256 now, we can use 16-bit accum safely */
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t res = left;
while (left >= vl) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
buf += vl;
left -= vl;
vbool4_t ov = __riscv_vmsgeu(a_batch, a_overflow, vl);
a_sum = __riscv_vadd(a_sum, a_batch, vl);
a_sum = __riscv_vadd_mu(ov, a_sum, a_sum, 65536 - BASE, vl);
b_sum = __riscv_vwaddu_wv(b_sum, b_batch, vl);
if (--fixup <= 0) {
b_sum = __riscv_vnmsac(b_sum, BASE, __riscv_vsrl(b_sum, 16, vl), vl);
fixup = b_overflow;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

sum2 += (sum2_sum + adler * (len - left));

vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

adler += adler_sum;

while (left--) {
adler += *buf++;
sum2 += adler;
}

sum2 %= BASE;
adler %= BASE;

return adler | (sum2 << 16);
}
/* Adjust per-lane sums to have appropriate offsets from the end of the
* buffer.
*/
const vuint16m4_t off = __riscv_vrsub(__riscv_vid_v_u16m4(vl), vl, vl);
vuint16m4_t bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
b_sum = __riscv_vadd(__riscv_vwmulu(a_sum, off, vl),
__riscv_vwmulu(bsum16, vl, vl), vl);
bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);

/* And finally, do a horizontal sum across the registers for the final
* result.
*/
uint32_t a = adler & 0xffff;
uint32_t b = ((adler >> 16) + a * (len % BASE)) % BASE;
vuint32m1_t sca = __riscv_vmv_v_x_u32m1(a, 1);
vuint32m1_t scb = __riscv_vmv_v_x_u32m1(b, 1);
sca = __riscv_vwredsumu(a_sum, sca, vl);
scb = __riscv_vwredsumu(bsum16, scb, vl);
a = __riscv_vmv_x(sca);
b = __riscv_vmv_x(scb);
a %= BASE;
b %= BASE;
return (b << 16) | a;
}

#endif /* ADLER32_SIMD_SSSE3 */
75 changes: 75 additions & 0 deletions deps/zlib/contrib/optimizations/chunkcopy.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#define Z_BUILTIN_MEMSET __builtin_memset
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#define Z_BUILTIN_MEMSET zmemset
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
Expand All @@ -31,6 +33,8 @@ typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#elif defined(INFLATE_CHUNK_GENERIC)
typedef struct { uint8_t x[16]; } z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif
Expand Down Expand Up @@ -265,6 +269,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#elif defined(INFLATE_CHUNK_GENERIC)
/*
* Default implementations for chunk-copy functions rely on memcpy() being
* inlined by the compiler for best performance. This is most likely to work
* as expected when the length argument is constant (as is the case here) and
* the target supports unaligned loads and stores. Since that's not always a
* safe assumption, this may need extra compiler arguments such as
* `-mno-strict-align` or `-munaligned-access`, or the availability of
* extensions like SIMD.
*/

/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
int64_t in;
Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
z_vec128i_t out;
for (int i = 0; i < sizeof(out); i += sizeof(in)) {
Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
}
return out;
}

/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t in;
Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
z_vec128i_t out;
for (int i = 0; i < sizeof(out); i += sizeof(in)) {
Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
}
return out;
}

/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t in;
Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
z_vec128i_t out;
for (int i = 0; i < sizeof(out); i += sizeof(in)) {
Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
}
return out;
}

/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
int8_t in = *(const uint8_t*)src;
z_vec128i_t out;
Z_BUILTIN_MEMSET(&out, in, sizeof(out));
return out;
}

/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
}
#endif

/*
Expand Down
Loading

0 comments on commit 9c330b6

Please sign in to comment.