Skip to content

Commit

Permalink
zlib-ng 2022-08-17 (89763032)
Browse files Browse the repository at this point in the history
Code extracted from:

    https://github.com/zlib-ng/zlib-ng.git

at commit 89763032d57e3da5301f4cd6e1f363e7a1f85f02 (develop).
  • Loading branch information
kwrobot authored and hjmjohnson committed Aug 25, 2022
1 parent 4cab5b3 commit 7b40b99
Show file tree
Hide file tree
Showing 80 changed files with 11,604 additions and 2,794 deletions.
334 changes: 18 additions & 316 deletions CMakeLists.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ with zlib, then zlib-ng will temporarily be used instead by the program,
without risking system-wide instability.

```
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.11.zlib-ng /usr/bin/program
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.12.zlib-ng /usr/bin/program
```

### Cmake
Expand Down
2 changes: 1 addition & 1 deletion adler32.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "adler32_p.h"

/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t sum2;
unsigned n;

Expand Down
22 changes: 22 additions & 0 deletions adler32_fold.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/* adler32_fold.c -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "zbuild.h"
#include "functable.h"
#include "adler32_fold.h"

/* Compute the Adler-32 checksum of src and copy it to dst in one call.
 * Generic (non-SIMD) fallback: checksum first via the dispatched
 * functable implementation, then copy the bytes. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
    adler = functable.adler32(adler, src, len);
    /* memcpy takes a size_t; on targets where size_t is narrower than
     * uint64_t, split the copy into SIZE_MAX-sized pieces. */
    for (; len > SIZE_MAX; len -= SIZE_MAX) {
        memcpy(dst, src, SIZE_MAX);
        dst += SIZE_MAX;
        src += SIZE_MAX;
    }
    if (len != 0)
        memcpy(dst, src, (size_t)len);
    return adler;
}
11 changes: 11 additions & 0 deletions adler32_fold.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/* adler32_fold.h -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_

Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);

#endif
18 changes: 15 additions & 3 deletions adler32_p.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}

static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, uint32_t sum2) {
/* Fold a single remaining byte into the split Adler-32 sums and
 * recombine them: low 16 bits hold s1 (adler), high 16 bits hold s2. */
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
    adler = (adler + buf[0]) % BASE;
    sum2 = (sum2 + adler) % BASE;
    return (sum2 << 16) | adler;
}

static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
while (len) {
--len;
adler += *buf++;
Expand All @@ -38,7 +38,19 @@ static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf,
return adler | (sum2 << 16);
}

static inline uint32_t adler32_len_64(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
/* Copy bytes from buf to dst while folding them into the split Adler-32
 * sums, then recombine. The modulo is deferred until after the loop —
 * per the _len_16 naming this presumably relies on the caller passing a
 * small len so the deferred sums cannot overflow (TODO confirm). */
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
    while (len != 0) {
        --len;
        *dst = *buf++;
        adler += *dst++;
        sum2 += adler;
    }
    adler %= BASE;
    sum2 %= BASE; /* only added so many BASE's */
    /* return recombined sums */
    return (sum2 << 16) | adler;
}

static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
Expand Down
6 changes: 3 additions & 3 deletions arch/arm/adler32_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "../../adler32_p.h"
#include "../../fallback_builtins.h"

static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
Expand Down Expand Up @@ -138,15 +138,15 @@ static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
s[1] = vget_lane_u32(as, 1);
}

static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
/* Scalar fallback for the trailing bytes the vector loop cannot cover:
 * accumulate each byte into the running s1/s2 pair (pair[0] = s1,
 * pair[1] = s2), without the modulo reduction (caller reduces).
 *
 * Fix: the loop counter was `unsigned int`, compared against a
 * `uint64_t` len; for len >= 2^32 the counter would wrap and the loop
 * would never terminate. Use a 64-bit counter matching len's type. */
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
    uint64_t i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
Expand Down
53 changes: 49 additions & 4 deletions arch/arm/chunkset_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# include <arm_neon.h>
#endif
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"

typedef uint8x16_t chunk_t;

Expand All @@ -17,28 +18,44 @@ typedef uint8x16_t chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG

/* Lookup table indexed by (dist - 3): maps a match distance to the byte
 * offset of its permutation vector inside permute_table (rows are 32 bytes
 * apart) plus the remainder count — the entries show 16 % dist, i.e. how
 * many pattern bytes are left over after whole repeats fill a 16-byte
 * chunk. Power-of-two distances need no permutation ("don't care"). */
static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};

/* Broadcast helpers: read a 2/4/8-byte pattern from `from` and replicate
 * it across a full 16-byte NEON chunk via vdupq.
 * NOTE(review): this commit-page scrape shows BOTH the removed
 * zmemcpy_N() call and its memcpy() replacement inside each function; the
 * actual post-commit source contains only the memcpy() line — verify
 * against the upstream file before using this text as source. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t tmp;
    zmemcpy_2(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t tmp;
    zmemcpy_4(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t tmp;
    zmemcpy_8(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}

#define CHUNKSIZE chunksize_neon
#define CHUNKCOPY chunkcopy_neon
#define CHUNKCOPY_SAFE chunkcopy_safe_neon
#define CHUNKUNROLL chunkunroll_neon
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
Expand All @@ -51,6 +68,34 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vst1q_u8(out, *chunk);
}

/* Build a 16-byte chunk whose contents repeat the `dist`-byte pattern at
 * `buf`, using a shuffle with a precomputed permutation vector from
 * permute_table. *chunk_rem receives the leftover ("remainder") count
 * from perm_idx_lut for this distance. Reads up to 16 bytes from buf
 * even when dist < 16 — the msan unpoison below covers that over-read. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    *chunk_rem = lut_rem.remval;

#ifdef Z_MEMORY_SANITIZER
    /* See note in chunkset_sse41.c for why this is ok */
    __msan_unpoison(buf + dist, 16 - dist);
#endif

    /* This version of table is only available on aarch64 */
#if defined(_M_ARM64) || defined(__aarch64__)
    /* aarch64: vqtbl1q_u8 does the full 16-byte table lookup in one shot. */
    uint8x16_t ret_vec = vld1q_u8(buf);

    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
    return vqtbl1q_u8(ret_vec, perm_vec);
#else
    /* 32-bit NEON: vtbl operates on 8-byte vectors, so shuffle the two
     * halves separately (the high half may index into both `a` and `b`,
     * hence the two-register vtbl2) and recombine. */
    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
    a = vld1_u8(buf);
    b = vld1_u8(buf + 8);
    ret0 = vtbl1_u8(a, perm_vec0);
    uint8x8x2_t ab = {{a, b}};
    ret1 = vtbl2_u8(ab, perm_vec1);
    return vcombine_u8(ret0, ret1);
#endif
}

#include "chunkset_tpl.h"

#endif
36 changes: 11 additions & 25 deletions arch/arm/crc32_acle.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#endif
#include "../../zbuild.h"

uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, uint64_t len) {
Z_REGISTER uint32_t c;
Z_REGISTER const uint16_t *buf2;
Z_REGISTER const uint32_t *buf4;
Expand All @@ -22,7 +22,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
len--;
}

if ((len > sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
buf2 = (const uint16_t *) buf;
c = __crc32h(c, *buf2++);
len -= sizeof(uint16_t);
Expand All @@ -32,22 +32,17 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
}

#if defined(__aarch64__)
if ((len > sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
c = __crc32w(c, *buf4++);
len -= sizeof(uint32_t);
}

const uint64_t *buf8 = (const uint64_t *) buf4;

#ifdef UNROLL_MORE
while (len >= 4 * sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
len -= 4 * sizeof(uint64_t);
if (len == 0) {
c = ~c;
return c;
}
#endif

const uint64_t *buf8 = (const uint64_t *) buf4;

while (len >= sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
Expand All @@ -71,19 +66,10 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
buf = (const unsigned char *) buf2;
#else /* __aarch64__ */

# ifdef UNROLL_MORE
while (len >= 8 * sizeof(uint32_t)) {
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
len -= 8 * sizeof(uint32_t);
if (len == 0) {
c = ~c;
return c;
}
# endif

while (len >= sizeof(uint32_t)) {
c = __crc32w(c, *buf4++);
Expand Down
53 changes: 53 additions & 0 deletions arch/generic/chunk_permute_table.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef CHUNK_PERMUTE_TABLE_H_
#define CHUNK_PERMUTE_TABLE_H_

#include "zbuild.h"

/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
/* Each row holds source-byte indices that repeat the pattern 0..dist-1 so a
 * single shuffle can replicate a short match pattern across a SIMD chunk.
 * NOTE(review): the array is declared 26*32 bytes, but in this scrape the
 * rows for dist >= 17 show only 16 entries each — the totals don't add up
 * to 832, so these lines look truncated by the page extraction; verify the
 * table contents against the upstream file before relying on this text. */
static const ALIGNED_(32) uint8_t permute_table[26*32] = {
    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */

/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
 * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
 * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
 * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
 * this is what we're dealt.
 */

    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
};

/* Pairs a permutation-vector location with its leftover count for one
 * match distance (see the per-architecture perm_idx_lut tables). */
typedef struct lut_rem_pair_s {
    uint16_t idx;    /* byte offset of this distance's row in permute_table */
    uint16_t remval; /* remainder: pattern bytes left after whole repeats */
} lut_rem_pair;

#endif
2 changes: 1 addition & 1 deletion arch/power/adler32_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsi
return __a;
}

uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;

Expand Down
6 changes: 3 additions & 3 deletions arch/power/adler32_vmx.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@

#define vmx_zero() (vec_splat_u32(0))

static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
/* Scalar head/tail handler for the VMX Adler-32: accumulate each byte
 * into the running s1/s2 pair (pair[0] = s1, pair[1] = s2) without the
 * modulo reduction (caller reduces).
 *
 * Fix: the loop counter was `unsigned int`, compared against a
 * `uint64_t` len; for len >= 2^32 the counter would wrap and the loop
 * would never terminate. Use a 64-bit counter matching len's type
 * (same fix as NEON_handle_tail in arch/arm/adler32_neon.c). */
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
    uint64_t i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
Expand Down Expand Up @@ -113,7 +113,7 @@ static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
vec_ste(s2acc, 0, s+1);
}

uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
Expand Down
7 changes: 3 additions & 4 deletions arch/power/chunkset_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,24 @@ typedef vector unsigned char chunk_t;

/* Broadcast helpers: read a 2/4/8-byte pattern from `from` and replicate
 * it across a full VMX chunk via vec_splats.
 * NOTE(review): this commit-page scrape shows BOTH the removed
 * zmemcpy_N() call and its memcpy() replacement inside each function; the
 * actual post-commit source contains only the memcpy() line — verify
 * against the upstream file before using this text as source. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t tmp;
    zmemcpy_2(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = (vector unsigned char)vec_splats(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t tmp;
    zmemcpy_4(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = (vector unsigned char)vec_splats(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t tmp;
    zmemcpy_8(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = (vector unsigned char)vec_splats(tmp);
}

#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKCOPY_SAFE chunkcopy_safe_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
Expand Down
Loading

0 comments on commit 7b40b99

Please sign in to comment.