Skip to content

Commit

Permalink
zlib-ng 2022-08-17 (89763032)
Browse files Browse the repository at this point in the history
Code extracted from:

    https://github.com/zlib-ng/zlib-ng.git

at commit 89763032d57e3da5301f4cd6e1f363e7a1f85f02 (develop).
  • Loading branch information
kwrobot authored and hjmjohnson committed Aug 25, 2022
1 parent 4cab5b3 commit 7b40b99
Show file tree
Hide file tree
Showing 80 changed files with 11,604 additions and 2,794 deletions.
334 changes: 18 additions & 316 deletions CMakeLists.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ with zlib, then zlib-ng will temporarily be used instead by the program,
without risking system-wide instability.

```
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.11.zlib-ng /usr/bin/program
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.12.zlib-ng /usr/bin/program
```

### Cmake
Expand Down
2 changes: 1 addition & 1 deletion adler32.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "adler32_p.h"

/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t sum2;
unsigned n;

Expand Down
22 changes: 22 additions & 0 deletions adler32_fold.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/* adler32_fold.c -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "zbuild.h"
#include "functable.h"
#include "adler32_fold.h"

/* Compute the Adler-32 checksum of src and copy it to dst in one call.
 * Generic (non-SIMD) fallback: checksum first via the dispatched
 * functable implementation, then copy the bytes. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
    adler = functable.adler32(adler, src, len);
    /* memcpy takes a size_t; on targets where size_t is narrower than
     * uint64_t, split the copy into SIZE_MAX-sized pieces. */
    for (; len > SIZE_MAX; len -= SIZE_MAX) {
        memcpy(dst, src, SIZE_MAX);
        dst += SIZE_MAX;
        src += SIZE_MAX;
    }
    if (len != 0)
        memcpy(dst, src, (size_t)len);
    return adler;
}
11 changes: 11 additions & 0 deletions adler32_fold.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/* adler32_fold.h -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_

Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);

#endif
18 changes: 15 additions & 3 deletions adler32_p.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}

static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, uint32_t sum2) {
/* Fold a single remaining byte into the split Adler-32 sums and
 * recombine them: low 16 bits hold s1 (adler), high 16 bits hold s2. */
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
    adler = (adler + buf[0]) % BASE;
    sum2 = (sum2 + adler) % BASE;
    return (sum2 << 16) | adler;
}

static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
while (len) {
--len;
adler += *buf++;
Expand All @@ -38,7 +38,19 @@ static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf,
return adler | (sum2 << 16);
}

static inline uint32_t adler32_len_64(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
/* Copy bytes from buf to dst while folding them into the split Adler-32
 * sums, then recombine. The modulo is deferred until after the loop —
 * per the _len_16 naming this presumably relies on the caller passing a
 * small len so the deferred sums cannot overflow (TODO confirm). */
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
    while (len != 0) {
        --len;
        *dst = *buf++;
        adler += *dst++;
        sum2 += adler;
    }
    adler %= BASE;
    sum2 %= BASE; /* only added so many BASE's */
    /* return recombined sums */
    return (sum2 << 16) | adler;
}

static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
Expand Down
6 changes: 3 additions & 3 deletions arch/arm/adler32_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "../../adler32_p.h"
#include "../../fallback_builtins.h"

static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
Expand Down Expand Up @@ -138,15 +138,15 @@ static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
s[1] = vget_lane_u32(as, 1);
}

static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
/* Scalar fallback for the trailing bytes the vector loop cannot cover:
 * accumulate each byte into the running s1/s2 pair (pair[0] = s1,
 * pair[1] = s2), without the modulo reduction (caller reduces).
 *
 * Fix: the loop counter was `unsigned int`, compared against a
 * `uint64_t` len; for len >= 2^32 the counter would wrap and the loop
 * would never terminate. Use a 64-bit counter matching len's type. */
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
    uint64_t i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
Expand Down
53 changes: 49 additions & 4 deletions arch/arm/chunkset_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# include <arm_neon.h>
#endif
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"

typedef uint8x16_t chunk_t;

Expand All @@ -17,28 +18,44 @@ typedef uint8x16_t chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG

/* Lookup table indexed by (dist - 3): maps a match distance to the byte
 * offset of its permutation vector inside permute_table (rows are 32 bytes
 * apart) plus the remainder count — the entries show 16 % dist, i.e. how
 * many pattern bytes are left over after whole repeats fill a 16-byte
 * chunk. Power-of-two distances need no permutation ("don't care"). */
static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};

/* Broadcast helpers: read a 2/4/8-byte pattern from `from` and replicate
 * it across a full 16-byte NEON chunk via vdupq.
 * NOTE(review): this commit-page scrape shows BOTH the removed
 * zmemcpy_N() call and its memcpy() replacement inside each function; the
 * actual post-commit source contains only the memcpy() line — verify
 * against the upstream file before using this text as source. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t tmp;
    zmemcpy_2(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t tmp;
    zmemcpy_4(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t tmp;
    zmemcpy_8(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}

#define CHUNKSIZE chunksize_neon
#define CHUNKCOPY chunkcopy_neon
#define CHUNKCOPY_SAFE chunkcopy_safe_neon
#define CHUNKUNROLL chunkunroll_neon
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
Expand All @@ -51,6 +68,34 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vst1q_u8(out, *chunk);
}

/* Build a 16-byte chunk whose contents repeat the `dist`-byte pattern at
 * `buf`, using a shuffle with a precomputed permutation vector from
 * permute_table. *chunk_rem receives the leftover ("remainder") count
 * from perm_idx_lut for this distance. Reads up to 16 bytes from buf
 * even when dist < 16 — the msan unpoison below covers that over-read. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    *chunk_rem = lut_rem.remval;

#ifdef Z_MEMORY_SANITIZER
    /* See note in chunkset_sse41.c for why this is ok */
    __msan_unpoison(buf + dist, 16 - dist);
#endif

    /* This version of table is only available on aarch64 */
#if defined(_M_ARM64) || defined(__aarch64__)
    /* aarch64: vqtbl1q_u8 does the full 16-byte table lookup in one shot. */
    uint8x16_t ret_vec = vld1q_u8(buf);

    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
    return vqtbl1q_u8(ret_vec, perm_vec);
#else
    /* 32-bit NEON: vtbl operates on 8-byte vectors, so shuffle the two
     * halves separately (the high half may index into both `a` and `b`,
     * hence the two-register vtbl2) and recombine. */
    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
    a = vld1_u8(buf);
    b = vld1_u8(buf + 8);
    ret0 = vtbl1_u8(a, perm_vec0);
    uint8x8x2_t ab = {{a, b}};
    ret1 = vtbl2_u8(ab, perm_vec1);
    return vcombine_u8(ret0, ret1);
#endif
}

#include "chunkset_tpl.h"

#endif
36 changes: 11 additions & 25 deletions arch/arm/crc32_acle.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#endif
#include "../../zbuild.h"

uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, uint64_t len) {
Z_REGISTER uint32_t c;
Z_REGISTER const uint16_t *buf2;
Z_REGISTER const uint32_t *buf4;
Expand All @@ -22,7 +22,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
len--;
}

if ((len > sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
buf2 = (const uint16_t *) buf;
c = __crc32h(c, *buf2++);
len -= sizeof(uint16_t);
Expand All @@ -32,22 +32,17 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
}

#if defined(__aarch64__)
if ((len > sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
c = __crc32w(c, *buf4++);
len -= sizeof(uint32_t);
}

const uint64_t *buf8 = (const uint64_t *) buf4;

#ifdef UNROLL_MORE
while (len >= 4 * sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
len -= 4 * sizeof(uint64_t);
if (len == 0) {
c = ~c;
return c;
}
#endif

const uint64_t *buf8 = (const uint64_t *) buf4;

while (len >= sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
Expand All @@ -71,19 +66,10 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
buf = (const unsigned char *) buf2;
#else /* __aarch64__ */

# ifdef UNROLL_MORE
while (len >= 8 * sizeof(uint32_t)) {
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
c = __crc32w(c, *buf4++);
len -= 8 * sizeof(uint32_t);
if (len == 0) {
c = ~c;
return c;
}
# endif

while (len >= sizeof(uint32_t)) {
c = __crc32w(c, *buf4++);
Expand Down
53 changes: 53 additions & 0 deletions arch/generic/chunk_permute_table.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef CHUNK_PERMUTE_TABLE_H_
#define CHUNK_PERMUTE_TABLE_H_

#include "zbuild.h"

/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
/* Each row holds source-byte indices that repeat the pattern 0..dist-1 so a
 * single shuffle can replicate a short match pattern across a SIMD chunk.
 * NOTE(review): the array is declared 26*32 bytes, but in this scrape the
 * rows for dist >= 17 show only 16 entries each — the totals don't add up
 * to 832, so these lines look truncated by the page extraction; verify the
 * table contents against the upstream file before relying on this text. */
static const ALIGNED_(32) uint8_t permute_table[26*32] = {
    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */

/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
 * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
 * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
 * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
 * this is what we're dealt.
 */

    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
};

/* Pairs a permutation-vector location with its leftover count for one
 * match distance (see the per-architecture perm_idx_lut tables). */
typedef struct lut_rem_pair_s {
    uint16_t idx;    /* byte offset of this distance's row in permute_table */
    uint16_t remval; /* remainder: pattern bytes left after whole repeats */
} lut_rem_pair;

#endif
2 changes: 1 addition & 1 deletion arch/power/adler32_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsi
return __a;
}

uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;

Expand Down
6 changes: 3 additions & 3 deletions arch/power/adler32_vmx.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@

#define vmx_zero() (vec_splat_u32(0))

static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
/* Scalar head/tail handler for the VMX Adler-32: accumulate each byte
 * into the running s1/s2 pair (pair[0] = s1, pair[1] = s2) without the
 * modulo reduction (caller reduces).
 *
 * Fix: the loop counter was `unsigned int`, compared against a
 * `uint64_t` len; for len >= 2^32 the counter would wrap and the loop
 * would never terminate. Use a 64-bit counter matching len's type
 * (same fix as NEON_handle_tail in arch/arm/adler32_neon.c). */
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
    uint64_t i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
Expand Down Expand Up @@ -113,7 +113,7 @@ static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
vec_ste(s2acc, 0, s+1);
}

uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
Expand Down
7 changes: 3 additions & 4 deletions arch/power/chunkset_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,24 @@ typedef vector unsigned char chunk_t;

/* Broadcast helpers: read a 2/4/8-byte pattern from `from` and replicate
 * it across a full VMX chunk via vec_splats.
 * NOTE(review): this commit-page scrape shows BOTH the removed
 * zmemcpy_N() call and its memcpy() replacement inside each function; the
 * actual post-commit source contains only the memcpy() line — verify
 * against the upstream file before using this text as source. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t tmp;
    zmemcpy_2(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = (vector unsigned char)vec_splats(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t tmp;
    zmemcpy_4(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = (vector unsigned char)vec_splats(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t tmp;
    zmemcpy_8(&tmp, from);
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = (vector unsigned char)vec_splats(tmp);
}

#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKCOPY_SAFE chunkcopy_safe_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
Expand Down
Loading

0 comments on commit 7b40b99

Please sign in to comment.