Skip to content

Commit

Permalink
zlib-ng 2022-05-12 (41d67396)
Browse files Browse the repository at this point in the history
Code extracted from:

    https://github.com/zlib-ng/zlib-ng.git

at commit 41d67396924ccc7ab1ff9a7e7d434bfb0887b136 (develop).
  • Loading branch information
kwrobot authored and bradking committed May 12, 2022
1 parent f2bb813 commit 4cab5b3
Show file tree
Hide file tree
Showing 35 changed files with 394 additions and 159 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
*.h text
Makefile text
configure text eol=lf
* -whitespace
27 changes: 23 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ if(BASEARCH_ARM_FOUND)
elseif(BASEARCH_PPC_FOUND)
option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON)
option(WITH_POWER8 "Build with optimisations for POWER8" ON)
option(WITH_POWER9 "Build with optimisations for POWER9" ON)
elseif(BASEARCH_S360_FOUND)
option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF)
option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF)
Expand Down Expand Up @@ -138,6 +139,7 @@ mark_as_advanced(FORCE
WITH_PCLMULQDQ
WITH_ALTIVEC
WITH_POWER8
WITH_POWER9
WITH_INFLATE_STRICT
WITH_INFLATE_ALLOW_INVALID_DIST
WITH_UNALIGNED
Expand Down Expand Up @@ -255,7 +257,8 @@ endif()
if(NOT WITH_NATIVE_INSTRUCTIONS)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
foreach(_cfg_name IN LISTS CMAKE_CONFIGURATION_TYPES)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_${_cfg_name} OFF)
string(TOUPPER "${_cfg_name}" _cfg_name_uc)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_${_cfg_name_uc} OFF)
endforeach()
endif()

Expand Down Expand Up @@ -601,8 +604,9 @@ if(WITH_OPTIM)
if(WITH_NEON)
check_neon_compiler_flag()
if(MFPU_NEON_AVAILABLE)
add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH)
set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c ${ARCHDIR}/slide_hash_neon.c)
add_definitions(-DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH)
set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c
${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c)
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}")
if(MSVC)
Expand All @@ -626,7 +630,10 @@ if(WITH_OPTIM)
if(WITH_POWER8)
check_power8_intrinsics()
endif()
if(HAVE_VMX OR HAVE_POWER8_INTRIN)
if(WITH_POWER9)
check_power9_intrinsics()
endif()
if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c)
endif()
Expand Down Expand Up @@ -665,6 +672,17 @@ if(WITH_OPTIM)
set(WITH_POWER8 OFF)
endif()
endif()
# Power9 specific options and files
if(WITH_POWER9)
if(HAVE_POWER9_INTRIN)
add_definitions(-DPOWER9)
set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c)
list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS})
set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}")
else()
set(WITH_POWER9 OFF)
endif()
endif()
elseif(BASEARCH_S360_FOUND)
check_s390_intrinsics()
if(HAVE_S390_INTRIN)
Expand Down Expand Up @@ -1461,6 +1479,7 @@ if(BASEARCH_ARM_FOUND)
elseif(BASEARCH_PPC_FOUND)
add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations")
add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8")
add_feature_info(WITH_POWER9 WITH_POWER9 "Build with optimisations for POWER9")
elseif(BASEARCH_S360_FOUND)
add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z")
add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z")
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Features
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
* Compare256 implementations using SSE2 & AVX2
* Compare256 implementations using SSE2, AVX2, Neon, & POWER9
* Inflate chunk copying using SSE2, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
Expand Down
7 changes: 7 additions & 0 deletions arch/arm/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ all: \
adler32_neon.o adler32_neon.lo \
arm_features.o arm_features.lo \
chunkset_neon.o chunkset_neon.lo \
compare256_neon.o compare256_neon.lo \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
insert_string_acle.o insert_string_acle.lo
Expand All @@ -42,6 +43,12 @@ chunkset_neon.o:
chunkset_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c

compare256_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c

compare256_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c

crc32_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c

Expand Down
5 changes: 0 additions & 5 deletions arch/arm/chunkset_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,10 @@ typedef uint8x16_t chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
*chunk = vld1q_dup_u8(from);
}

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
uint16_t tmp;
zmemcpy_2(&tmp, from);
Expand Down
60 changes: 60 additions & 0 deletions arch/arm/compare256_neon.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/* compare256_neon.c - NEON version of compare256
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
#include "../../zbuild.h"

static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;

do {
uint8x16_t a, b, cmp;
uint64_t lane;

a = vld1q_u8(src0);
b = vld1q_u8(src1);

cmp = veorq_u8(a, b);

lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
if (lane) {
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
return len + match_byte;
}
len += 8;
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
if (lane) {
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
return len + match_byte;
}
len += 8;

src0 += 16, src1 += 16;
} while (len < 256);

return 256;
}

Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
return compare256_neon_static(src0, src1);
}

#define LONGEST_MATCH longest_match_neon
#define COMPARE256 compare256_neon_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_neon
#define COMPARE256 compare256_neon_static

#include "match_tpl.h"

#endif
9 changes: 9 additions & 0 deletions arch/power/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ INCLUDES=
SUFFIX=

P8FLAGS=-mcpu=power8
P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=

Expand All @@ -25,6 +26,8 @@ all: power_features.o \
adler32_vmx.lo \
chunkset_power8.o \
chunkset_power8.lo \
compare256_power9.o \
compare256_power9.lo \
crc32_power8.o \
crc32_power8.lo \
slide_hash_power8.o \
Expand Down Expand Up @@ -56,6 +59,12 @@ chunkset_power8.o:
chunkset_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

compare256_power9.o:
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

compare256_power9.lo:
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

crc32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

Expand Down
5 changes: 0 additions & 5 deletions arch/power/chunkset_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,10 @@ typedef vector unsigned char chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
*chunk = vec_splats(*from);
}

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
uint16_t tmp;
zmemcpy_2(&tmp, from);
Expand Down
66 changes: 66 additions & 0 deletions arch/power/compare256_power9.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/* compare256_power9.c - Power9 version of compare256
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifdef POWER9
#include <altivec.h>
#include "../../zbuild.h"
#include "../../zendian.h"

/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
#if defined(__GNUC__) && (__GNUC__ < 12)
# define zng_vec_vctzlsbb(vc, len) __asm__ volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
# define zng_vec_vclzlsbb(vc, len) __asm__ volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc))
#else
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
# define zng_vec_vclzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
#endif

static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0, cmplen;

do {
vector unsigned char vsrc0, vsrc1, vc;

vsrc0 = *((vector unsigned char *)src0);
vsrc1 = *((vector unsigned char *)src1);

/* Compare 16 bytes at a time. Each byte of vc will be either
* all ones or all zeroes, depending on the result of the comparison. */
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);

/* Since the index of matching bytes will contain only zeroes
* on vc (since we used cmpne), counting the number of consecutive
* bytes where LSB == 0 is the same as counting the length of the match. */
#if BYTE_ORDER == LITTLE_ENDIAN
zng_vec_vctzlsbb(vc, cmplen);
#else
zng_vec_vclzlsbb(vc, cmplen);
#endif
if (cmplen != 16)
return len + cmplen;

src0 += 16, src1 += 16, len += 16;
} while (len < 256);

return 256;
}

Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
return compare256_power9_static(src0, src1);
}

#define LONGEST_MATCH longest_match_power9
#define COMPARE256 compare256_power9_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_power9
#define COMPARE256 compare256_power9_static

#include "match_tpl.h"

#endif
3 changes: 3 additions & 0 deletions arch/power/power_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

Z_INTERNAL int power_cpu_has_altivec = 0;
Z_INTERNAL int power_cpu_has_arch_2_07 = 0;
Z_INTERNAL int power_cpu_has_arch_3_00 = 0;

void Z_INTERNAL power_check_features(void) {
#ifdef PPC_FEATURES
Expand All @@ -28,5 +29,7 @@ void Z_INTERNAL power_check_features(void) {

if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
power_cpu_has_arch_2_07 = 1;
if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
power_cpu_has_arch_3_00 = 1;
#endif
}
1 change: 1 addition & 0 deletions arch/power/power_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

extern int power_cpu_has_altivec;
extern int power_cpu_has_arch_2_07;
extern int power_cpu_has_arch_3_00;

void Z_INTERNAL power_check_features(void);

Expand Down
3 changes: 2 additions & 1 deletion arch/s390/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ integrated with the rest of zlib-ng using hook macros.
## Hook macros

DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer and a window. `ZALLOC_STATE()`, `ZFREE_STATE()`, `ZCOPY_STATE()`,
buffer and a window. `ZALLOC_DEFLATE_STATE()`, `ZALLOC_INFLATE_STATE()`,
`ZFREE_STATE()`, `ZCOPY_DEFLATE_STATE()`, `ZCOPY_INFLATE_STATE()`,
`ZALLOC_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate allocation
details for the parameter block (which is allocated alongside zlib-ng
state) and the window (which must be page-aligned).
Expand Down
53 changes: 0 additions & 53 deletions arch/s390/dfltcc_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,59 +12,6 @@
`posix_memalign' is not an option. Thus, we overallocate and take the
aligned portion of the buffer.
*/
static inline int is_dfltcc_enabled(void) {
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
Z_REGISTER uint8_t r0 __asm__("r0");

memset(facilities, 0, sizeof(facilities));
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
* is 64-bit, it's always z/Architecture mode at runtime.
*/
__asm__ volatile(
#ifndef __clang__
".machinemode push\n"
".machinemode zarch\n"
#endif
"stfle %[facilities]\n"
#ifndef __clang__
".machinemode pop\n"
#endif
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
}

void Z_INTERNAL PREFIX(dfltcc_reset)(PREFIX3(streamp) strm, uInt size) {
struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + ALIGN_UP(size, 8));
struct dfltcc_qaf_param *param = (struct dfltcc_qaf_param *)&dfltcc_state->param;

/* Initialize available functions */
if (is_dfltcc_enabled()) {
dfltcc(DFLTCC_QAF, param, NULL, NULL, NULL, NULL, NULL);
memmove(&dfltcc_state->af, param, sizeof(dfltcc_state->af));
} else
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));

/* Initialize parameter block */
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
dfltcc_state->param.nt = 1;

/* Initialize tuning parameters */
dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
dfltcc_state->param.ribm = DFLTCC_RIBM;
}

void Z_INTERNAL *PREFIX(dfltcc_alloc_state)(PREFIX3(streamp) strm, uInt items, uInt size) {
return ZALLOC(strm, ALIGN_UP(items * size, 8) + sizeof(struct dfltcc_state), sizeof(unsigned char));
}

void Z_INTERNAL PREFIX(dfltcc_copy_state)(void *dst, const void *src, uInt size) {
memcpy(dst, src, ALIGN_UP(size, 8) + sizeof(struct dfltcc_state));
}

static const int PAGE_ALIGN = 0x1000;

Expand Down
Loading

0 comments on commit 4cab5b3

Please sign in to comment.