From e7c94efe457acef302a4e18af1e56b4b3ed11757 Mon Sep 17 00:00:00 2001 From: Lin Runze Date: Fri, 29 Nov 2024 00:37:46 +0800 Subject: [PATCH] Add LoongArch SX SIMD extenstion implementation --- .github/workflows/ci.yml | 6 +++ cli/xsum_arch.h | 2 + xxhash.h | 79 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0de6123..1d834671 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -386,6 +386,7 @@ jobs: { name: 'RISC-V', xcc_pkg: gcc-riscv64-linux-gnu, xcc: riscv64-linux-gnu-gcc, xemu_pkg: qemu-system-riscv64,xemu: qemu-riscv64-static, os: ubuntu-latest, }, # SPARC64 qemu emulation seems broken on Ubuntu-22 { name: 'SPARC', xcc_pkg: gcc-sparc64-linux-gnu, xcc: sparc64-linux-gnu-gcc, xemu_pkg: qemu-system-sparc, xemu: qemu-sparc64-static, os: ubuntu-20.04, }, + { name: 'LoongArch', xcc_pkg: gcc-14-loongarch64-linux-gnu, xcc: loongarch64-linux-gnu-gcc-14, xemu_pkg: qemu-system-loongarch64, xemu: qemu-loongarch64-static, os: ubuntu-24.04, }, { name: 'ARM, gcc-10', xcc_pkg: gcc-10-arm-linux-gnueabi, xcc: arm-linux-gnueabi-gcc-10, xemu_pkg: qemu-system-arm, xemu: qemu-arm-static, os: ubuntu-20.04, }, { name: 'AARCH64, gcc-10', xcc_pkg: gcc-10-aarch64-linux-gnu, xcc: aarch64-linux-gnu-gcc-10, xemu_pkg: qemu-system-arm, xemu: qemu-aarch64-static, os: ubuntu-20.04, }, @@ -474,6 +475,11 @@ jobs: run: | make clean; LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make check + - name: LoongArch (XXH_VECTOR=[ scalar, LSX ]) + if: ${{ startsWith(matrix.name, 'LoongArch') }} + run: | + CPPFLAGS="-DXXH_VECTOR=XXH_SCALAR" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check + CPPFLAGS=-DXXH_VECTOR=XXH_LSX CFLAGS="-O3 -march=la464 -mlsx" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check # macOS diff --git a/cli/xsum_arch.h b/cli/xsum_arch.h index f025f345..b3d4f643 100644 --- a/cli/xsum_arch.h +++ b/cli/xsum_arch.h @@ -161,6 +161,8 @@ # else # define XSUM_ARCH "wasm/asmjs" # endif +#elif defined(__loongarch_lp64) +# define XSUM_ARCH "loongarch" #else # define XSUM_ARCH "unknown" #endif diff --git a/xxhash.h b/xxhash.h index c2344b8d..80187b36 100644 --- a/xxhash.h +++ b/xxhash.h @@ -3749,6 +3749,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can # include # elif defined(__SSE2__) # include +# elif defined(__loongarch_sx) +# include # endif #endif @@ -3871,6 +3873,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ { */ XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ + XXH_LSX = 7, /*!< LSX (128-bit SIMD) for LoongArch64 */ }; /*! * @ingroup tuning @@ -3893,6 +3896,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_NEON 4 # define XXH_VSX 5 # define XXH_SVE 6 +# define XXH_LSX 7 #endif #ifndef XXH_VECTOR /* can be defined on command line */ @@ -3917,6 +3921,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ { || (defined(__s390x__) && defined(__VEC__)) \ && defined(__GNUC__) /* TODO: IBM XL */ # define XXH_VECTOR XXH_VSX +# elif defined(__loongarch_sx) +# define XXH_VECTOR XXH_LSX # else # define XXH_VECTOR XXH_SCALAR # endif @@ -3954,6 +3960,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_ACC_ALIGN 64 # elif XXH_VECTOR == XXH_SVE /* sve */ # define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_LSX /* lsx */ +# define XXH_ACC_ALIGN 64 # endif #endif @@ -5591,6 +5599,71 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, #endif +#if (XXH_VECTOR == XXH_LSX) +#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i *) acc; + const __m128i* const xinput = (const __m128i *) input; + const __m128i* const xsecret = (const __m128i *) secret; + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = __lsx_vld(xinput + i, 0); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2)); + __m128i const sum = __lsx_vadd_d(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = __lsx_vadd_d(product, sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i*) acc; + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = __lsx_vreplgr2vr_w((int)XXH_PRIME32_1); + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = __lsx_vsrli_d(acc_vec, 47); + __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = __lsx_vsrli_d(data_key, 32); + __m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32); + __m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32); + xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32)); + } + } +} + +#endif + /* scalar variants - universal */ #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) @@ -5821,6 +5894,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#elif (XXH_VECTOR == XXH_LSX) +#define XXH3_accumulate_512 XXH3_accumulate_512_lsx +#define XXH3_accumulate XXH3_accumulate_lsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #else /* scalar */ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar