Skip to content

Commit

Permalink
Add LoongArch SX SIMD extenstion implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
lrzlin committed Nov 28, 2024
1 parent dd11140 commit f997b05
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cli/xsum_arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@
# else
# define XSUM_ARCH "wasm/asmjs"
# endif
#elif defined(__loongarch_lp64)
# define XSUM_ARCH "loongarch"
#else
# define XSUM_ARCH "unknown"
#endif
Expand Down
76 changes: 76 additions & 0 deletions xxhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -3749,6 +3749,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# include <immintrin.h>
# elif defined(__SSE2__)
# include <emmintrin.h>
# elif defined(__loongarch_sx)
# include <lsxintrin.h>
# endif
#endif

Expand Down Expand Up @@ -3871,6 +3873,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
*/
XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
XXH_LSX = 7, /*!< LSX (128-bit SIMD) for LoongArch64 */
};
/*!
* @ingroup tuning
Expand All @@ -3893,6 +3896,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_NEON 4
# define XXH_VSX 5
# define XXH_SVE 6
# define XXH_LSX 7
#endif

#ifndef XXH_VECTOR /* can be defined on command line */
Expand All @@ -3917,6 +3921,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
|| (defined(__s390x__) && defined(__VEC__)) \
&& defined(__GNUC__) /* TODO: IBM XL */
# define XXH_VECTOR XXH_VSX
# elif defined(__loongarch_sx)
# define XXH_VECTOR XXH_LSX
# else
# define XXH_VECTOR XXH_SCALAR
# endif
Expand Down Expand Up @@ -3954,6 +3960,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_SVE /* sve */
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_LSX /* lsx */
# define XXH_ACC_ALIGN 64
# endif
#endif

Expand Down Expand Up @@ -5591,6 +5599,68 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,

#endif

#if (XXH_VECTOR == XXH_LSX)
#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

XXH_FORCE_INLINE void
XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 15) == 0);
__m128i* const xacc = (__m128i *) acc;
const __m128i* const xinput = (const __m128i *) input;
const __m128i* const xsecret = (const __m128i *) secret;

for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
/* data_vec = xinput[i]; */
__m128i const data_vec = __lsx_vld(xinput + i, 0);
/* key_vec = xsecret[i]; */
__m128i const key_vec = __lsx_vld(xsecret + i, 0);
/* data_key = data_vec ^ key_vec; */
__m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
/* data_key_lo = data_key >> 32; */
__m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
// __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
__m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo);
/* xacc[i] += swap(data_vec); */
__m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2));
__m128i const sum = __lsx_vadd_d(xacc[i], data_swap);
/* xacc[i] += product; */
xacc[i] = __lsx_vadd_d(product, sum);
}
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx)

XXH_FORCE_INLINE void
XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 15) == 0);
__m128i* const xacc = (__m128i*) acc;
const __m128i* const xsecret = (const __m128i *) secret;
const int prime32_1 = XXH_PRIME32_1;
const __m128i prime32 = __lsx_vreplgr2vr_w(prime32_1);

for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
/* xacc[i] ^= (xacc[i] >> 47) */
__m128i const acc_vec = xacc[i];
__m128i const shifted = __lsx_vsrli_d(acc_vec, 47);
__m128i const data_vec = __lsx_vxor_v(acc_vec, shifted);
/* xacc[i] ^= xsecret[i]; */
__m128i const key_vec = __lsx_vld(xsecret + i, 0);
__m128i const data_key = __lsx_vxor_v(data_vec, key_vec);

/* xacc[i] *= XXH_PRIME32_1; */
__m128i const data_key_hi = __lsx_vsrli_d(data_key, 32);
__m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32);
__m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32);
xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32));
}
}

#endif

/* scalar variants - universal */

#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
Expand Down Expand Up @@ -5821,6 +5891,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_LSX)
#define XXH3_accumulate_512 XXH3_accumulate_512_lsx
#define XXH3_accumulate XXH3_accumulate_lsx
#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#else /* scalar */

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
Expand Down

0 comments on commit f997b05

Please sign in to comment.