From b65566971617767293031b7fb457257b65624a5a Mon Sep 17 00:00:00 2001
From: wjr <1966336874@qq.com>
Date: Mon, 5 Aug 2024 15:31:27 +0800
Subject: [PATCH] update

---
 godbolt/wjr.hpp                  | 6502 ++++++++++++++++--------------
 include/wjr/format/fastfloat.hpp |  245 +-
 src/wjr/format/fastfloat.cpp     |   25 +-
 3 files changed, 3648 insertions(+), 3124 deletions(-)

diff --git a/godbolt/wjr.hpp b/godbolt/wjr.hpp
index 4568adfb..1a07fca5 100644
--- a/godbolt/wjr.hpp
+++ b/godbolt/wjr.hpp
@@ -2921,11 +2921,17 @@ WJR_CONST constexpr bool in_range(U value) noexcept {
     }
 }
 
+template <typename T, typename U, typename Enable = void>
+struct __is_value_preserving_impl : std::false_type {};
+
 template <typename T, typename U>
-struct is_value_preserving
+struct __is_value_preserving_impl<T, U, std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
     : std::bool_constant<in_range<T>(std::numeric_limits<U>::min()) &&
                          in_range<T>(std::numeric_limits<U>::max())> {};
 
+template <typename T, typename U>
+struct is_value_preserving : __is_value_preserving_impl<T, U> {};
+
 template <typename T>
 struct is_value_preserving<T, T> : std::true_type {};
 
@@ -3567,9 +3573,20 @@ WJR_NODISCARD auto allocate_at_least(Allocator &alloc, SizeType count) {
 
 #ifndef WJR_X86_SIMD_SIMD_HPP__
 #define WJR_X86_SIMD_SIMD_HPP__
 
+#ifndef WJR_X86_SIMD_AVX_HPP__
+#define WJR_X86_SIMD_AVX_HPP__
+
+#ifndef WJR_X86_SIMD_SSE_HPP__
+#define WJR_X86_SIMD_SSE_HPP__
+
 #ifndef WJR_X86_SIMD_SIMD_CAST_HPP__
 #define WJR_X86_SIMD_SIMD_CAST_HPP__
 
+#include
+
+#ifndef WJR_SIMD_DETAIL_HPP__
+#define WJR_SIMD_DETAIL_HPP__
+
 #ifndef WJR_SIMD_SIMD_CAST_HPP__
 #define WJR_SIMD_SIMD_CAST_HPP__
 
 namespace wjr {
 
 template <typename From, typename To>
 struct simd_cast_fn;
 
 template <typename From, typename To>
 inline constexpr simd_cast_fn<From, To> simd_cast{};
 
 } // namespace wjr
 
 #endif // WJR_SIMD_SIMD_CAST_HPP__
-#ifndef WJR_X86_SIMD_INTRIN_HPP__
-#define WJR_X86_SIMD_INTRIN_HPP__
+#ifndef WJR_SIMD_SIMD_MASK_HPP__
+#define WJR_SIMD_SIMD_MASK_HPP__
+
+#ifndef WJR_ASSERT_HPP__
+#define WJR_ASSERT_HPP__
+
+/**
+ * @file assert.hpp
+ * @author wjr
+ * @brief Assertion utilities
+ *
+ * @details WJR_DEBUG_LEVEL : 0 ~ 3 \n
+ * 0 : Release \n
+ * 1 : Beta \n
+ * 2 : Runtime detection \n
+ * 3 : Maximum runtime detection, for debugging \n
+ * If WJR_DEBUG_LEVEL is not defined: \n
+ * if NDEBUG is defined, WJR_DEBUG_LEVEL defaults to 0; \n
+ * otherwise, WJR_DEBUG_LEVEL defaults to 1. \n
+ * WJR_ASSERT_L(level, expr) : Specifies the level of the assertion; \n
+ * the assertion is executed only if WJR_DEBUG_LEVEL is greater than or \n
+ * equal to that level. \n
+ * WJR_ASSERT(expr) : Equivalent to WJR_ASSERT_L(1, expr) \n
+ * WJR_ASSERT_L0(expr) : Always executes the assertion \n
+ *
+ * @version 0.1
+ * @date 2024-06-01
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+
+#include <iostream> // Already included
+
+#ifndef WJR_DEBUG_LEVEL
+#if defined(NDEBUG)
+#define WJR_DEBUG_LEVEL 0
+#else
+#define WJR_DEBUG_LEVEL 1
+#endif
+#endif
+
+#if WJR_DEBUG_LEVEL < 0 || WJR_DEBUG_LEVEL > 3
+#error "WJR_DEBUG_LEVEL must be 0 ~ 3"
+#endif
+
+namespace wjr {
+
+#define WJR_DEBUG_IF(level, expr0, expr1)                                                \
+    WJR_PP_BOOL_IF(WJR_PP_GT(WJR_DEBUG_LEVEL, level), expr0, expr1)
+
+WJR_NORETURN extern void __assert_failed(const char *expr, const char *file,
+                                         const char *func, int line) noexcept;
+
+// LCOV_EXCL_START
+
+/// @private
+template <typename... Args>
+WJR_NOINLINE void __assert_handler(const char *expr, const char *file, const char *func,
+                                   int line, Args &&...args) noexcept {
+    std::cerr << "Additional information: ";
+    (void)(std::cerr << ... << std::forward<Args>(args));
+    std::cerr << '\n';
+    __assert_failed(expr, file, func, line);
+}
+
+/// @private
+inline void __assert_handler(const char *expr, const char *file, const char *func,
+                             int line) noexcept {
+    __assert_failed(expr, file, func, line);
+}
+
+// LCOV_EXCL_STOP
+
+#define WJR_ASSERT_CHECK_I(expr, ...)                                                    \
+    do {                                                                                 \
+        if (WJR_UNLIKELY(!(expr))) {                                                     \
+            ::wjr::__assert_handler(#expr, WJR_FILE, WJR_CURRENT_FUNCTION, WJR_LINE,     \
+                                    ##__VA_ARGS__);                                      \
+        }                                                                                \
+    } while (0)
+
+// do nothing
+#define WJR_ASSERT_UNCHECK_I(expr, ...)                                                  \
+    do {                                                                                 \
+    } while (0)
+
+// level = [0, 2]
+// The higher the level, the less likely the assertion is to be checked.
+// Runtime detect : 1
+// Maximize detect : 2
+#define WJR_ASSERT_L(level, ...)                                                         \
+    WJR_DEBUG_IF(level, WJR_ASSERT_CHECK_I, WJR_ASSERT_UNCHECK_I)                        \
+    (__VA_ARGS__)
+
+// WJR_ASSERT_L0 is always checked; WJR_ASSERT defaults to level 1.
+#define WJR_ASSERT_L0(...) WJR_ASSERT_CHECK_I(__VA_ARGS__)
+#define WJR_ASSERT_L1(...) WJR_ASSERT_L(1, __VA_ARGS__)
+#define WJR_ASSERT_L2(...) WJR_ASSERT_L(2, __VA_ARGS__)
+#define WJR_ASSERT_L3(...) WJR_ASSERT_L(3, __VA_ARGS__)
+#define WJR_ASSERT(...) WJR_ASSERT_L1(__VA_ARGS__)
+
+#define WJR_ASSERT_ASSUME_L(level, ...)                                                  \
+    WJR_ASSERT_L(level, __VA_ARGS__);                                                    \
+    __WJR_ASSERT_ASSUME_L_ASSUME(__VA_ARGS__)
+#define __WJR_ASSERT_ASSUME_L_ASSUME(expr, ...) WJR_ASSUME(expr)
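+
+// A minimal usage sketch (illustrative only; `n` and `divisor` are
+// hypothetical variables): WJR_ASSERT expands to a level-1 check, while
+// WJR_ASSERT_L2 is a level-2 check that fires only at higher
+// WJR_DEBUG_LEVEL settings. Extra arguments are streamed to stderr by
+// __assert_handler before the process aborts.
+//
+//     WJR_ASSERT(divisor != 0, "divisor = ", divisor);
+//     WJR_ASSERT_L2(n <= 64);
+//     WJR_ASSERT_ASSUME(n != 0); // also informs the optimizer via WJR_ASSUME
+
+#define WJR_ASSERT_ASSUME_L0(...) \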
WJR_ASSERT_ASSUME_L(0, __VA_ARGS__) +#define WJR_ASSERT_ASSUME_L1(...) WJR_ASSERT_ASSUME_L(1, __VA_ARGS__) +#define WJR_ASSERT_ASSUME_L2(...) WJR_ASSERT_ASSUME_L(2, __VA_ARGS__) +#define WJR_ASSERT_ASSUME_L3(...) WJR_ASSERT_ASSUME_L(3, __VA_ARGS__) +#define WJR_ASSERT_ASSUME(...) WJR_ASSERT_ASSUME_L1(__VA_ARGS__) -template <> -struct simd_cast_fn<__m128d_t, __m128_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128 operator()(__m128d v) const { - return _mm_castpd_ps(v); - } -}; +} // namespace wjr -template <> -struct simd_cast_fn<__m128d_t, __m128i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128d v) const { - return _mm_castpd_si128(v); - } -}; +#endif // WJR_ASSERT_HPP__ +#ifndef WJR_MATH_CLZ_HPP__ +#define WJR_MATH_CLZ_HPP__ -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int8_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +// Already included +#ifndef WJR_MATH_POPCOUNT_HPP__ +#define WJR_MATH_POPCOUNT_HPP__ -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +#ifndef WJR_MATH_DETAIL_HPP__ +#define WJR_MATH_DETAIL_HPP__ -template <> -struct simd_cast_fn<__m128i_t, int8_t> { - WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +// Already included -template <> -struct simd_cast_fn<__m128i_t, uint8_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +namespace wjr { -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int16_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +namespace math_detail { -template <> -struct simd_cast_fn<__m128i_t, int16_t> { - WJR_CONST WJR_INTRINSIC_INLINE int16_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +template +class de_bruijn { +public: + constexpr static uint8_t digits = std::numeric_limits::digits; + constexpr static uint8_t mv = digits == 32 ? 27 : 58; + constexpr de_bruijn() noexcept : lookup(), lookupr() { initialize(); } -template <> -struct simd_cast_fn<__m128i_t, uint16_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint16_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; + constexpr int get(T idx) const noexcept { return lookup[(idx * seed) >> mv]; } + constexpr int getr(T idx) const noexcept { return lookupr[(idx * seed) >> mv]; } -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int32_t v) const { - return _mm_cvtsi32_si128(v); +private: + constexpr void initialize() noexcept { + for (uint8_t i = 0; i < digits; ++i) { + const auto idx = (seed << i) >> mv; + lookup[idx] = i; + lookupr[idx] = i == 0 ? 
0 : digits - i; + } } -}; -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { - return _mm_cvtsi32_si128(v); - } + uint8_t lookup[digits]; + uint8_t lookupr[digits]; }; -template <> -struct simd_cast_fn<__m128i_t, int32_t> { - WJR_CONST WJR_INTRINSIC_INLINE int32_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +inline constexpr de_bruijn de_bruijn32 = {}; +inline constexpr de_bruijn de_bruijn64 = {}; -template <> -struct simd_cast_fn<__m128i_t, uint32_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint32_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +} // namespace math_detail -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int64_t v) const { - return _mm_cvtsi64_si128(v); - } -}; +#endif -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { - return _mm_cvtsi64_si128(v); - } -}; +/** + * @brief + * + * @note `n & -n` is the lowest bit of n. + */ +template )> +WJR_CONST constexpr T lowbit(T n) noexcept { + return n & -n; +} -template <> -struct simd_cast_fn<__m128i_t, int64_t> { - WJR_CONST WJR_INTRINSIC_INLINE int64_t operator()(__m128i v) const { - return _mm_cvtsi128_si64(v); - } -}; +template )> +WJR_CONST constexpr T clear_lowbit(T n) noexcept { + return n & (n - 1); +} -template <> -struct simd_cast_fn<__m128i_t, uint64_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint64_t operator()(__m128i v) const { - return _mm_cvtsi128_si64(v); - } -}; +// preview : -#endif // SSE2 +template )> +WJR_CONST constexpr bool is_zero_or_single_bit(T n) noexcept { + return (n & (n - 1)) == 0; +} -#if WJR_HAS_SIMD(AVX) +template )> +WJR_CONST constexpr bool __has_high_bit(T n) noexcept { + return n >> (std::numeric_limits::digits - 1); +} -struct __m256_t { - using type = __m256; -}; +template )> +WJR_CONST constexpr T __ceil_div(T n, type_identity_t div) noexcept { + return (n + div - 1) / div; +} -struct __m256i_t { - using type = __m256i; -}; +template )> +WJR_CONST constexpr T __align_down(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return n & (-alignment); +} -struct __m256d_t { - using type = __m256d; -}; +template )> +WJR_CONST constexpr T __align_down_offset(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return n & (alignment - 1); +} -template <> -struct simd_cast_fn<__m256_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256 v) const { - return _mm256_castps_si256(v); - } -}; +template )> +WJR_CONST constexpr T __align_up(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return (n + alignment - 1) & (-alignment); +} -template <> -struct simd_cast_fn<__m256_t, __m256d_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256 v) const { - return _mm256_castps_pd(v); - } -}; +template )> +WJR_CONST constexpr T __align_up_offset(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return (-n) & (alignment - 1); +} -template <> -struct simd_cast_fn<__m256i_t, __m256_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256i v) const { - return _mm256_castsi256_ps(v); - } -}; +template )> +WJR_CONST constexpr std::make_signed_t __fasts_from_unsigned(T x) noexcept { + const std::make_signed_t ret = x; + WJR_ASSERT_ASSUME_L2(ret >= 0, "overflow"); + return ret; +} -template <> 
-struct simd_cast_fn<__m256i_t, __m256d_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256i v) const { - return _mm256_castsi256_pd(v); - } -}; +template , + WJR_REQUIRES(is_nonbool_signed_integral_v)> +WJR_CONST constexpr U __fasts_abs(T x) noexcept { + return static_cast(x < 0 ? -x : x); +} -template <> -struct simd_cast_fn<__m256d_t, __m256_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256d v) const { - return _mm256_castpd_ps(v); - } -}; +template )> +WJR_CONST constexpr T __fasts_negate(T x) noexcept { + return -x; +} -template <> -struct simd_cast_fn<__m256d_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256d v) const { - return _mm256_castpd_si256(v); - } -}; +template , + WJR_REQUIRES(is_nonbool_signed_integral_v)> +WJR_CONST constexpr T __fasts_conditional_negate(bool condition, T x) noexcept { + return condition ? -x : x; +} -template <> -struct simd_cast_fn<__m128i_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { - return _mm256_castsi128_si256(v); - } -}; +template , + WJR_REQUIRES(is_nonbool_signed_integral_v)> +WJR_CONST constexpr T __fasts_negate_with(T condition, T x) noexcept { + return __fasts_conditional_negate(condition < 0, x); +} -template <> -struct simd_cast_fn<__m256i_t, __m128i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m256i v) const { - return _mm256_castsi256_si128(v); - } -}; +template )> +WJR_CONST constexpr T __fasts_increment(T x) noexcept { + WJR_ASSERT_L2(x != std::numeric_limits::min() && + x != std::numeric_limits::max(), + "overflow"); -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int8_t v) const { - return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); - } -}; + return x < 0 ? x - 1 : x + 1; +} -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { - return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); - } -}; +template )> +WJR_CONST constexpr T __fasts_decrement(T x) noexcept { + WJR_ASSERT_L2(x != 0 && x + 1 != T(0), "overflow"); -template <> -struct simd_cast_fn<__m256i_t, int8_t> { - WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m256i v) const { - return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); - } -}; + return x < 0 ? x + 1 : x - 1; +} -template <> -struct simd_cast_fn<__m256i_t, uint8_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m256i v) const { - return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); - } -}; +template )> +WJR_CONST constexpr T __fasts_add(T x, std::make_unsigned_t y) noexcept { + return x < 0 ? x - y : x + y; +} -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int16_t v) const { - return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); - } -}; +template )> +WJR_CONST constexpr T __fasts_sub(T x, std::make_unsigned_t y) noexcept { + return x < 0 ? 
x + y : x - y;
}

} // namespace wjr

#endif // WJR_MATH_DETAIL_HPP__

namespace wjr {

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR int fallback_popcount(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;
    if constexpr (nd < 32) {
        return fallback_popcount(static_cast<uint32_t>(x));
    } else {
        if constexpr (nd == 32) {
            x -= (x >> 1) & 0x5555'5555;
            x = (x & 0x3333'3333) + ((x >> 2) & 0x3333'3333);
            x = (x + (x >> 4)) & 0x0f0f'0f0f;
            return (x * 0x0101'0101) >> 24;
        } else {
            x -= (x >> 1) & 0x5555'5555'5555'5555;
            x = (x & 0x3333'3333'3333'3333) + ((x >> 2) & 0x3333'3333'3333'3333);
            x = (x + (x >> 4)) & 0x0f0f'0f0f'0f0f'0f0f;
            return (x * 0x0101'0101'0101'0101) >> 56;
        }
    }
}

#if WJR_HAS_BUILTIN(POPCOUNT)

template <typename T>
WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;
    if constexpr (nd < 32) {
        return builtin_popcount(static_cast<uint32_t>(x));
    } else {
        if constexpr (nd <= std::numeric_limits<unsigned int>::digits) {
            return __builtin_popcount(x);
        } else if constexpr (nd <= std::numeric_limits<unsigned long>::digits) {
            return __builtin_popcountl(x);
        } else if constexpr (nd <= std::numeric_limits<unsigned long long>::digits) {
            return __builtin_popcountll(x);
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
    }
}

#endif // WJR_HAS_BUILTIN(POPCOUNT)

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount_impl(T x) noexcept {
    if (WJR_BUILTIN_CONSTANT_P_TRUE(is_zero_or_single_bit(x))) {
        return x != 0;
    }

#if WJR_HAS_BUILTIN(POPCOUNT)
    if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
        return fallback_popcount(x);
    }

    return builtin_popcount(x);
#else
    return fallback_popcount(x);
#endif
}

template <typename T, WJR_REQUIRES(is_nonbool_unsigned_integral_v<T>)>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount(T x) noexcept {
    const int ret = popcount_impl(x);
    WJR_ASSUME(0 <= ret && ret <= std::numeric_limits<T>::digits);
    return ret;
}

} // namespace wjr

#endif // WJR_MATH_POPCOUNT_HPP__
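
// Worked example of the SWAR fallback above (illustrative, not part of the
// original source): for x = 0xF0F0'F0F0 the first step leaves 2-bit partial
// sums, the next two steps reduce them to a per-byte bit count (0x04040404),
// and the multiply by 0x0101'0101 accumulates all four bytes into the top
// byte, so (x * 0x0101'0101) >> 24 == 16 == popcount(0xF0F0'F0F0).

#if WJR_HAS_BUILTIN(__builtin_clz)
#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF
#elif defined(WJR_MSVC) && defined(WJR_X86)
#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF_VAR(2)
#endif

#if WJR_HAS_BUILTIN(CLZ) == 2
#ifndef WJR_X86_SIMD_INTRIN_HPP__
#define WJR_X86_SIMD_INTRIN_HPP__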

// Already included

#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__)
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#endif

#endif // WJR_X86_SIMD_INTRIN_HPP__
#endif

namespace wjr {

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_clz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;

    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);

    if constexpr (nd >= 16) {
        x |= (x >> 8);
    }

    if constexpr (nd >= 32) {
        x |= (x >> 16);
    }

    if constexpr (nd >= 64) {
        x |= (x >> 32);
    }

    return fallback_popcount(~x);
}

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_clz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;

#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT))
    if constexpr (nd >= 32) {
#endif
        x |= (x >> 1);
        x |= (x >> 2);
        x |= (x >> 4);

        if constexpr (nd >= 16) {
            x |= (x >> 8);
        }

        if constexpr (nd >= 32) {
            x |= (x >> 16);
        }

        if constexpr (nd >= 64) {
            x |= (x >> 32);
        }
#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT))
    }
#endif

#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)
    return popcount(~x);
#else
    if constexpr (nd < 32) {
        return fallback_clz(static_cast<uint32_t>(x)) - (32 - nd);
    } else {
        ++x;

        if constexpr (nd <= 32) {
            return math_detail::de_bruijn32.getr(x);
        } else if constexpr (nd <= 64) {
            return math_detail::de_bruijn64.getr(x);
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
    }
#endif
}

#if WJR_HAS_BUILTIN(CLZ)

template <typename T>
WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;
    if constexpr (nd < 32) {
        return builtin_clz(static_cast<uint32_t>(x)) - (32 - nd);
    } else {
#if WJR_HAS_BUILTIN(CLZ) == 1
        if constexpr (nd <= std::numeric_limits<unsigned int>::digits) {
            constexpr auto delta = std::numeric_limits<unsigned int>::digits - nd;
            return __builtin_clz(static_cast<unsigned int>(x)) - delta;
        } else if constexpr (nd <= std::numeric_limits<unsigned long>::digits) {
            constexpr auto delta = std::numeric_limits<unsigned long>::digits - nd;
            return __builtin_clzl(static_cast<unsigned long>(x)) - delta;
        } else if constexpr (nd <= std::numeric_limits<unsigned long long>::digits) {
            constexpr auto delta = std::numeric_limits<unsigned long long>::digits - nd;
            return __builtin_clzll(static_cast<unsigned long long>(x)) - delta;
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
#else
        if constexpr (nd == 32) {
            unsigned long result;
            (void)_BitScanReverse(&result, x);
            return 31 - result;
        } else {
            unsigned long result;
            (void)_BitScanReverse64(&result, x);
            return 63 - result;
        }
#endif
    }
}

#endif

/**
 * @brief Fast count leading zeros
 *
 * @tparam T Must be an unsigned integral type
 */
template <typename T, WJR_REQUIRES(is_nonbool_unsigned_integral_v<T>)>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int clz(T x) noexcept {
#if WJR_HAS_BUILTIN(CLZ)
    if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
        return fallback_clz(x);
    }

    return builtin_clz(x);
#else
    return fallback_clz(x);
#endif
}

} // namespace wjr

#endif // WJR_MATH_CLZ_HPP__
#ifndef WJR_MATH_CTZ_HPP__
#define WJR_MATH_CTZ_HPP__

// Already included
// Already included

#if WJR_HAS_BUILTIN(__builtin_ctz)
#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF
#elif defined(WJR_MSVC) && defined(WJR_X86)
#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF_VAR(2)
#endif

#if WJR_HAS_BUILTIN(CTZ) == 2
// Already included
#endif

namespace wjr {

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_ctz(T x) noexcept {
    return fallback_popcount(lowbit(x) - 1);
}

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_ctz(T x) noexcept {
#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)
    return popcount(lowbit(x) - 1);
#else
    constexpr auto nd = std::numeric_limits<T>::digits;

    if constexpr (nd < 32) {
        return fallback_ctz(static_cast<uint32_t>(x));
    } else {
        x = lowbit(x);

        if constexpr (nd <= 32) {
            return math_detail::de_bruijn32.get(x);
        } else if constexpr (nd <= 64) {
            return math_detail::de_bruijn64.get(x);
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
    }
#endif //
}

#if WJR_HAS_BUILTIN(CTZ)

template <typename T>
WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;

    if constexpr (nd < 32) {
        return builtin_ctz(static_cast<uint32_t>(x));
    } else {
#if WJR_HAS_BUILTIN(CTZ) == 1
        if constexpr (nd <= std::numeric_limits<unsigned int>::digits) {
            return __builtin_ctz(static_cast<unsigned int>(x));
        } else if constexpr (nd <= std::numeric_limits<unsigned long>::digits) {
            return __builtin_ctzl(static_cast<unsigned long>(x));
        } else if constexpr (nd <= std::numeric_limits<unsigned long long>::digits) {
            return __builtin_ctzll(static_cast<unsigned long long>(x));
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
#else
        if constexpr (nd == 32) {
            unsigned long result;
            (void)_BitScanForward(&result, x);
            return result;
        } else {
            unsigned long result;
            (void)_BitScanForward64(&result, x);
            return result;
        }
#endif
    }
}

#endif

/**
 * @brief Fast count trailing zeros
 *
 * @details Very fast even on non-optimized platforms by using a De Bruijn sequence. \n
 * Tries __builtin_ctz if available, otherwise falls back to a portable
 * implementation. \n
 * In fallback_ctz, popcount and lowbit are used when POPCOUNT and POPCNT are
 * available, since popcount is then fast. \n
 * Otherwise a De Bruijn sequence is used, just a bit slower than popcount + lowbit.
 *
 * @tparam T Must be an unsigned integral type
 */
template <typename T, WJR_REQUIRES(is_nonbool_unsigned_integral_v<T>)>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int ctz(T x) noexcept {
#if WJR_HAS_BUILTIN(CTZ)
    if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
        return fallback_ctz(x);
    }

    return builtin_ctz(x);
#else
    return fallback_ctz(x);
#endif
}
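
// Usage sketch (illustrative, not part of the original source): ctz returns
// the index of the lowest set bit, and the argument must be nonzero, matching
// the precondition of the underlying builtins:
//
//     static_assert(wjr::ctz(uint32_t(1)) == 0);
//     static_assert(wjr::ctz(uint64_t(0x8)) == 3);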

} // namespace wjr

#endif // WJR_MATH_CTZ_HPP__
// Already included

namespace wjr::simd_detail {

template <size_t BitWidth, size_t Size>
class basic_simd_mask {
    using mask_type = uint_t<BitWidth>;
    constexpr static size_t __mask_bits = BitWidth / Size;
    constexpr static mask_type __half_mask =
        static_cast<uint_t<BitWidth / 2>>(in_place_max);
    constexpr static mask_type __full_mask = in_place_max;

public:
    WJR_ENABLE_DEFAULT_SPECIAL_MEMBERS(basic_simd_mask);

    constexpr basic_simd_mask(mask_type mask) noexcept : m_mask(mask) {}

    WJR_PURE WJR_CONSTEXPR20 int clz() const noexcept {
        WJR_ASSERT_ASSUME(m_mask != 0);

        if constexpr (Size == 2) {
            constexpr auto high_mask = __half_mask << (BitWidth / 2);

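            // With only two elements, the count of leading zero elements is 0
            // when any bit of the high half is set, and 1 otherwise.
            return (m_mask & high_mask) ? 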
0 : 1; + } else { + return ::wjr::clz(m_mask) / __mask_bits; + } + } - template - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b); + WJR_PURE WJR_CONSTEXPR20 int ctz() const noexcept { + WJR_ASSERT_ASSUME(m_mask != 0); - WJR_INTRINSIC_INLINE static __m128i alignr_epi16(__m128i a, __m128i b, int c); - WJR_INTRINSIC_INLINE static __m128i alignr_epi32(__m128i a, __m128i b, int c); - WJR_INTRINSIC_INLINE static __m128i alignr_epi64(__m128i a, __m128i b, int c); + if constexpr (Size == 2) { + constexpr auto low_mask = __half_mask; - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int16_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int32_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int64_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint16_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint32_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint64_t); + return (m_mask & low_mask) ? 0 : 1; + } else { + return ::wjr::ctz(m_mask) / __mask_bits; + } + } - WJR_INTRINSIC_INLINE static __m128i And(__m128i a, __m128i b); + WJR_PURE constexpr bool all() const noexcept { return m_mask == __full_mask; } - WJR_INTRINSIC_INLINE static __m128i AndNot(__m128i a, __m128i b); +private: + mask_type m_mask; +}; - WJR_INTRINSIC_INLINE static __m128i avg_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i avg_epu16(__m128i a, __m128i b); +} // namespace wjr::simd_detail - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint16_t); +#endif // WJR_SIMD_SIMD_MASK_HPP__ - // notice that mask must be 0 or 255(every byte) - WJR_INTRINSIC_INLINE static __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask); - WJR_INTRINSIC_INLINE static __m128i blendv_epi16(__m128i a, __m128i b, __m128i mask); - WJR_INTRINSIC_INLINE static __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask); +namespace wjr { - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - int8_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - int16_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - int32_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - uint8_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - uint16_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - uint32_t); +namespace simd_abi { - template - WJR_INTRINSIC_INLINE static __m128i bslli(__m128i val); +template +struct fixed_size {}; - template - WJR_INTRINSIC_INLINE static __m128i bsrli(__m128i val); +} // namespace simd_abi - WJR_INTRINSIC_INLINE static __m128i cmpeq_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpeq_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpeq_epi32(__m128i a, __m128i b); +struct element_aligned_t {}; +inline constexpr element_aligned_t element_aligned{}; - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int32_t); - 
WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint32_t); +struct vector_aligned_t {}; +inline constexpr vector_aligned_t vector_aligned{}; - WJR_INTRINSIC_INLINE static __m128i cmpge_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epi32(__m128i a, __m128i b); +template +class simd; - WJR_INTRINSIC_INLINE static __m128i cmpge_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epu32(__m128i a, __m128i b); +template +using fixed_size_simd = simd>; - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint32_t); +} // namespace wjr - WJR_INTRINSIC_INLINE static __m128i cmpgt_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epi32(__m128i a, __m128i b); +#endif // WJR_SIMD_DETAIL_HPP__ +// Already included - WJR_INTRINSIC_INLINE static __m128i cmpgt_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epu32(__m128i a, __m128i b); +namespace wjr { - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint32_t); +// simd type can't be directly used on template +template +struct simd_wrapper { + using type = T; +}; - WJR_INTRINSIC_INLINE static __m128i cmple_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epi32(__m128i a, __m128i b); +template +using simd_wrapper_t = typename simd_wrapper::type; - WJR_INTRINSIC_INLINE static __m128i cmple_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epu32(__m128i a, __m128i b); +#if WJR_HAS_SIMD(SSE) - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint32_t); +struct __m128_t { + using type = __m128; +}; - WJR_INTRINSIC_INLINE static __m128i cmplt_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmplt_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE 
static __m128i cmplt_epi32(__m128i a, __m128i b); +#endif // SSE - WJR_INTRINSIC_INLINE static __m128i cmplt_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmplt_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmplt_epu32(__m128i a, __m128i b); +#if WJR_HAS_SIMD(SSE2) - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint32_t); +struct __m128i_t { + using type = __m128i; +}; - WJR_INTRINSIC_INLINE static __m128i cmpne_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpne_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpne_epi32(__m128i a, __m128i b); +struct __m128d_t { + using type = __m128d; +}; - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint32_t); +template <> +struct simd_cast_fn<__m128_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128 v) const { + return _mm_castps_si128(v); + } +}; - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::not_equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater_equal<>, - T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less_equal<>, T); +template <> +struct simd_cast_fn<__m128_t, __m128d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128d operator()(__m128 v) const { + return _mm_castps_pd(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i concat(uint64_t lo, uint64_t hi); +template <> +struct simd_cast_fn<__m128i_t, __m128_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128 operator()(__m128i v) const { + return _mm_castsi128_ps(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract_epi8(__m128i a); +template <> +struct simd_cast_fn<__m128i_t, __m128d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128d operator()(__m128i v) const { + return _mm_castsi128_pd(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract_epi16(__m128i a); - template - WJR_INTRINSIC_INLINE static int extract_epi32(__m128i a); - template - WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m128i a); +template <> +struct simd_cast_fn<__m128d_t, __m128_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128 operator()(__m128d v) const { + return _mm_castpd_ps(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, int8_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, int16_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, int32_t); - template - WJR_INTRINSIC_INLINE 
static int64_t extract(__m128i a, int64_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, uint8_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, uint16_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, uint32_t); - template - WJR_INTRINSIC_INLINE static int64_t extract(__m128i a, uint64_t); +template <> +struct simd_cast_fn<__m128d_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128d v) const { + return _mm_castpd_si128(v); + } +}; - WJR_INTRINSIC_INLINE static uint64_t getlow(__m128i v); - WJR_INTRINSIC_INLINE static uint64_t gethigh(__m128i v); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int8_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template - WJR_INTRINSIC_INLINE static __m128i insert_epi16(__m128i a, int i); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template - WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, int16_t); - template - WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, uint16_t); +template <> +struct simd_cast_fn<__m128i_t, int8_t> { + WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static void lfence(); +template <> +struct simd_cast_fn<__m128i_t, uint8_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i load(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si16(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si32(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si64(const void *ptr); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int16_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template )> - WJR_INTRINSIC_INLINE static __m128i logical_and(__m128i a, __m128i b, T); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template )> - WJR_INTRINSIC_INLINE static __m128i logical_not(__m128i v, T); +template <> +struct simd_cast_fn<__m128i_t, int16_t> { + WJR_CONST WJR_INTRINSIC_INLINE int16_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - template )> - WJR_INTRINSIC_INLINE static __m128i logical_or(__m128i a, __m128i b, T); +template <> +struct simd_cast_fn<__m128i_t, uint16_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint16_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i madd_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int32_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - WJR_INTRINSIC_INLINE static void maskmoveu(__m128i a, __m128i mask, char *mem_addr); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i max_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epi32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m128i_t, int32_t> { + WJR_CONST WJR_INTRINSIC_INLINE int32_t operator()(__m128i v) const { + 
return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i max_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epu32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m128i_t, uint32_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint32_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint32_t); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int64_t v) const { + return _mm_cvtsi64_si128(v); + } +}; - WJR_INTRINSIC_INLINE static int8_t max_epi8(__m128i a); - WJR_INTRINSIC_INLINE static int16_t max_epi16(__m128i a); - WJR_INTRINSIC_INLINE static int32_t max_epi32(__m128i a); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { + return _mm_cvtsi64_si128(v); + } +}; - WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m128i a); - WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m128i a); - WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m128i a); +template <> +struct simd_cast_fn<__m128i_t, int64_t> { + WJR_CONST WJR_INTRINSIC_INLINE int64_t operator()(__m128i v) const { + return _mm_cvtsi128_si64(v); + } +}; - WJR_INTRINSIC_INLINE static int8_t max(__m128i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t max(__m128i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t max(__m128i a, int32_t); - WJR_INTRINSIC_INLINE static uint8_t max(__m128i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t max(__m128i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t max(__m128i a, uint32_t); +template <> +struct simd_cast_fn<__m128i_t, uint64_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint64_t operator()(__m128i v) const { + return _mm_cvtsi128_si64(v); + } +}; - WJR_INTRINSIC_INLINE static void mfence(); +#endif // SSE2 - WJR_INTRINSIC_INLINE static __m128i min_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epi32(__m128i a, __m128i b); +#if WJR_HAS_SIMD(AVX) - WJR_INTRINSIC_INLINE static __m128i min_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epu32(__m128i a, __m128i b); +struct __m256_t { + using type = __m256; +}; - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint32_t); +struct __m256i_t { + using type = __m256i; +}; - WJR_INTRINSIC_INLINE static int8_t min_epi8(__m128i a); - WJR_INTRINSIC_INLINE static int16_t min_epi16(__m128i a); - WJR_INTRINSIC_INLINE static int32_t min_epi32(__m128i a); +struct __m256d_t { + using type = __m256d; +}; - WJR_INTRINSIC_INLINE static uint8_t 
min_epu8(__m128i a); - WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m128i a); - WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m128i a); +template <> +struct simd_cast_fn<__m256_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256 v) const { + return _mm256_castps_si256(v); + } +}; - WJR_INTRINSIC_INLINE static int8_t min(__m128i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t min(__m128i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t min(__m128i a, int32_t); +template <> +struct simd_cast_fn<__m256_t, __m256d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256 v) const { + return _mm256_castps_pd(v); + } +}; - WJR_INTRINSIC_INLINE static uint8_t min(__m128i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t min(__m128i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t min(__m128i a, uint32_t); +template <> +struct simd_cast_fn<__m256i_t, __m256_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256i v) const { + return _mm256_castsi256_ps(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i move_epi64(__m128i a); +template <> +struct simd_cast_fn<__m256i_t, __m256d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256i v) const { + return _mm256_castsi256_pd(v); + } +}; - WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m128i a); - WJR_INTRINSIC_INLINE static mask_type movemask_pd(__m128d v); +template <> +struct simd_cast_fn<__m256d_t, __m256_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256d v) const { + return _mm256_castpd_ps(v); + } +}; - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int8_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int32_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int64_t); +template <> +struct simd_cast_fn<__m256d_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256d v) const { + return _mm256_castpd_si256(v); + } +}; - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint8_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint32_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint64_t); +template <> +struct simd_cast_fn<__m128i_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { + return _mm256_castsi128_si256(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i mul_epu32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m256i v) const { + return _mm256_castsi256_si128(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i mulhi_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int8_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i mulhi_epu16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i mullo_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, int8_t> { + WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i negate_epi8(__m128i a); - WJR_INTRINSIC_INLINE static __m128i negate_epi16(__m128i a); - WJR_INTRINSIC_INLINE static __m128i negate_epi32(__m128i a); - WJR_INTRINSIC_INLINE static __m128i 
negate_epi64(__m128i a); +template <> +struct simd_cast_fn<__m256i_t, uint8_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int8_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int16_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int32_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int64_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint8_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint16_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint32_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint64_t); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int16_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i Not(__m128i v); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint16_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i Or(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, int16_t> { + WJR_CONST WJR_INTRINSIC_INLINE int16_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i packs_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i packs_epi32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, uint16_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint16_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i packus_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int32_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i loadu_si48(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si80(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si96(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si112(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si128(const void *ptr); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint32_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i loadu_si16x(const void *ptr, int n); +template <> +struct simd_cast_fn<__m256i_t, int32_t> { + WJR_CONST WJR_INTRINSIC_INLINE int32_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i sad_epu8(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, uint32_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint32_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i zeros(); - WJR_INTRINSIC_INLINE static __m128i ones(); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int64_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i set_epi8(char e15, char e14, char e13, char e12, - char e11, char e10, char e9, char e8, - char e7, char e6, char e5, char e4, 
- char e3, char e2, char e1, char e0); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint64_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i set_epi16(short e7, short e6, short e5, short e4, - short e3, short e2, short e1, short e0); - WJR_INTRINSIC_INLINE static __m128i set_epi32(int e3, int e2, int e1, int e0); - WJR_INTRINSIC_INLINE static __m128i set_epi64x(long long e1, long long e0); +template <> +struct simd_cast_fn<__m256i_t, int64_t> { + WJR_CONST WJR_INTRINSIC_INLINE int64_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint64_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i setr_epi8(char e15, char e14, char e13, char e12, - char e11, char e10, char e9, char e8, - char e7, char e6, char e5, char e4, - char e3, char e2, char e1, char e0); +template <> +struct simd_cast_fn<__m256i_t, uint64_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint64_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint64_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i setr_epi16(short e7, short e6, short e5, short e4, - short e3, short e2, short e1, - short e0); - WJR_INTRINSIC_INLINE static __m128i setr_epi32(int e3, int e2, int e1, int e0); +#endif // AVX - WJR_INTRINSIC_INLINE static __m128i set1_epi8(int8_t val); - WJR_INTRINSIC_INLINE static __m128i set1_epi16(int16_t val); - WJR_INTRINSIC_INLINE static __m128i set1_epi32(int32_t val); - WJR_INTRINSIC_INLINE static __m128i set1_epi64(int64_t val); +} // namespace wjr - WJR_INTRINSIC_INLINE static __m128i set1(int8_t val, int8_t); - WJR_INTRINSIC_INLINE static __m128i set1(int16_t val, int16_t); - WJR_INTRINSIC_INLINE static __m128i set1(int32_t val, int32_t); - WJR_INTRINSIC_INLINE static __m128i set1(int64_t val, int64_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint8_t val, uint8_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint16_t val, uint16_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint32_t val, uint32_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint64_t val, uint64_t); +#endif // WJR_X86_SIMD_SIMD_CAST_HPP__ - WJR_INTRINSIC_INLINE static __m128i setmin_epi8(); - WJR_INTRINSIC_INLINE static __m128i setmin_epi16(); - WJR_INTRINSIC_INLINE static __m128i setmin_epi32(); +#include - WJR_INTRINSIC_INLINE static __m128i setmin(int8_t); - WJR_INTRINSIC_INLINE static __m128i setmin(int16_t); - WJR_INTRINSIC_INLINE static __m128i setmin(int32_t); - WJR_INTRINSIC_INLINE static __m128i setmin(uint8_t); - WJR_INTRINSIC_INLINE static __m128i setmin(uint16_t); - WJR_INTRINSIC_INLINE static __m128i setmin(uint32_t); +// Already included +// Already included - WJR_INTRINSIC_INLINE static __m128i setmax_epi8(); - WJR_INTRINSIC_INLINE static __m128i setmax_epi16(); - WJR_INTRINSIC_INLINE static __m128i setmax_epi32(); +namespace wjr { - WJR_INTRINSIC_INLINE static __m128i setmax(int8_t); - WJR_INTRINSIC_INLINE static __m128i setmax(int16_t); - WJR_INTRINSIC_INLINE static __m128i setmax(int32_t); - WJR_INTRINSIC_INLINE static __m128i setmax(uint8_t); - WJR_INTRINSIC_INLINE static __m128i setmax(uint16_t); - WJR_INTRINSIC_INLINE static __m128i setmax(uint32_t); +struct sse { + using mask_type = uint16_t; - template - WJR_INTRINSIC_INLINE static __m128i shl(__m128i a); +#if WJR_HAS_SIMD(SSE) - template - WJR_INTRINSIC_INLINE static __m128i shr(__m128i b); + using float_type = __m128; + using float_tag_type = __m128_t; - template - WJR_INTRINSIC_INLINE 
static __m128i shuffle_epi32(__m128i v); +#endif // SSE - template - WJR_INTRINSIC_INLINE static __m128i shufflehi_epi16(__m128i v); +#if WJR_HAS_SIMD(SSE2) + + using int_type = __m128i; + using int_tag_type = __m128i_t; + using double_type = __m128d; + using double_tag_type = __m128d_t; + +#endif // SSE2 + + constexpr static size_t width(); + constexpr static mask_type mask(); + +#if WJR_HAS_SIMD(SSE) + + WJR_INTRINSIC_INLINE static mask_type movemask_ps(__m128 v); + WJR_INTRINSIC_INLINE static void sfence(); + + template + WJR_INTRINSIC_INLINE static __m128 shuffle_ps(__m128 a, __m128 b); + +#endif // SSE + +#if WJR_HAS_SIMD(SSE2) + + WJR_INTRINSIC_INLINE static __m128i add_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i add_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i add_epi32(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i add_epi64(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int64_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint32_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint64_t); + + WJR_INTRINSIC_INLINE static int8_t add_epi8(__m128i a); + WJR_INTRINSIC_INLINE static int16_t add_epi16(__m128i a); + WJR_INTRINSIC_INLINE static int32_t add_epi32(__m128i a); + WJR_INTRINSIC_INLINE static int64_t add_epi64(__m128i a); + + WJR_INTRINSIC_INLINE static uint8_t add_epu8(__m128i a); + WJR_INTRINSIC_INLINE static uint16_t add_epu16(__m128i a); + WJR_INTRINSIC_INLINE static uint32_t add_epu32(__m128i a); + WJR_INTRINSIC_INLINE static uint64_t add_epu64(__m128i a); + + WJR_INTRINSIC_INLINE static int8_t add(__m128i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t add(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t add(__m128i a, int32_t); + WJR_INTRINSIC_INLINE static int64_t add(__m128i a, int64_t); + WJR_INTRINSIC_INLINE static uint8_t add(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t add(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t add(__m128i a, uint32_t); + WJR_INTRINSIC_INLINE static uint64_t add(__m128i a, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i adds_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i adds_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i adds_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i adds_epu16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, uint16_t); + + template + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i alignr_epi16(__m128i a, __m128i b, int c); + WJR_INTRINSIC_INLINE static __m128i alignr_epi32(__m128i a, __m128i b, int c); + WJR_INTRINSIC_INLINE static __m128i alignr_epi64(__m128i a, __m128i b, int c); + + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int16_t); + WJR_INTRINSIC_INLINE static 
__m128i alignr(__m128i a, __m128i b, int c, int32_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int64_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint16_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint32_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i And(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i AndNot(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i avg_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i avg_epu16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint16_t); + + // notice that mask must be 0 or 255(every byte) + WJR_INTRINSIC_INLINE static __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask); + WJR_INTRINSIC_INLINE static __m128i blendv_epi16(__m128i a, __m128i b, __m128i mask); + WJR_INTRINSIC_INLINE static __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask); + + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + int8_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + int16_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + int32_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + uint8_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + uint16_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + uint32_t); + + template + WJR_INTRINSIC_INLINE static __m128i bslli(__m128i val); + + template + WJR_INTRINSIC_INLINE static __m128i bsrli(__m128i val); + + WJR_INTRINSIC_INLINE static __m128i cmpeq_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpeq_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpeq_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmpge_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpge_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i 
b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmpgt_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpgt_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmple_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmple_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmplt_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmplt_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmpne_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpne_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpne_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint32_t); + + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, 
__m128i b, std::not_equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater_equal<>, + T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less_equal<>, T); + + WJR_INTRINSIC_INLINE static __m128i concat(uint64_t lo, uint64_t hi); + + template + WJR_INTRINSIC_INLINE static int extract_epi8(__m128i a); + + template + WJR_INTRINSIC_INLINE static int extract_epi16(__m128i a); + template + WJR_INTRINSIC_INLINE static int extract_epi32(__m128i a); + template + WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m128i a); + + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, int8_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, int16_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, int32_t); + template + WJR_INTRINSIC_INLINE static int64_t extract(__m128i a, int64_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, uint8_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, uint16_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, uint32_t); + template + WJR_INTRINSIC_INLINE static int64_t extract(__m128i a, uint64_t); + + WJR_INTRINSIC_INLINE static uint64_t getlow(__m128i v); + WJR_INTRINSIC_INLINE static uint64_t gethigh(__m128i v); + + template + WJR_INTRINSIC_INLINE static __m128i insert_epi16(__m128i a, int i); + + template + WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, int16_t); + template + WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, uint16_t); + + WJR_INTRINSIC_INLINE static void lfence(); + + WJR_INTRINSIC_INLINE static __m128i load(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si16(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si32(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si64(const void *ptr); + + template )> + WJR_INTRINSIC_INLINE static __m128i logical_and(__m128i a, __m128i b, T); + + template )> + WJR_INTRINSIC_INLINE static __m128i logical_not(__m128i v, T); + + template )> + WJR_INTRINSIC_INLINE static __m128i logical_or(__m128i a, __m128i b, T); + + WJR_INTRINSIC_INLINE static __m128i madd_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static void maskmoveu(__m128i a, __m128i mask, char *mem_addr); + + WJR_INTRINSIC_INLINE static __m128i max_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i max_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static int8_t max_epi8(__m128i a); + WJR_INTRINSIC_INLINE static int16_t 
max_epi16(__m128i a); + WJR_INTRINSIC_INLINE static int32_t max_epi32(__m128i a); + + WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m128i a); + WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m128i a); + WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m128i a); + + WJR_INTRINSIC_INLINE static int8_t max(__m128i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t max(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t max(__m128i a, int32_t); + WJR_INTRINSIC_INLINE static uint8_t max(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t max(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t max(__m128i a, uint32_t); + + WJR_INTRINSIC_INLINE static void mfence(); + + WJR_INTRINSIC_INLINE static __m128i min_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i min_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static int8_t min_epi8(__m128i a); + WJR_INTRINSIC_INLINE static int16_t min_epi16(__m128i a); + WJR_INTRINSIC_INLINE static int32_t min_epi32(__m128i a); + + WJR_INTRINSIC_INLINE static uint8_t min_epu8(__m128i a); + WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m128i a); + WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m128i a); + + WJR_INTRINSIC_INLINE static int8_t min(__m128i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t min(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t min(__m128i a, int32_t); + + WJR_INTRINSIC_INLINE static uint8_t min(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t min(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t min(__m128i a, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i move_epi64(__m128i a); + + WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m128i a); + WJR_INTRINSIC_INLINE static mask_type movemask_pd(__m128d v); + + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int8_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int32_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int64_t); + + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint8_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint32_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i mul_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i mulhi_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i mulhi_epu16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i mullo_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i negate_epi8(__m128i a); + WJR_INTRINSIC_INLINE static __m128i negate_epi16(__m128i a); + WJR_INTRINSIC_INLINE static __m128i negate_epi32(__m128i a); + WJR_INTRINSIC_INLINE static __m128i negate_epi64(__m128i a); + + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int8_t); 
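+    // negate dispatches on an element-type tag; SSE2 has no single "negate"
+    // intrinsic. A sketch of the usual technique (an illustration with a
+    // hypothetical helper, not necessarily exactly how this library
+    // implements it) subtracts the operand from zero:
+    //   __m128i neg32(__m128i a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }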
+ WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int32_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int64_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint32_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i Not(__m128i v); + + WJR_INTRINSIC_INLINE static __m128i Or(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i packs_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i packs_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i packus_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i loadu_si48(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si80(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si96(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si112(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si128(const void *ptr); + + WJR_INTRINSIC_INLINE static __m128i loadu_si16x(const void *ptr, int n); + + WJR_INTRINSIC_INLINE static __m128i sad_epu8(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i zeros(); + WJR_INTRINSIC_INLINE static __m128i ones(); + + WJR_INTRINSIC_INLINE static __m128i set_epi8(char e15, char e14, char e13, char e12, + char e11, char e10, char e9, char e8, + char e7, char e6, char e5, char e4, + char e3, char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m128i set_epi16(short e7, short e6, short e5, short e4, + short e3, short e2, short e1, short e0); + WJR_INTRINSIC_INLINE static __m128i set_epi32(int e3, int e2, int e1, int e0); + WJR_INTRINSIC_INLINE static __m128i set_epi64x(long long e1, long long e0); + + WJR_INTRINSIC_INLINE static __m128i setr_epi8(char e15, char e14, char e13, char e12, + char e11, char e10, char e9, char e8, + char e7, char e6, char e5, char e4, + char e3, char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m128i setr_epi16(short e7, short e6, short e5, short e4, + short e3, short e2, short e1, + short e0); + WJR_INTRINSIC_INLINE static __m128i setr_epi32(int e3, int e2, int e1, int e0); + + WJR_INTRINSIC_INLINE static __m128i set1_epi8(int8_t val); + WJR_INTRINSIC_INLINE static __m128i set1_epi16(int16_t val); + WJR_INTRINSIC_INLINE static __m128i set1_epi32(int32_t val); + WJR_INTRINSIC_INLINE static __m128i set1_epi64(int64_t val); + + WJR_INTRINSIC_INLINE static __m128i set1(int8_t val, int8_t); + WJR_INTRINSIC_INLINE static __m128i set1(int16_t val, int16_t); + WJR_INTRINSIC_INLINE static __m128i set1(int32_t val, int32_t); + WJR_INTRINSIC_INLINE static __m128i set1(int64_t val, int64_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint8_t val, uint8_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint16_t val, uint16_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint32_t val, uint32_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint64_t val, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i setmin_epi8(); + WJR_INTRINSIC_INLINE static __m128i setmin_epi16(); + WJR_INTRINSIC_INLINE static __m128i setmin_epi32(); + + WJR_INTRINSIC_INLINE static __m128i setmin(int8_t); + WJR_INTRINSIC_INLINE static __m128i setmin(int16_t); + WJR_INTRINSIC_INLINE static __m128i setmin(int32_t); + WJR_INTRINSIC_INLINE static __m128i setmin(uint8_t); + WJR_INTRINSIC_INLINE static __m128i setmin(uint16_t); + 
WJR_INTRINSIC_INLINE static __m128i setmin(uint32_t); + + WJR_INTRINSIC_INLINE static __m128i setmax_epi8(); + WJR_INTRINSIC_INLINE static __m128i setmax_epi16(); + WJR_INTRINSIC_INLINE static __m128i setmax_epi32(); + + WJR_INTRINSIC_INLINE static __m128i setmax(int8_t); + WJR_INTRINSIC_INLINE static __m128i setmax(int16_t); + WJR_INTRINSIC_INLINE static __m128i setmax(int32_t); + WJR_INTRINSIC_INLINE static __m128i setmax(uint8_t); + WJR_INTRINSIC_INLINE static __m128i setmax(uint16_t); + WJR_INTRINSIC_INLINE static __m128i setmax(uint32_t); + + template + WJR_INTRINSIC_INLINE static __m128i shl(__m128i a); + + template + WJR_INTRINSIC_INLINE static __m128i shr(__m128i b); + + template + WJR_INTRINSIC_INLINE static __m128i shuffle_epi32(__m128i v); + + template + WJR_INTRINSIC_INLINE static __m128i shufflehi_epi16(__m128i v); template WJR_INTRINSIC_INLINE static __m128i shufflelo_epi16(__m128i v); @@ -4680,2009 +5373,2015 @@ struct sse { #endif // SSE4_1 }; -struct avx { - using mask_type = uint32_t; +namespace sse_detail { +#if WJR_HAS_SIMD(SSE2) -#if WJR_HAS_SIMD(AVX) +const static __m128i srli_epi8_mask[8] = { + sse::set1_epi16(0xFFFF), sse::set1_epi16(0x7F7F), sse::set1_epi16(0x3F3F), + sse::set1_epi16(0x1F1F), sse::set1_epi16(0xF0F), sse::set1_epi16(0x707), + sse::set1_epi16(0x303), sse::set1_epi16(0x101), +}; - using float_type = __m256; - using float_tag_type = __m256_t; - using int_type = __m256i; - using int_tag_type = __m256i_t; - using double_type = __m256d; - using double_tag_type = __m256d_t; +#endif +} // namespace sse_detail -#endif // AVX +#if WJR_HAS_SIMD(SSE2) - constexpr static size_t width(); - constexpr static mask_type mask(); +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { + return _mm_set1_epi8(v); + } +}; -#if WJR_HAS_SIMD(AVX) +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { + return _mm_set1_epi16(v); + } +}; - WJR_INTRINSIC_INLINE static __m256i concat(__m128i a, __m128i b); +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { + return _mm_set1_epi32(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract_epi32(__m256i v); - template - WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m256i v); +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { + return _mm_set1_epi64x(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract(__m256i v, int32_t); - template - WJR_INTRINSIC_INLINE static int64_t extract(__m256i v, int64_t); +template <> +struct broadcast_fn<__m128i_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128i v) const { return v; } +}; - template - WJR_INTRINSIC_INLINE static __m128i extract_si128(__m256i v); +#endif // SSE2 - WJR_INTRINSIC_INLINE static __m128i getlow(__m256i a); +/*------------------------sse------------------------*/ - WJR_INTRINSIC_INLINE static __m128i gethigh(__m256i a); +constexpr size_t sse::width() { return 128; } - template - WJR_INTRINSIC_INLINE static __m256i insert_epi8(__m256i v, int8_t i); - template - WJR_INTRINSIC_INLINE static __m256i insert_epi16(__m256i v, int16_t i); - template - WJR_INTRINSIC_INLINE static __m256i insert_epi32(__m256i v, int32_t i); - template - WJR_INTRINSIC_INLINE static __m256i insert_epi64(__m256i v, int64_t i); +constexpr sse::mask_type sse::mask() { return 0xFFFF; } - template - WJR_INTRINSIC_INLINE static __m256i 
insert_si128(__m256i a, __m128i b); +#if WJR_HAS_SIMD(SSE) - WJR_INTRINSIC_INLINE static __m256i load(const void *p); - WJR_INTRINSIC_INLINE static __m256i loadu(const void *p); +sse::mask_type sse::movemask_ps(__m128 v) { + return static_cast(_mm_movemask_ps(v)); +} - WJR_INTRINSIC_INLINE static __m256i ones(); +void sse::sfence() { return _mm_sfence(); } - WJR_INTRINSIC_INLINE static __m256i loadu_si16(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si32(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si48(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si64(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si80(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si96(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si112(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si128(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si144(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si160(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si176(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si192(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si208(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si224(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si240(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si256(const void *ptr); +template +__m128 sse::shuffle_ps(__m128 a, __m128 b) { + static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in range [0, 255]"); + return _mm_shuffle_ps(a, b, imm8); +} - WJR_INTRINSIC_INLINE static __m256i loadu_si16x(const void *ptr, int n); +#endif // SSE - WJR_INTRINSIC_INLINE static __m256i - set_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, - char e24, char e23, char e22, char e21, char e20, char e19, char e18, - char e17, char e16, char e15, char e14, char e13, char e12, char e11, - char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, - char e2, char e1, char e0); +#if WJR_HAS_SIMD(SSE2) - WJR_INTRINSIC_INLINE static __m256i set_epi16(short e15, short e14, short e13, - short e12, short e11, short e10, - short e9, short e8, short e7, short e6, - short e5, short e4, short e3, short e2, - short e1, short e0); +__m128i sse::add_epi8(__m128i a, __m128i b) { return _mm_add_epi8(a, b); } +__m128i sse::add_epi16(__m128i a, __m128i b) { return _mm_add_epi16(a, b); } +__m128i sse::add_epi32(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } +__m128i sse::add_epi64(__m128i a, __m128i b) { return _mm_add_epi64(a, b); } - WJR_INTRINSIC_INLINE static __m256i set_epi32(int e7, int e6, int e5, int e4, int e3, - int e2, int e1, int e0); +__m128i sse::add(__m128i a, __m128i b, int8_t) { return add_epi8(a, b); } +__m128i sse::add(__m128i a, __m128i b, int16_t) { return add_epi16(a, b); } +__m128i sse::add(__m128i a, __m128i b, int32_t) { return add_epi32(a, b); } +__m128i sse::add(__m128i a, __m128i b, int64_t) { return add_epi64(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint8_t) { return add_epi8(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint16_t) { return add_epi16(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint32_t) { return add_epi32(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint64_t) { return add_epi64(a, b); } - WJR_INTRINSIC_INLINE static __m256i set_epi64x(long long e3, long long e2, - long long e1, long long e0); +int8_t sse::add_epi8(__m128i a) { return static_cast(add_epu8(a)); } +int16_t 
sse::add_epi16(__m128i a) { return static_cast(add_epu16(a)); } +int32_t sse::add_epi32(__m128i a) { return static_cast(add_epu32(a)); } +int64_t sse::add_epi64(__m128i a) { return static_cast(add_epu64(a)); } - WJR_INTRINSIC_INLINE static __m256i - setr_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, - char e24, char e23, char e22, char e21, char e20, char e19, char e18, - char e17, char e16, char e15, char e14, char e13, char e12, char e11, - char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, - char e2, char e1, char e0); +uint8_t sse::add_epu8(__m128i a) { + auto b = shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a); + a = add(a, b, uint8_t()); + b = zeros(); + a = sad_epu8(a, b); + return simd_cast<__m128i_t, uint8_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i setr_epi16(short e15, short e14, short e13, - short e12, short e11, short e10, - short e9, short e8, short e7, short e6, - short e5, short e4, short e3, short e2, - short e1, short e0); +uint16_t sse::add_epu16(__m128i a) { + a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint16_t()); + a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint16_t()); + a = add(a, srli<2>(a), uint16_t()); + return simd_cast<__m128i_t, uint16_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i setr_epi32(int e7, int e6, int e5, int e4, int e3, - int e2, int e1, int e0); +uint32_t sse::add_epu32(__m128i a) { + a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint32_t()); + a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint32_t()); + return simd_cast<__m128i_t, uint32_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i setr_epi64x(long long e3, long long e2, - long long e1, long long e0); +uint64_t sse::add_epu64(__m128i a) { + a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint64_t()); + return simd_cast<__m128i_t, uint64_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i set1_epi8(int8_t a); - WJR_INTRINSIC_INLINE static __m256i set1_epi16(int16_t a); - WJR_INTRINSIC_INLINE static __m256i set1_epi32(int32_t a); - WJR_INTRINSIC_INLINE static __m256i set1_epi64(int64_t a); +int8_t sse::add(__m128i a, int8_t) { return add_epi8(a); } +int16_t sse::add(__m128i a, int16_t) { return add_epi16(a); } +int32_t sse::add(__m128i a, int32_t) { return add_epi32(a); } +int64_t sse::add(__m128i a, int64_t) { return add_epi64(a); } +uint8_t sse::add(__m128i a, uint8_t) { return add_epu8(a); } +uint16_t sse::add(__m128i a, uint16_t) { return add_epu16(a); } +uint32_t sse::add(__m128i a, uint32_t) { return add_epu32(a); } +uint64_t sse::add(__m128i a, uint64_t) { return add_epu64(a); } - WJR_INTRINSIC_INLINE static __m256i set1(int8_t a, int8_t); - WJR_INTRINSIC_INLINE static __m256i set1(int16_t a, int16_t); - WJR_INTRINSIC_INLINE static __m256i set1(int32_t a, int32_t); - WJR_INTRINSIC_INLINE static __m256i set1(int64_t a, int64_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint8_t a, uint8_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint16_t a, uint16_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint32_t a, uint32_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint64_t a, uint64_t); +__m128i sse::adds_epi8(__m128i a, __m128i b) { return _mm_adds_epi8(a, b); } +__m128i sse::adds_epi16(__m128i a, __m128i b) { return _mm_adds_epi16(a, b); } - WJR_INTRINSIC_INLINE static __m256i setmin_epi8(); - WJR_INTRINSIC_INLINE static __m256i setmin_epi16(); - WJR_INTRINSIC_INLINE static __m256i setmin_epi32(); - WJR_INTRINSIC_INLINE static __m256i setmin_epi64(); +__m128i sse::adds_epu8(__m128i a, __m128i b) { 
return _mm_adds_epu8(a, b); }
+__m128i sse::adds_epu16(__m128i a, __m128i b) { return _mm_adds_epu16(a, b); }
-    WJR_INTRINSIC_INLINE static __m256i setmin(int8_t);
-    WJR_INTRINSIC_INLINE static __m256i setmin(int16_t);
-    WJR_INTRINSIC_INLINE static __m256i setmin(int32_t);
-    WJR_INTRINSIC_INLINE static __m256i setmin(int64_t);
+__m128i sse::adds(__m128i a, __m128i b, int8_t) { return adds_epi8(a, b); }
+__m128i sse::adds(__m128i a, __m128i b, int16_t) { return adds_epi16(a, b); }
+__m128i sse::adds(__m128i a, __m128i b, uint8_t) { return adds_epu8(a, b); }
+__m128i sse::adds(__m128i a, __m128i b, uint16_t) { return adds_epu16(a, b); }
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi8();
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi16();
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi32();
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi64();
+template <int imm8>
+__m128i sse::alignr(__m128i a, __m128i b) {
+    constexpr int s = imm8 & 0x1F;
+#if WJR_HAS_SIMD(SSSE3)
+    return _mm_alignr_epi8(a, b, s);
+#else
+    if constexpr (s == 0) {
+        return b;
+    }
+    if constexpr (s == 16) {
+        return a;
+    }
+    if constexpr (s < 16) {
+        return Or(slli<16 - s>(a), srli<s>(b));
+    }
+    return srli<s - 16>(a);
+#endif // SSSE3
+}
-    WJR_INTRINSIC_INLINE static __m256i setmax(int8_t);
-    WJR_INTRINSIC_INLINE static __m256i setmax(int16_t);
-    WJR_INTRINSIC_INLINE static __m256i setmax(int32_t);
-    WJR_INTRINSIC_INLINE static __m256i setmax(int64_t);
+__m128i sse::alignr_epi16(__m128i a, __m128i b, int c) {
+    return Or(slli(a, 16 - c, uint16_t()), srli(b, c, uint16_t()));
+}
-    WJR_INTRINSIC_INLINE static void stream(__m256i *p, __m256i a);
+__m128i sse::alignr_epi32(__m128i a, __m128i b, int c) {
+    return Or(slli(a, 32 - c, uint32_t()), srli(b, c, uint32_t()));
+}
-    WJR_INTRINSIC_INLINE static void store(void *p, __m256i a);
-    WJR_INTRINSIC_INLINE static void storeu(void *p, __m256i a);
+__m128i sse::alignr_epi64(__m128i a, __m128i b, int c) {
+    return Or(slli(a, 64 - c, uint64_t()), srli(b, c, uint64_t()));
+}
-    WJR_INTRINSIC_INLINE static int test_all_zeros(__m256i a);
+__m128i sse::alignr(__m128i a, __m128i b, int c, int16_t) {
+    return alignr_epi16(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, int32_t) {
+    return alignr_epi32(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, int64_t) {
+    return alignr_epi64(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, uint16_t) {
+    return alignr_epi16(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, uint32_t) {
+    return alignr_epi32(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, uint64_t) {
+    return alignr_epi64(a, b, c);
+}
-    WJR_INTRINSIC_INLINE static int testc(__m256i a, __m256i b);
+__m128i sse::And(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
-    WJR_INTRINSIC_INLINE static int testnzc(__m256i a, __m256i b);
+__m128i sse::AndNot(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
-    WJR_INTRINSIC_INLINE static int testz(__m256i a, __m256i b);
+__m128i sse::avg_epu8(__m128i a, __m128i b) { return _mm_avg_epu8(a, b); }
+__m128i sse::avg_epu16(__m128i a, __m128i b) { return _mm_avg_epu16(a, b); }
-    WJR_INTRINSIC_INLINE static __m256i zeros();
+__m128i sse::avg(__m128i a, __m128i b, int8_t) { return avg_epu8(a, b); }
+__m128i sse::avg(__m128i a, __m128i b, int16_t) { return avg_epu16(a, b); }
+__m128i sse::avg(__m128i a, __m128i b, uint8_t) { return avg_epu8(a, b); }
+__m128i sse::avg(__m128i a, __m128i b, uint16_t) { return avg_epu16(a, b); }
-#endif // AVX
+// note: every byte of the mask must be 0 or 255
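+// In other words, every mask byte must be saturated (0x00 or 0xFF), exactly as
+// produced by the comparison wrappers above. Illustrative usage sketch:
+//   __m128i m = sse::cmpeq_epi8(x, y);     // 0xFF where x == y, 0x00 elsewhere
+//   __m128i r = sse::blendv_epi8(a, b, m); // per byte: m ? b : a
+// The non-SSE4.1 fallbacks below compute ((~mask) & a) | (mask & b), which
+// matches _mm_blendv_epi8 (a select on each mask byte's sign bit) only when
+// the mask is saturated.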
+__m128i sse::blendv_epi8(__m128i a, __m128i b, __m128i mask) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_blendv_epi8(a, b, mask); +#elif defined(WJR_COMPILER_GCC) + return ((~mask) & a) | (mask & b); +#else + return Or(AndNot(mask, a), And(mask, b)); +#endif +} -#if WJR_HAS_SIMD(AVX2) +__m128i sse::blendv_epi16(__m128i a, __m128i b, __m128i mask) { + return blendv_epi8(b, a, logical_not(mask, uint16_t())); +} - WJR_INTRINSIC_INLINE static __m256i And(__m256i a, __m256i b); +__m128i sse::blendv_epi32(__m128i a, __m128i b, __m128i mask) { + return blendv_epi8(b, a, logical_not(mask, uint32_t())); +} - WJR_INTRINSIC_INLINE static __m256i AndNot(__m256i a, __m256i b); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int8_t) { + return blendv_epi8(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i Or(__m256i a, __m256i b); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int16_t) { + return blendv_epi16(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i Xor(__m256i a, __m256i b); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int32_t) { + return blendv_epi32(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i Not(__m256i v); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint8_t) { + return blendv_epi8(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i abs_epi8(__m256i v); - WJR_INTRINSIC_INLINE static __m256i abs_epi16(__m256i v); - WJR_INTRINSIC_INLINE static __m256i abs_epi32(__m256i v); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint16_t) { + return blendv_epi16(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int8_t); - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int16_t); - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int32_t); - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int64_t); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint32_t) { + return blendv_epi32(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i add_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i add_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i add_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i add_epi64(__m256i a, __m256i b); +template +__m128i sse::bslli(__m128i val) { + return _mm_bslli_si128(val, imm8); +} - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint64_t); +template +__m128i sse::bsrli(__m128i val) { + return _mm_bsrli_si128(val, imm8); +} - WJR_INTRINSIC_INLINE static uint8_t add_epu8(__m256i v); - WJR_INTRINSIC_INLINE static uint16_t add_epu16(__m256i v); - WJR_INTRINSIC_INLINE static uint32_t add_epu32(__m256i v); - WJR_INTRINSIC_INLINE static uint64_t add_epu64(__m256i v); +__m128i sse::cmpeq_epi8(__m128i a, __m128i b) { return _mm_cmpeq_epi8(a, b); } +__m128i sse::cmpeq_epi16(__m128i a, __m128i b) { return _mm_cmpeq_epi16(a, b); } +__m128i sse::cmpeq_epi32(__m128i a, __m128i b) { return _mm_cmpeq_epi32(a, b); } - WJR_INTRINSIC_INLINE static int8_t add_epi8(__m256i v); - WJR_INTRINSIC_INLINE static int16_t 
add_epi16(__m256i v); - WJR_INTRINSIC_INLINE static int32_t add_epi32(__m256i v); - WJR_INTRINSIC_INLINE static int64_t add_epi64(__m256i v); +__m128i sse::cmpeq(__m128i a, __m128i b, int8_t) { return cmpeq_epi8(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, int16_t) { return cmpeq_epi16(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, int32_t) { return cmpeq_epi32(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint8_t) { return cmpeq_epi8(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint16_t) { return cmpeq_epi16(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint32_t) { return cmpeq_epi32(a, b); } - WJR_INTRINSIC_INLINE static int8_t add(__m256i v, int8_t); - WJR_INTRINSIC_INLINE static int16_t add(__m256i v, int16_t); - WJR_INTRINSIC_INLINE static int32_t add(__m256i v, int32_t); - WJR_INTRINSIC_INLINE static int64_t add(__m256i v, int64_t); - WJR_INTRINSIC_INLINE static uint8_t add(__m256i v, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t add(__m256i v, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t add(__m256i v, uint32_t); - WJR_INTRINSIC_INLINE static uint64_t add(__m256i v, uint64_t); +__m128i sse::cmpge_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epi8(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, int8_t()), b, uint8_t()); +#else + return Not(cmpgt(b, a, int8_t())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i adds_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i adds_epi16(__m256i a, __m256i b); +__m128i sse::cmpge_epi16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epi16(a, b); +#else + return cmpeq(min(a, b, int16_t()), b, uint16_t()); +#endif +} - WJR_INTRINSIC_INLINE static __m256i adds_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i adds_epu16(__m256i a, __m256i b); +__m128i sse::cmpge_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epi32(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, int32_t()), b, uint32_t()); +#else + return Not(cmpgt(b, a, int32_t())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint16_t); +__m128i sse::cmpge_epu8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epu8(a, b); +#else + return cmpeq(min(a, b, uint8_t()), b, uint8_t()); +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b); +__m128i sse::cmpge_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epu16(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, uint16_t()), b, uint16_t()); +#else + return logical_not(subs(b, a, uint16_t()), uint16_t()); +#endif +} - WJR_INTRINSIC_INLINE static __m256i alignr_epi16(__m256i a, __m256i b, int c); - WJR_INTRINSIC_INLINE static __m256i alignr_epi32(__m256i a, __m256i b, int c); - WJR_INTRINSIC_INLINE static __m256i alignr_epi64(__m256i a, __m256i b, int c); +__m128i sse::cmpge_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epu32(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, uint32_t()), b, uint32_t()); +#else + return Not(cmpgt(b, a, uint32_t())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int16_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int32_t); - 
WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int64_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint16_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint32_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint64_t); +__m128i sse::cmpge(__m128i a, __m128i b, int8_t) { return cmpge_epi8(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, int16_t) { return cmpge_epi16(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, int32_t) { return cmpge_epi32(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, uint8_t) { return cmpge_epu8(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, uint16_t) { return cmpge_epu16(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, uint32_t) { return cmpge_epu32(a, b); } - WJR_INTRINSIC_INLINE static __m256i avg_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i avg_epu16(__m256i a, __m256i b); +__m128i sse::cmpgt_epi8(__m128i a, __m128i b) { return _mm_cmpgt_epi8(a, b); } +__m128i sse::cmpgt_epi16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(a, b); } +__m128i sse::cmpgt_epi32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(a, b); } - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint16_t); +__m128i sse::cmpgt_epu8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comgt_epu8(a, b); +#else + return cmpgt_epi8(Xor(a, setmin_epi8()), Xor(b, setmin_epi8())); +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i blend_epi16(__m256i a, __m256i b); - template - WJR_INTRINSIC_INLINE static __m256i blend_epi32(__m256i a, __m256i b); +__m128i sse::cmpgt_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comgt_epu16(a, b); +#else + return cmpgt_epi16(Xor(a, setmin_epi16()), Xor(b, setmin_epi16())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i blendv_epi8(__m256i a, __m256i b, __m256i mask); +__m128i sse::cmpgt_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comgt_epu32(a, b); +#else + return cmpgt_epi32(Xor(a, setmin_epi32()), Xor(b, setmin_epi32())); +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i bslli_epi128(__m256i a); +__m128i sse::cmpgt(__m128i a, __m128i b, int8_t) { return cmpgt_epi8(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, int16_t) { return cmpgt_epi16(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, int32_t) { return cmpgt_epi32(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, uint8_t) { return cmpgt_epu8(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, uint16_t) { return cmpgt_epu16(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, uint32_t) { return cmpgt_epu32(a, b); } - template - WJR_INTRINSIC_INLINE static __m256i bsrli_epi128(__m256i a); +__m128i sse::cmple_epi8(__m128i a, __m128i b) { return cmpge_epi8(b, a); } +__m128i sse::cmple_epi16(__m128i a, __m128i b) { return cmpge_epi16(b, a); } +__m128i sse::cmple_epi32(__m128i a, __m128i b) { return cmpge_epi32(b, a); } - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi64(__m256i a, __m256i b); +__m128i sse::cmple_epu8(__m128i a, __m128i b) { return 
cmpge_epu8(b, a); } +__m128i sse::cmple_epu16(__m128i a, __m128i b) { return cmpge_epu16(b, a); } +__m128i sse::cmple_epu32(__m128i a, __m128i b) { return cmpge_epu32(b, a); } - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint64_t); +__m128i sse::cmple(__m128i a, __m128i b, int8_t) { return cmple_epi8(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, int16_t) { return cmple_epi16(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, int32_t) { return cmple_epi32(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, uint8_t) { return cmple_epu8(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, uint16_t) { return cmple_epu16(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, uint32_t) { return cmple_epu32(a, b); } - WJR_INTRINSIC_INLINE static __m256i cmpge_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epi32(__m256i a, __m256i b); +__m128i sse::cmplt_epi8(__m128i a, __m128i b) { return _mm_cmplt_epi8(a, b); } +__m128i sse::cmplt_epi16(__m128i a, __m128i b) { return _mm_cmplt_epi16(a, b); } +__m128i sse::cmplt_epi32(__m128i a, __m128i b) { return _mm_cmplt_epi32(a, b); } - WJR_INTRINSIC_INLINE static __m256i cmpge_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epu32(__m256i a, __m256i b); +__m128i sse::cmplt_epu8(__m128i a, __m128i b) { return cmpgt_epu8(b, a); } +__m128i sse::cmplt_epu16(__m128i a, __m128i b) { return cmpgt_epu16(b, a); } +__m128i sse::cmplt_epu32(__m128i a, __m128i b) { return cmpgt_epu32(b, a); } - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint32_t); +__m128i sse::cmplt(__m128i a, __m128i b, int8_t) { return cmplt_epi8(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, int16_t) { return cmplt_epi16(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, int32_t) { return cmplt_epi32(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, uint8_t) { return cmplt_epu8(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, uint16_t) { return cmplt_epu16(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, uint32_t) { return cmplt_epu32(a, b); } - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi64(__m256i a, __m256i b); +__m128i sse::cmpne_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comneq_epi8(a, b); +#else + return Not(cmpeq_epi8(a, 
b)); +#endif +} - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu64(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint64_t); - - WJR_INTRINSIC_INLINE static __m256i cmple_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmple_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static __m256i cmplt_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmplt_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static __m256i cmpne_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpne_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpne_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint32_t); - - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i 
a, __m256i b, std::not_equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater_equal<>, - T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less_equal<>, T); - - template - WJR_INTRINSIC_INLINE static int extract_epi8(__m256i v); - template - WJR_INTRINSIC_INLINE static int extract_epi16(__m256i v); - - template - WJR_INTRINSIC_INLINE static int extract(__m256i v, int8_t); - template - WJR_INTRINSIC_INLINE static int extract(__m256i v, int16_t); - - WJR_INTRINSIC_INLINE static __m256i hadd_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i hadd_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int32_t); - - WJR_INTRINSIC_INLINE static __m256i hadds_epi16(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i hsub_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i hsub_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int32_t); - - WJR_INTRINSIC_INLINE static __m256i hsubs_epi16(__m256i a, __m256i b); - - template )> - WJR_INTRINSIC_INLINE static __m256i logical_and(__m256i a, __m256i b, T); - - template )> - WJR_INTRINSIC_INLINE static __m256i logical_not(__m256i v, T); - - template )> - WJR_INTRINSIC_INLINE static __m256i logical_or(__m256i a, __m256i b, T); - - WJR_INTRINSIC_INLINE static __m256i madd_epi16(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i max_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i max_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static int8_t max_epi8(__m256i a); - WJR_INTRINSIC_INLINE static int16_t max_epi16(__m256i a); - WJR_INTRINSIC_INLINE static int32_t max_epi32(__m256i a); - WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m256i a); - WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m256i a); - WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m256i a); - - WJR_INTRINSIC_INLINE static int8_t max(__m256i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t max(__m256i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t max(__m256i a, int32_t); - - WJR_INTRINSIC_INLINE static uint8_t max(__m256i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t max(__m256i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t max(__m256i a, uint32_t); - - WJR_INTRINSIC_INLINE static __m256i min_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epi16(__m256i a, 
__m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i min_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static int8_t min_epi8(__m256i a); - WJR_INTRINSIC_INLINE static int16_t min_epi16(__m256i a); - WJR_INTRINSIC_INLINE static int32_t min_epi32(__m256i a); - - WJR_INTRINSIC_INLINE static uint8_t min_epu8(__m256i a); - WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m256i a); - WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m256i a); - - WJR_INTRINSIC_INLINE static int8_t min(__m256i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t min(__m256i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t min(__m256i a, int32_t); - WJR_INTRINSIC_INLINE static uint8_t min(__m256i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t min(__m256i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t min(__m256i a, uint32_t); - - WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m256i a); - - WJR_INTRINSIC_INLINE static __m256i mul_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i mul_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i mulhi_epi16(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i mulhi_epu16(__m256i a, __m256i b); +__m128i sse::cmpne_epi16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comneq_epi16(a, b); +#else + return Not(cmpeq_epi16(a, b)); +#endif +} - WJR_INTRINSIC_INLINE static __m256i mullo_epi16(__m256i a, __m256i b); +__m128i sse::cmpne_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comneq_epi32(a, b); +#else + return Not(cmpeq_epi32(a, b)); +#endif +} - WJR_INTRINSIC_INLINE static __m256i packs_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i packs_epi32(__m256i a, __m256i b); +__m128i sse::cmpne(__m128i a, __m128i b, int8_t) { return cmpne_epi8(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, int16_t) { return cmpne_epi16(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, int32_t) { return cmpne_epi32(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, uint8_t) { return cmpne_epi8(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, uint16_t) { return cmpne_epi16(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, uint32_t) { return cmpne_epi32(a, b); } - WJR_INTRINSIC_INLINE static __m256i packus_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i packus_epi32(__m256i a, __m256i b); +template +__m128i sse::cmp(__m128i a, __m128i b, std::equal_to<>, T) { + return cmpeq(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shl(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::not_equal_to<>, T) { + return cmpne(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shr(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::greater<>, T) { + return cmpgt(a, b, T()); +} - WJR_INTRINSIC_INLINE static __m256i shuffle_epi8(__m256i a, __m256i b); - template - 
WJR_INTRINSIC_INLINE static __m256i shuffle_epi32(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::greater_equal<>, T) { + return cmpge(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shufflehi_epi16(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::less<>, T) { + return cmplt(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shufflelo_epi16(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::less_equal<>, T) { + return cmple(a, b, T()); +} - WJR_INTRINSIC_INLINE static __m256i sll_epi16(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i sll_epi32(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i sll_epi64(__m256i a, __m128i b); +__m128i sse::concat(uint64_t lo, uint64_t hi) { return set_epi64x(hi, lo); } - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint64_t); +template +int sse::extract_epi8(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 16, "imm8 must be in range [0, 15]"); +#if WJR_HAS_SIMD(SSE4_1) + return _mm_extract_epi8(a, imm8); +#else + if constexpr (imm8 & 1) { + return extract_epi16<(imm8 >> 1)>(a) >> 8; + } else { + return extract_epi16<(imm8 >> 1)>(a) & 0xff; + } +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a); - WJR_INTRINSIC_INLINE static __m256i slli_epi16(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i slli_epi32(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i slli_epi64(__m256i a, int imm8); +template +int sse::extract_epi16(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 8, "imm8 must be in range [0, 7]"); + return _mm_extract_epi16(a, imm8); +} - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int16_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int32_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int64_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint16_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint32_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint64_t); +template +int sse::extract_epi32(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 4, "imm8 must be in range [0, 3]"); +#if WJR_HAS_SIMD(SSE4_1) + return _mm_extract_epi32(a, imm8); +#else + if constexpr (imm8 == 0) { + return simd_cast<__m128i_t, uint32_t>(a); + } else if constexpr (imm8 == 1) { + return static_cast(simd_cast<__m128i_t, uint64_t>(a) >> 32); + } else if constexpr (imm8 == 2) { + return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + } else { + return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 3, 3, 3)>(a)); + } +#endif +} - WJR_INTRINSIC_INLINE static __m256i sra_epi16(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i sra_epi32(__m256i a, __m128i b); +template +int64_t sse::extract_epi64(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 2, "imm8 must be in range [0, 1]"); +#if WJR_HAS_SIMD(SSE4_1) + return _mm_extract_epi64(a, imm8); +#else + if constexpr (imm8 == 0) { + return simd_cast<__m128i_t, uint64_t>(a); + } else { + return simd_cast<__m128i_t, 
uint64_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + } +#endif +} - WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int32_t); +template +int sse::extract(__m128i a, int8_t) { + return extract_epi8(a); +} - WJR_INTRINSIC_INLINE static __m256i srai_epi16(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srai_epi32(__m256i a, int imm8); +template +int sse::extract(__m128i a, int16_t) { + return extract_epi16(a); +} - WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int16_t); - WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int32_t); +template +int sse::extract(__m128i a, int32_t) { + return extract_epi32(a); +} - WJR_INTRINSIC_INLINE static __m256i stream_load(const void *p); +template +int64_t sse::extract(__m128i a, int64_t) { + return extract_epi64(a); +} - WJR_INTRINSIC_INLINE static __m256i srl_epi16(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i srl_epi32(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i srl_epi64(__m256i a, __m128i b); +template +int sse::extract(__m128i a, uint8_t) { + return extract_epi8(a); +} - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint64_t); +template +int sse::extract(__m128i a, uint16_t) { + return extract_epi16(a); +} - template - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a); - WJR_INTRINSIC_INLINE static __m256i srli_epi8(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srli_epi16(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srli_epi32(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srli_epi64(__m256i a, int imm8); +template +int sse::extract(__m128i a, uint32_t) { + return extract_epi32(a); +} - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int8_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int16_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int32_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int64_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint8_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint16_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint32_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint64_t); +template +int64_t sse::extract(__m128i a, uint64_t) { + return extract_epi64(a); +} - WJR_INTRINSIC_INLINE static __m256i sub_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i sub_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i sub_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i sub_epi64(__m256i a, __m256i b); +uint64_t sse::getlow(__m128i v) { return simd_cast<__m128i_t, uint64_t>(v); } +uint64_t sse::gethigh(__m128i v) { return extract_epi64<1>(v); } - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int64_t); - 
WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint64_t); +template +__m128i sse::insert_epi16(__m128i a, int i) { + return _mm_insert_epi16(a, i, imm8); +} - WJR_INTRINSIC_INLINE static __m256i subs_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i subs_epi16(__m256i a, __m256i b); +template +__m128i sse::insert(__m128i a, int i, int16_t) { + return insert_epi16(a, i); +} - WJR_INTRINSIC_INLINE static __m256i subs_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i subs_epu16(__m256i a, __m256i b); +template +__m128i sse::insert(__m128i a, int i, uint16_t) { + return insert_epi16(a, i); +} - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint16_t); +void sse::lfence() { _mm_lfence(); } - WJR_INTRINSIC_INLINE static int test_all_ones(__m256i a); +__m128i sse::load(const void *ptr) { + return _mm_load_si128(static_cast(ptr)); +} +__m128i sse::loadu(const void *ptr) { + return _mm_loadu_si128(static_cast(ptr)); +} +__m128i sse::loadu_si16(const void *ptr) { + return simd_cast(read_memory(ptr)); +} - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi64(__m256i a, __m256i b); +__m128i sse::loadu_si32(const void *ptr) { + return simd_cast(read_memory(ptr)); +} - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint64_t); +__m128i sse::loadu_si64(const void *ptr) { + return simd_cast(read_memory(ptr)); +} - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi64(__m256i a, __m256i b); +template )> +__m128i sse::logical_and(__m128i a, __m128i b, T) { + return Not(Or(logical_not(a, T()), logical_not(b, T()))); +} - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint16_t); - 
WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint32_t); +template )> +__m128i sse::logical_not(__m128i v, T) { + auto Zero = zeros(); + return cmpeq(v, Zero, T()); +} -#endif // AVX2 -}; +template )> +__m128i sse::logical_or(__m128i a, __m128i b, T) { + return Not(logical_not(Or(a, b), T())); +} -namespace sse_detail { -#if WJR_HAS_SIMD(SSE2) +__m128i sse::madd_epi16(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); } -const static __m128i srli_epi8_mask[8] = { - sse::set1_epi16(0xFFFF), sse::set1_epi16(0x7F7F), sse::set1_epi16(0x3F3F), - sse::set1_epi16(0x1F1F), sse::set1_epi16(0xF0F), sse::set1_epi16(0x707), - sse::set1_epi16(0x303), sse::set1_epi16(0x101), -}; +void sse::maskmoveu(__m128i a, __m128i mask, char *mem_addr) { + return _mm_maskmoveu_si128(a, mask, mem_addr); +} +__m128i sse::max_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epi8(a, b); +#else + return blendv_epi8(b, a, cmpgt_epi8(a, b)); #endif -} // namespace sse_detail - -namespace avx_detail { -#if WJR_HAS_SIMD(AVX2) +} -const static __m256i srli_epi8_mask[8] = { - avx::set1_epi16(0xFFFF), avx::set1_epi16(0x7F7F), avx::set1_epi16(0x3F3F), - avx::set1_epi16(0x1F1F), avx::set1_epi16(0xF0F), avx::set1_epi16(0x707), - avx::set1_epi16(0x303), avx::set1_epi16(0x101), -}; +__m128i sse::max_epi16(__m128i a, __m128i b) { return _mm_max_epi16(a, b); } +__m128i sse::max_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epi32(a, b); +#else + return blendv_epi8(b, a, cmpgt_epi32(a, b)); #endif -} // namespace avx_detail - -#if WJR_HAS_SIMD(SSE2) +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { - return _mm_set1_epi8(v); - } -}; +__m128i sse::max_epu8(__m128i a, __m128i b) { return _mm_max_epu8(a, b); } -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { - return _mm_set1_epi16(v); - } -}; +__m128i sse::max_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epu16(a, b); +#else + return add(subs_epu16(b, a), a, uint16_t()); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { - return _mm_set1_epi32(v); - } -}; +__m128i sse::max_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epu32(a, b); +#else + return blendv_epi8(b, a, cmpgt_epu32(a, b)); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { - return _mm_set1_epi64x(v); - } -}; +__m128i sse::max(__m128i a, __m128i b, int8_t) { return max_epi8(a, b); } +__m128i sse::max(__m128i a, __m128i b, int16_t) { return max_epi16(a, b); } +__m128i sse::max(__m128i a, __m128i b, int32_t) { return max_epi32(a, b); } +__m128i sse::max(__m128i a, __m128i b, uint8_t) { return max_epu8(a, b); } +__m128i sse::max(__m128i a, __m128i b, uint16_t) { return max_epu16(a, b); } +__m128i sse::max(__m128i a, __m128i b, uint32_t) { return max_epu32(a, b); } -template <> -struct broadcast_fn<__m128i_t, __m128i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128i v) const { return v; } -}; +int8_t sse::max_epi8(__m128i a) { return 0x7fu ^ min_epu8(Xor(a, set1_epi8(0x7fu))); } -#endif // SSE2 +int16_t sse::max_epi16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0x7fffu ^ min_epu16(Xor(a, set1_epi16(0x7fffu))); +#else + a = max_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 
2)>(a)); + a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + return simd_cast<__m128i_t, int16_t>(a); +#endif +} -#if WJR_HAS_SIMD(AVX) +int32_t sse::max_epi32(__m128i a) { + a = max_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + return simd_cast<__m128i_t, int32_t>(a); +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { - return _mm256_set1_epi8(v); - } -}; +uint8_t sse::max_epu8(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0xffu ^ min_epu8(Xor(a, ones())); +#else + a = max_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + auto X = simd_cast<__m128i_t, uint32_t>(a); + return std::max((uint8_t)X, (uint8_t)(X >> 8)); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint16_t v) const { - return _mm256_set1_epi16(v); - } -}; +uint16_t sse::max_epu16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0xffffu ^ min_epu16(Xor(a, ones())); +#else + a = max_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + return simd_cast<__m128i_t, uint16_t>(a); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint32_t v) const { - return _mm256_set1_epi32(v); - } -}; +uint32_t sse::max_epu32(__m128i a) { + a = max_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + return simd_cast<__m128i_t, uint32_t>(a); +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint64_t v) const { - return _mm256_set1_epi64x(v); - } -}; +int8_t sse::max(__m128i a, int8_t) { return max_epi8(a); } +int16_t sse::max(__m128i a, int16_t) { return max_epi16(a); } +int32_t sse::max(__m128i a, int32_t) { return max_epi32(a); } +uint8_t sse::max(__m128i a, uint8_t) { return max_epu8(a); } +uint16_t sse::max(__m128i a, uint16_t) { return max_epu16(a); } +uint32_t sse::max(__m128i a, uint32_t) { return max_epu32(a); } -template <> -struct broadcast_fn<__m256i_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256i v) const { return v; } -}; +void sse::mfence() { _mm_mfence(); } -template <> -struct broadcast_fn<__m128i_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { -#if WJR_HAS_SIMD(AVX2) - return _mm256_broadcastsi128_si256(v); +__m128i sse::min_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epi8(a, b); #else - return _mm256_insertf128_si256(_mm256_castsi128_si256(v), v, 1); + return blendv_epi8(a, b, cmpgt_epi8(a, b)); #endif - } -}; - -#endif // AVX - -/*------------------------sse------------------------*/ +} -constexpr size_t sse::width() { return 128; } +__m128i sse::min_epi16(__m128i a, __m128i b) { return _mm_min_epi16(a, b); } -constexpr sse::mask_type sse::mask() { return 0xFFFF; } +__m128i sse::min_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epi32(a, b); +#else + return blendv_epi8(a, b, cmpgt_epi32(a, b)); +#endif +} -#if WJR_HAS_SIMD(SSE) +__m128i sse::min_epu8(__m128i a, __m128i b) { return _mm_min_epu8(a, b); } -sse::mask_type sse::movemask_ps(__m128 v) { - return static_cast(_mm_movemask_ps(v)); +__m128i 
sse::min_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epu16(a, b); +#else + return blendv_epi8(a, b, cmpgt_epu16(a, b)); +#endif } -void sse::sfence() { return _mm_sfence(); } - -template -__m128 sse::shuffle_ps(__m128 a, __m128 b) { - static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in range [0, 255]"); - return _mm_shuffle_ps(a, b, imm8); +__m128i sse::min_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epu32(a, b); +#else + return blendv_epi8(a, b, cmpgt_epu32(a, b)); +#endif } -#endif // SSE - -#if WJR_HAS_SIMD(SSE2) +__m128i sse::min(__m128i a, __m128i b, int8_t) { return min_epi8(a, b); } +__m128i sse::min(__m128i a, __m128i b, int16_t) { return min_epi16(a, b); } +__m128i sse::min(__m128i a, __m128i b, int32_t) { return min_epi32(a, b); } +__m128i sse::min(__m128i a, __m128i b, uint8_t) { return min_epu8(a, b); } +__m128i sse::min(__m128i a, __m128i b, uint16_t) { return min_epu16(a, b); } +__m128i sse::min(__m128i a, __m128i b, uint32_t) { return min_epu32(a, b); } -__m128i sse::add_epi8(__m128i a, __m128i b) { return _mm_add_epi8(a, b); } -__m128i sse::add_epi16(__m128i a, __m128i b) { return _mm_add_epi16(a, b); } -__m128i sse::add_epi32(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } -__m128i sse::add_epi64(__m128i a, __m128i b) { return _mm_add_epi64(a, b); } +int8_t sse::min_epi8(__m128i a) { return 0x80u ^ min_epu8(Xor(a, setmin_epi8())); } -__m128i sse::add(__m128i a, __m128i b, int8_t) { return add_epi8(a, b); } -__m128i sse::add(__m128i a, __m128i b, int16_t) { return add_epi16(a, b); } -__m128i sse::add(__m128i a, __m128i b, int32_t) { return add_epi32(a, b); } -__m128i sse::add(__m128i a, __m128i b, int64_t) { return add_epi64(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint8_t) { return add_epi8(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint16_t) { return add_epi16(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint32_t) { return add_epi32(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint64_t) { return add_epi64(a, b); } +int16_t sse::min_epi16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0x8000u ^ min_epu16(Xor(a, setmin_epi16())); +#else + a = min_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + return simd_cast<__m128i_t, int16_t>(a); +#endif +} -int8_t sse::add_epi8(__m128i a) { return static_cast(add_epu8(a)); } -int16_t sse::add_epi16(__m128i a) { return static_cast(add_epu16(a)); } -int32_t sse::add_epi32(__m128i a) { return static_cast(add_epu32(a)); } -int64_t sse::add_epi64(__m128i a) { return static_cast(add_epu64(a)); } +int32_t sse::min_epi32(__m128i a) { + a = min_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + return simd_cast<__m128i_t, int32_t>(a); +} -uint8_t sse::add_epu8(__m128i a) { - auto b = shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a); - a = add(a, b, uint8_t()); - b = zeros(); - a = sad_epu8(a, b); +uint8_t sse::min_epu8(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + a = min_epu8(a, srli_epi16(a, 8)); + a = _mm_minpos_epu16(a); return simd_cast<__m128i_t, uint8_t>(a); +#else + a = min_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + auto X = simd_cast<__m128i_t, uint32_t>(a); + return std::min((uint8_t)X, (uint8_t)(X >> 8)); +#endif } 
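The horizontal max/min reductions above lean on two complementary tricks: a shuffle tree that folds the upper half into the lower half until one lane remains, and, where SSE4.1 is available, a single order-reversing XOR so that the horizontal-minimum instruction behind minpos_epu16 can also answer signed-min/max and unsigned-max queries. A minimal scalar sketch of the XOR identity used by sse::max_epi8 (illustrative only, and exhaustively checkable):

    #include <cassert>
    #include <cstdint>

    // x <= y in signed 8-bit order iff (x ^ 0x7f) >= (y ^ 0x7f) in unsigned
    // order, so max_epi8(v) == 0x7f ^ min_epu8(v ^ 0x7f), byte by byte.
    int main() {
        for (int x = -128; x < 128; ++x) {
            for (int y = -128; y < 128; ++y) {
                const uint8_t ux = static_cast<uint8_t>(x) ^ 0x7fu;
                const uint8_t uy = static_cast<uint8_t>(y) ^ 0x7fu;
                assert((x <= y) == (ux >= uy));
            }
        }
    }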
-uint16_t sse::add_epu16(__m128i a) { - a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint16_t()); - a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint16_t()); - a = add(a, srli<2>(a), uint16_t()); +uint16_t sse::min_epu16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return simd_cast<__m128i_t, uint16_t>(_mm_minpos_epu16(a)); +#else + a = min_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); return simd_cast<__m128i_t, uint16_t>(a); +#endif } -uint32_t sse::add_epu32(__m128i a) { - a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint32_t()); - a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint32_t()); +uint32_t sse::min_epu32(__m128i a) { + a = min_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); return simd_cast<__m128i_t, uint32_t>(a); } -uint64_t sse::add_epu64(__m128i a) { - a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint64_t()); - return simd_cast<__m128i_t, uint64_t>(a); +int8_t sse::min(__m128i a, int8_t) { return min_epi8(a); } +int16_t sse::min(__m128i a, int16_t) { return min_epi16(a); } +int32_t sse::min(__m128i a, int32_t) { return min_epi32(a); } +uint8_t sse::min(__m128i a, uint8_t) { return min_epu8(a); } +uint16_t sse::min(__m128i a, uint16_t) { return min_epu16(a); } +uint32_t sse::min(__m128i a, uint32_t) { return min_epu32(a); } + +__m128i sse::move_epi64(__m128i a) { return _mm_move_epi64(a); } + +sse::mask_type sse::movemask_epi8(__m128i a) { + return static_cast(_mm_movemask_epi8(a)); +} +sse::mask_type sse::movemask_pd(__m128d v) { + return static_cast(_mm_movemask_pd(v)); } -int8_t sse::add(__m128i a, int8_t) { return add_epi8(a); } -int16_t sse::add(__m128i a, int16_t) { return add_epi16(a); } -int32_t sse::add(__m128i a, int32_t) { return add_epi32(a); } -int64_t sse::add(__m128i a, int64_t) { return add_epi64(a); } -uint8_t sse::add(__m128i a, uint8_t) { return add_epu8(a); } -uint16_t sse::add(__m128i a, uint16_t) { return add_epu16(a); } -uint32_t sse::add(__m128i a, uint32_t) { return add_epu32(a); } -uint64_t sse::add(__m128i a, uint64_t) { return add_epu64(a); } +sse::mask_type sse::movemask(__m128i v, int8_t) { return movemask_epi8(v); } +sse::mask_type sse::movemask(__m128i v, int32_t) { + return movemask_ps(simd_cast<__m128i_t, __m128_t>(v)); +} +sse::mask_type sse::movemask(__m128i v, int64_t) { + return movemask_pd(simd_cast<__m128i_t, __m128d_t>(v)); +} +sse::mask_type sse::movemask(__m128i v, uint8_t) { return movemask(v, int8_t()); } +sse::mask_type sse::movemask(__m128i v, uint32_t) { return movemask(v, int32_t()); } +sse::mask_type sse::movemask(__m128i v, uint64_t) { return movemask(v, int64_t()); } -__m128i sse::adds_epi8(__m128i a, __m128i b) { return _mm_adds_epi8(a, b); } -__m128i sse::adds_epi16(__m128i a, __m128i b) { return _mm_adds_epi16(a, b); } +__m128i sse::mul_epu32(__m128i a, __m128i b) { return _mm_mul_epu32(a, b); } -__m128i sse::adds_epu8(__m128i a, __m128i b) { return _mm_adds_epu8(a, b); } -__m128i sse::adds_epu16(__m128i a, __m128i b) { return _mm_adds_epu16(a, b); } +__m128i sse::mulhi_epi16(__m128i a, __m128i b) { return _mm_mulhi_epi16(a, b); } -__m128i sse::adds(__m128i a, __m128i b, int8_t) { return adds_epi8(a, b); } -__m128i sse::adds(__m128i a, __m128i b, int16_t) { return adds_epi16(a, b); } -__m128i sse::adds(__m128i a, __m128i b, uint8_t) { return adds_epu8(a, b); } -__m128i 
sse::adds(__m128i a, __m128i b, uint16_t) { return adds_epu16(a, b); }
+__m128i sse::mulhi_epu16(__m128i a, __m128i b) { return _mm_mulhi_epu16(a, b); }

-template <int imm8>
-__m128i sse::alignr(__m128i a, __m128i b) {
-    constexpr int s = imm8 & 0x1F;
+__m128i sse::mullo_epi16(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); }
+
+__m128i sse::negate_epi8(__m128i a) {
 #if WJR_HAS_SIMD(SSSE3)
-    return _mm_alignr_epi8(a, b, s);
+    return sign_epi8(a, ones());
 #else
-    if constexpr (s == 0) {
-        return b;
-    }
-    if constexpr (s == 16) {
-        return a;
-    }
-    if constexpr (s < 16) {
-        return Or(slli<16 - s>(a), srli<s>(b));
-    }
-    return srli<s - 16>(a);
-#endif // SSSE3
+    return sub_epi8(zeros(), a);
+#endif
 }

-__m128i sse::alignr_epi16(__m128i a, __m128i b, int c) {
-    return Or(slli(a, 16 - c, uint16_t()), srli(b, c, uint16_t()));
+__m128i sse::negate_epi16(__m128i a) {
+#if WJR_HAS_SIMD(SSSE3)
+    return sign_epi16(a, ones());
+#else
+    return sub_epi16(zeros(), a);
+#endif
 }

-__m128i sse::alignr_epi32(__m128i a, __m128i b, int c) {
-    return Or(slli(a, 32 - c, uint32_t()), srli(b, c, uint32_t()));
+__m128i sse::negate_epi32(__m128i a) {
+#if WJR_HAS_SIMD(SSSE3)
+    return sign_epi32(a, ones());
+#else
+    return sub_epi32(zeros(), a);
+#endif
 }

-__m128i sse::alignr_epi64(__m128i a, __m128i b, int c) {
-    return Or(slli(a, 64 - c, uint64_t()), srli(b, c, uint64_t()));
-}
+__m128i sse::negate_epi64(__m128i a) { return sub_epi64(zeros(), a); }

-__m128i sse::alignr(__m128i a, __m128i b, int c, int16_t) {
-    return alignr_epi16(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, int32_t) {
-    return alignr_epi32(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, int64_t) {
-    return alignr_epi64(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, uint16_t) {
-    return alignr_epi16(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, uint32_t) {
-    return alignr_epi32(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, uint64_t) {
-    return alignr_epi64(a, b, c);
-}
+__m128i sse::negate(__m128i a, int8_t) { return negate_epi8(a); }
+__m128i sse::negate(__m128i a, int16_t) { return negate_epi16(a); }
+__m128i sse::negate(__m128i a, int32_t) { return negate_epi32(a); }
+__m128i sse::negate(__m128i a, int64_t) { return negate_epi64(a); }
+__m128i sse::negate(__m128i a, uint8_t) { return negate_epi8(a); }
+__m128i sse::negate(__m128i a, uint16_t) { return negate_epi16(a); }
+__m128i sse::negate(__m128i a, uint32_t) { return negate_epi32(a); }
+__m128i sse::negate(__m128i a, uint64_t) { return negate_epi64(a); }

-__m128i sse::And(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
+__m128i sse::Not(__m128i v) { return Xor(v, ones()); }

-__m128i sse::AndNot(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
+__m128i sse::Or(__m128i a, __m128i b) { return _mm_or_si128(a, b); }

-__m128i sse::avg_epu8(__m128i a, __m128i b) { return _mm_avg_epu8(a, b); }
-__m128i sse::avg_epu16(__m128i a, __m128i b) { return _mm_avg_epu16(a, b); }
+__m128i sse::packs_epi16(__m128i a, __m128i b) { return _mm_packs_epi16(a, b); }
+__m128i sse::packs_epi32(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }

-__m128i sse::avg(__m128i a, __m128i b, int8_t) { return avg_epu8(a, b); }
-__m128i sse::avg(__m128i a, __m128i b, int16_t) { return avg_epu16(a, b); }
-__m128i sse::avg(__m128i a, __m128i b, uint8_t) { return avg_epu8(a, b); }
-__m128i sse::avg(__m128i a, __m128i b, uint16_t) { return avg_epu16(a, b); }
+__m128i sse::packus_epi16(__m128i a, __m128i b) { return _mm_packus_epi16(a, b); }

-// note that every byte of mask must be 0 or 255
-__m128i sse::blendv_epi8(__m128i a, __m128i b, __m128i mask) {
+__m128i sse::loadu_si48(const void *ptr) {
+    return insert_epi16<2>(loadu_si32(ptr), reinterpret_cast<const uint16_t *>(ptr)[2]);
+}
+
+__m128i sse::loadu_si80(const void *ptr) {
+    return insert_epi16<4>(loadu_si64(ptr), reinterpret_cast<const uint16_t *>(ptr)[4]);
+}
+
+__m128i sse::loadu_si96(const void *ptr) {
 #if WJR_HAS_SIMD(SSE4_1)
-    return _mm_blendv_epi8(a, b, mask);
-#elif defined(WJR_COMPILER_GCC)
-    return ((~mask) & a) | (mask & b);
+    return insert_epi32<2>(loadu_si64(ptr), reinterpret_cast<const uint32_t *>(ptr)[2]);
 #else
-    return Or(AndNot(mask, a), And(mask, b));
+    return insert_epi16<5>(loadu_si80(ptr), reinterpret_cast<const uint16_t *>(ptr)[5]);
 #endif
 }

-__m128i sse::blendv_epi16(__m128i a, __m128i b, __m128i mask) {
-    return blendv_epi8(b, a, logical_not(mask, uint16_t()));
+__m128i sse::loadu_si112(const void *ptr) {
+    return insert_epi16<6>(loadu_si96(ptr), reinterpret_cast<const uint16_t *>(ptr)[6]);
 }

-__m128i sse::blendv_epi32(__m128i a, __m128i b, __m128i mask) {
-    return blendv_epi8(b, a, logical_not(mask, uint32_t()));
+__m128i sse::loadu_si128(const void *ptr) { return loadu(ptr); }
+
+__m128i sse::loadu_si16x(const void *ptr, int n) {
+    switch (n) {
+    case 0:
+        return zeros();
+    case 1:
+        return loadu_si16(ptr);
+    case 2:
+        return loadu_si32(ptr);
+    case 3:
+        return loadu_si48(ptr);
+    case 4:
+        return loadu_si64(ptr);
+    case 5:
+        return loadu_si80(ptr);
+    case 6:
+        return loadu_si96(ptr);
+    case 7:
+        return loadu_si112(ptr);
+    default:
+        return loadu_si128(ptr);
+    }
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int8_t) {
-    return blendv_epi8(a, b, mask);
+__m128i sse::sad_epu8(__m128i a, __m128i b) { return _mm_sad_epu8(a, b); }
+
+__m128i sse::zeros() { return _mm_setzero_si128(); }
+__m128i sse::ones() { return _mm_set1_epi32(-1); }
+
+__m128i sse::set_epi8(char e15, char e14, char e13, char e12, char e11, char e10, char e9,
+                      char e8, char e7, char e6, char e5, char e4, char e3, char e2,
+                      char e1, char e0) {
+    return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1,
+                        e0);
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int16_t) {
-    return blendv_epi16(a, b, mask);
+__m128i sse::set_epi16(short e7, short e6, short e5, short e4, short e3, short e2,
+                       short e1, short e0) {
+    return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
+}
+__m128i sse::set_epi32(int e3, int e2, int e1, int e0) {
+    return _mm_set_epi32(e3, e2, e1, e0);
 }
+__m128i sse::set_epi64x(long long e1, long long e0) { return _mm_set_epi64x(e1, e0); }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int32_t) {
-    return blendv_epi32(a, b, mask);
+__m128i sse::setr_epi8(char e15, char e14, char e13, char e12, char e11, char e10,
+                       char e9, char e8, char e7, char e6, char e5, char e4, char e3,
+                       char e2, char e1, char e0) {
+    return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1,
+                         e0);
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint8_t) {
-    return blendv_epi8(a, b, mask);
+__m128i sse::setr_epi16(short e7, short e6, short e5, short e4, short e3, short e2,
+                        short e1, short e0) {
+    return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
+}
+__m128i sse::setr_epi32(int e3, int e2, int e1, int e0) {
+    return _mm_setr_epi32(e3, e2, e1, e0);
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint16_t) {
-    return blendv_epi16(a, b, mask);
+__m128i sse::set1_epi8(int8_t val) { return _mm_set1_epi8(val); }
+__m128i sse::set1_epi16(int16_t val) { return _mm_set1_epi16(val); }
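blendv_epi8 picks each byte from b where the corresponding mask byte is 0xff and from a where it is 0x00. The restriction noted above exists because the SSE2 fallback Or(AndNot(mask, a), And(mask, b)) consumes every bit of the mask, whereas _mm_blendv_epi8 inspects only bit 7 of each byte; the two agree exactly when every mask byte is all-zeros or all-ones. A per-byte scalar model of the fallback (a sketch for illustration, not part of the library):

    #include <cstdint>

    // Mirrors the SSE2 path of sse::blendv_epi8 for a single byte:
    // with mask == 0x00 it returns a, with mask == 0xff it returns b.
    uint8_t blendv_byte(uint8_t a, uint8_t b, uint8_t mask) {
        return static_cast<uint8_t>((~mask & a) | (mask & b));
    }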
+__m128i sse::set1_epi32(int32_t val) { return _mm_set1_epi32(val); } +__m128i sse::set1_epi64(int64_t val) { return _mm_set1_epi64x(val); } + +__m128i sse::set1(int8_t val, int8_t) { return set1_epi8(val); } +__m128i sse::set1(int16_t val, int16_t) { return set1_epi16(val); } +__m128i sse::set1(int32_t val, int32_t) { return set1_epi32(val); } +__m128i sse::set1(int64_t val, int64_t) { return set1_epi64(val); } +__m128i sse::set1(uint8_t val, uint8_t) { return set1_epi8(val); } +__m128i sse::set1(uint16_t val, uint16_t) { return set1_epi16(val); } +__m128i sse::set1(uint32_t val, uint32_t) { return set1_epi32(val); } +__m128i sse::set1(uint64_t val, uint64_t) { return set1_epi64(val); } + +__m128i sse::setmin_epi8() { return set1_epi8(0x80u); } +__m128i sse::setmin_epi16() { return set1_epi16(0x8000u); } +__m128i sse::setmin_epi32() { return set1_epi32(0x80000000u); } + +__m128i sse::setmin(int8_t) { return setmin_epi8(); } +__m128i sse::setmin(int16_t) { return setmin_epi16(); } +__m128i sse::setmin(int32_t) { return setmin_epi32(); } +__m128i sse::setmin(uint8_t) { return set1_epi32(0); } +__m128i sse::setmin(uint16_t) { return set1_epi32(0); } +__m128i sse::setmin(uint32_t) { return set1_epi32(0); } + +__m128i sse::setmax_epi8() { return set1_epi8(0x7F); } +__m128i sse::setmax_epi16() { return set1_epi16(0x7FFF); } +__m128i sse::setmax_epi32() { return set1_epi32(0x7FFFFFFF); } + +__m128i sse::setmax(int8_t) { return setmax_epi8(); } +__m128i sse::setmax(int16_t) { return setmax_epi16(); } +__m128i sse::setmax(int32_t) { return setmax_epi32(); } +__m128i sse::setmax(uint8_t) { return set1_epi32(0xFFFFFFFF); } +__m128i sse::setmax(uint16_t) { return set1_epi32(0xFFFFFFFF); } +__m128i sse::setmax(uint32_t) { return set1_epi32(0xFFFFFFFF); } + +template +__m128i sse::shl(__m128i a) { + if constexpr (imm >= 64) { + a = slli<8>(a); + a = slli_epi64(a, imm - 64); + return a; + } else { + auto b = slli_epi64(a, imm); + auto c = slli<8>(a); + c = srli_epi64(c, 64 - imm); + return Or(b, c); + } } -__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint32_t) { - return blendv_epi32(a, b, mask); +template +__m128i sse::shr(__m128i a) { + if constexpr (imm >= 64) { + a = srli<8>(a); + a = srli_epi64(a, imm - 64); + return a; + } else { + auto b = srli_epi64(a, imm); + auto c = srli<8>(a); + c = slli_epi64(c, 64 - imm); + return Or(b, c); + } } template -__m128i sse::bslli(__m128i val) { - return _mm_bslli_si128(val, imm8); +__m128i sse::shuffle_epi32(__m128i v) { + static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in range [0, 255]"); + return _mm_shuffle_epi32(v, imm8); } template -__m128i sse::bsrli(__m128i val) { - return _mm_bsrli_si128(val, imm8); +__m128i sse::shufflehi_epi16(__m128i v) { + return _mm_shufflehi_epi16(v, imm8); } -__m128i sse::cmpeq_epi8(__m128i a, __m128i b) { return _mm_cmpeq_epi8(a, b); } -__m128i sse::cmpeq_epi16(__m128i a, __m128i b) { return _mm_cmpeq_epi16(a, b); } -__m128i sse::cmpeq_epi32(__m128i a, __m128i b) { return _mm_cmpeq_epi32(a, b); } +template +__m128i sse::shufflelo_epi16(__m128i v) { + return _mm_shufflelo_epi16(v, imm8); +} -__m128i sse::cmpeq(__m128i a, __m128i b, int8_t) { return cmpeq_epi8(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, int16_t) { return cmpeq_epi16(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, int32_t) { return cmpeq_epi32(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, uint8_t) { return cmpeq_epi8(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, uint16_t) { return cmpeq_epi16(a, b); } -__m128i 
sse::cmpeq(__m128i a, __m128i b, uint32_t) { return cmpeq_epi32(a, b); } +__m128i sse::sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } +__m128i sse::sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } +__m128i sse::sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } -__m128i sse::cmpge_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epi8(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, int8_t()), b, uint8_t()); -#else - return Not(cmpgt(b, a, int8_t())); -#endif -} +__m128i sse::sll(__m128i a, __m128i b, int16_t) { return sll_epi16(a, b); } +__m128i sse::sll(__m128i a, __m128i b, int32_t) { return sll_epi32(a, b); } +__m128i sse::sll(__m128i a, __m128i b, int64_t) { return sll_epi64(a, b); } +__m128i sse::sll(__m128i a, __m128i b, uint16_t) { return sll_epi16(a, b); } +__m128i sse::sll(__m128i a, __m128i b, uint32_t) { return sll_epi32(a, b); } +__m128i sse::sll(__m128i a, __m128i b, uint64_t) { return sll_epi64(a, b); } -__m128i sse::cmpge_epi16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epi16(a, b); -#else - return cmpeq(min(a, b, int16_t()), b, uint16_t()); -#endif +template +__m128i sse::slli(__m128i v) { + return _mm_slli_si128(v, imm8); } +__m128i sse::slli_epi16(__m128i a, int imm8) { + if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { + return sse::add_epi16(a, a); + } -__m128i sse::cmpge_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epi32(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, int32_t()), b, uint32_t()); -#else - return Not(cmpgt(b, a, int32_t())); -#endif + return _mm_slli_epi16(a, imm8); } +__m128i sse::slli_epi32(__m128i a, int imm8) { + if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { + return sse::add_epi32(a, a); + } -__m128i sse::cmpge_epu8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epu8(a, b); -#else - return cmpeq(min(a, b, uint8_t()), b, uint8_t()); -#endif + return _mm_slli_epi32(a, imm8); } +__m128i sse::slli_epi64(__m128i a, int imm8) { + if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { + return sse::add_epi64(a, a); + } -__m128i sse::cmpge_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epu16(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, uint16_t()), b, uint16_t()); -#else - return logical_not(subs(b, a, uint16_t()), uint16_t()); -#endif + return _mm_slli_epi64(a, imm8); } -__m128i sse::cmpge_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epu32(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, uint32_t()), b, uint32_t()); -#else - return Not(cmpgt(b, a, uint32_t())); -#endif +__m128i sse::slli(__m128i a, int imm8, int16_t) { return slli_epi16(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, int32_t) { return slli_epi32(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, int64_t) { return slli_epi64(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, uint16_t) { return slli_epi16(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, uint32_t) { return slli_epi32(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, uint64_t) { return slli_epi64(a, imm8); } + +__m128i sse::sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } +__m128i sse::sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } + +__m128i sse::sra(__m128i a, __m128i b, int16_t) { return sra_epi16(a, b); } +__m128i sse::sra(__m128i a, __m128i b, int32_t) { return sra_epi32(a, b); } + +__m128i sse::srai_epi16(__m128i a, int imm8) { return _mm_srai_epi16(a, 
imm8); }
+__m128i sse::srai_epi32(__m128i a, int imm8) { return _mm_srai_epi32(a, imm8); }
+
+__m128i sse::srai(__m128i a, int imm8, int16_t) { return srai_epi16(a, imm8); }
+__m128i sse::srai(__m128i a, int imm8, int32_t) { return srai_epi32(a, imm8); }
+
+__m128i sse::srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
+__m128i sse::srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
+__m128i sse::srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
+
+__m128i sse::srl(__m128i a, __m128i b, int16_t) { return srl_epi16(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, int32_t) { return srl_epi32(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, int64_t) { return srl_epi64(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, uint16_t) { return srl_epi16(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, uint32_t) { return srl_epi32(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, uint64_t) { return srl_epi64(a, b); }
+
+template <int imm8>
+__m128i sse::srli(__m128i v) {
+    return _mm_srli_si128(v, imm8);
+}
+__m128i sse::srli_epi8(__m128i a, int imm8) {
+    return And(srli_epi16(a, imm8), sse_detail::srli_epi8_mask[imm8]);
+}
+__m128i sse::srli_epi16(__m128i a, int imm8) { return _mm_srli_epi16(a, imm8); }
+__m128i sse::srli_epi32(__m128i a, int imm8) { return _mm_srli_epi32(a, imm8); }
+__m128i sse::srli_epi64(__m128i a, int imm8) { return _mm_srli_epi64(a, imm8); }
+
+__m128i sse::srli(__m128i a, int imm8, int8_t) { return srli_epi8(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, int16_t) { return srli_epi16(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, int32_t) { return srli_epi32(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, int64_t) { return srli_epi64(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint8_t) { return srli_epi8(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint16_t) { return srli_epi16(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint32_t) { return srli_epi32(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint64_t) { return srli_epi64(a, imm8); }
+
+void sse::stream(__m128i *ptr, __m128i v) { _mm_stream_si128(ptr, v); }
+
+void sse::store(void *ptr, __m128i val) {
+    _mm_store_si128(static_cast<__m128i *>(ptr), val);
+}
+void sse::storeu(void *ptr, __m128i val) {
+    _mm_storeu_si128(static_cast<__m128i *>(ptr), val);
+}
+
+__m128i sse::sub_epi8(__m128i a, __m128i b) { return _mm_sub_epi8(a, b); }
+__m128i sse::sub_epi16(__m128i a, __m128i b) {
return _mm_sub_epi16(a, b); } +__m128i sse::sub_epi32(__m128i a, __m128i b) { return _mm_sub_epi32(a, b); } +__m128i sse::sub_epi64(__m128i a, __m128i b) { return _mm_sub_epi64(a, b); } -__m128i sse::cmpgt_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comgt_epu32(a, b); -#else - return cmpgt_epi32(Xor(a, setmin_epi32()), Xor(b, setmin_epi32())); -#endif -} +__m128i sse::sub(__m128i a, __m128i b, int8_t) { return sub_epi8(a, b); } +__m128i sse::sub(__m128i a, __m128i b, int16_t) { return sub_epi16(a, b); } +__m128i sse::sub(__m128i a, __m128i b, int32_t) { return sub_epi32(a, b); } +__m128i sse::sub(__m128i a, __m128i b, int64_t) { return sub_epi64(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint8_t) { return sub_epi8(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint16_t) { return sub_epi16(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint32_t) { return sub_epi32(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint64_t) { return sub_epi64(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, int8_t) { return cmpgt_epi8(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, int16_t) { return cmpgt_epi16(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, int32_t) { return cmpgt_epi32(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, uint8_t) { return cmpgt_epu8(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, uint16_t) { return cmpgt_epu16(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, uint32_t) { return cmpgt_epu32(a, b); } +__m128i sse::subs_epi8(__m128i a, __m128i b) { return _mm_subs_epi8(a, b); } +__m128i sse::subs_epi16(__m128i a, __m128i b) { return _mm_subs_epi16(a, b); } -__m128i sse::cmple_epi8(__m128i a, __m128i b) { return cmpge_epi8(b, a); } -__m128i sse::cmple_epi16(__m128i a, __m128i b) { return cmpge_epi16(b, a); } -__m128i sse::cmple_epi32(__m128i a, __m128i b) { return cmpge_epi32(b, a); } +__m128i sse::subs_epu8(__m128i a, __m128i b) { return _mm_subs_epu8(a, b); } +__m128i sse::subs_epu16(__m128i a, __m128i b) { return _mm_subs_epu16(a, b); } -__m128i sse::cmple_epu8(__m128i a, __m128i b) { return cmpge_epu8(b, a); } -__m128i sse::cmple_epu16(__m128i a, __m128i b) { return cmpge_epu16(b, a); } -__m128i sse::cmple_epu32(__m128i a, __m128i b) { return cmpge_epu32(b, a); } +__m128i sse::subs(__m128i a, __m128i b, int8_t) { return subs_epi8(a, b); } +__m128i sse::subs(__m128i a, __m128i b, int16_t) { return subs_epi16(a, b); } +__m128i sse::subs(__m128i a, __m128i b, uint8_t) { return subs_epu8(a, b); } +__m128i sse::subs(__m128i a, __m128i b, uint16_t) { return subs_epu16(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, int8_t) { return cmple_epi8(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, int16_t) { return cmple_epi16(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, int32_t) { return cmple_epi32(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, uint8_t) { return cmple_epu8(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, uint16_t) { return cmple_epu16(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, uint32_t) { return cmple_epu32(a, b); } +__m128i sse::unpackhi_epi8(__m128i a, __m128i b) { return _mm_unpackhi_epi8(a, b); } +__m128i sse::unpackhi_epi16(__m128i a, __m128i b) { return _mm_unpackhi_epi16(a, b); } +__m128i sse::unpackhi_epi32(__m128i a, __m128i b) { return _mm_unpackhi_epi32(a, b); } +__m128i sse::unpackhi_epi64(__m128i a, __m128i b) { return _mm_unpackhi_epi64(a, b); } -__m128i sse::cmplt_epi8(__m128i a, __m128i b) { return _mm_cmplt_epi8(a, b); } -__m128i sse::cmplt_epi16(__m128i a, __m128i b) { return 
_mm_cmplt_epi16(a, b); } -__m128i sse::cmplt_epi32(__m128i a, __m128i b) { return _mm_cmplt_epi32(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int8_t) { return unpackhi_epi8(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int16_t) { return unpackhi_epi16(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int32_t) { return unpackhi_epi32(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int64_t) { return unpackhi_epi64(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint8_t) { return unpackhi_epi8(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint16_t) { return unpackhi_epi16(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint32_t) { return unpackhi_epi32(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint64_t) { return unpackhi_epi64(a, b); } -__m128i sse::cmplt_epu8(__m128i a, __m128i b) { return cmpgt_epu8(b, a); } -__m128i sse::cmplt_epu16(__m128i a, __m128i b) { return cmpgt_epu16(b, a); } -__m128i sse::cmplt_epu32(__m128i a, __m128i b) { return cmpgt_epu32(b, a); } +__m128i sse::unpacklo_epi8(__m128i a, __m128i b) { return _mm_unpacklo_epi8(a, b); } +__m128i sse::unpacklo_epi16(__m128i a, __m128i b) { return _mm_unpacklo_epi16(a, b); } +__m128i sse::unpacklo_epi32(__m128i a, __m128i b) { return _mm_unpacklo_epi32(a, b); } +__m128i sse::unpacklo_epi64(__m128i a, __m128i b) { return _mm_unpacklo_epi64(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, int8_t) { return cmplt_epi8(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, int16_t) { return cmplt_epi16(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, int32_t) { return cmplt_epi32(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, uint8_t) { return cmplt_epu8(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, uint16_t) { return cmplt_epu16(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, uint32_t) { return cmplt_epu32(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int8_t) { return unpacklo_epi8(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int16_t) { return unpacklo_epi16(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int32_t) { return unpacklo_epi32(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int64_t) { return unpacklo_epi64(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, uint8_t) { return unpacklo_epi8(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, uint16_t) { return unpacklo_epi16(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, uint32_t) { return unpacklo_epi32(a, b); } -__m128i sse::cmpne_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comneq_epi8(a, b); -#else - return Not(cmpeq_epi8(a, b)); -#endif -} +__m128i sse::Xor(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } -__m128i sse::cmpne_epi16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comneq_epi16(a, b); -#else - return Not(cmpeq_epi16(a, b)); #endif -} -__m128i sse::cmpne_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comneq_epi32(a, b); -#else - return Not(cmpeq_epi32(a, b)); -#endif -} +#if WJR_HAS_SIMD(SSE3) -__m128i sse::cmpne(__m128i a, __m128i b, int8_t) { return cmpne_epi8(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, int16_t) { return cmpne_epi16(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, int32_t) { return cmpne_epi32(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, uint8_t) { return cmpne_epi8(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, uint16_t) { return cmpne_epi16(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, uint32_t) { return cmpne_epi32(a, b); } +__m128i sse::lddqu(const 
__m128i *ptr) { return _mm_lddqu_si128(ptr); } -template -__m128i sse::cmp(__m128i a, __m128i b, std::equal_to<>, T) { - return cmpeq(a, b, T()); -} +#endif -template -__m128i sse::cmp(__m128i a, __m128i b, std::not_equal_to<>, T) { - return cmpne(a, b, T()); -} +#if WJR_HAS_SIMD(SSSE3) -template -__m128i sse::cmp(__m128i a, __m128i b, std::greater<>, T) { - return cmpgt(a, b, T()); -} +__m128i sse::abs_epi8(__m128i val) { return _mm_abs_epi8(val); } +__m128i sse::abs_epi16(__m128i val) { return _mm_abs_epi16(val); } +__m128i sse::abs_epi32(__m128i val) { return _mm_abs_epi32(val); } -template -__m128i sse::cmp(__m128i a, __m128i b, std::greater_equal<>, T) { - return cmpge(a, b, T()); -} +__m128i sse::abs(__m128i val, int8_t) { return abs_epi8(val); } +__m128i sse::abs(__m128i val, int16_t) { return abs_epi16(val); } +__m128i sse::abs(__m128i val, int32_t) { return abs_epi32(val); } +__m128i sse::abs(__m128i val, uint8_t) { return val; } +__m128i sse::abs(__m128i val, uint16_t) { return val; } +__m128i sse::abs(__m128i val, uint32_t) { return val; } -template -__m128i sse::cmp(__m128i a, __m128i b, std::less<>, T) { - return cmplt(a, b, T()); -} +__m128i sse::shuffle_epi8(__m128i v, __m128i imm8) { return _mm_shuffle_epi8(v, imm8); } -template -__m128i sse::cmp(__m128i a, __m128i b, std::less_equal<>, T) { - return cmple(a, b, T()); -} +__m128i sse::sign_epi8(__m128i a, __m128i b) { return _mm_sign_epi8(a, b); } +__m128i sse::sign_epi16(__m128i a, __m128i b) { return _mm_sign_epi16(a, b); } +__m128i sse::sign_epi32(__m128i a, __m128i b) { return _mm_sign_epi32(a, b); } -__m128i sse::concat(uint64_t lo, uint64_t hi) { return set_epi64x(hi, lo); } +__m128i sse::sign(__m128i a, __m128i b, int8_t) { return sign_epi8(a, b); } +__m128i sse::sign(__m128i a, __m128i b, int16_t) { return sign_epi16(a, b); } +__m128i sse::sign(__m128i a, __m128i b, int32_t) { return sign_epi32(a, b); } +__m128i sse::sign(__m128i a, __m128i b, uint8_t) { return sign_epi8(a, b); } +__m128i sse::sign(__m128i a, __m128i b, uint16_t) { return sign_epi16(a, b); } +__m128i sse::sign(__m128i a, __m128i b, uint32_t) { return sign_epi32(a, b); } -template -int sse::extract_epi8(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 16, "imm8 must be in range [0, 15]"); -#if WJR_HAS_SIMD(SSE4_1) - return _mm_extract_epi8(a, imm8); -#else - if constexpr (imm8 & 1) { - return extract_epi16<(imm8 >> 1)>(a) >> 8; - } else { - return extract_epi16<(imm8 >> 1)>(a) & 0xff; - } #endif -} - -template -int sse::extract_epi16(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 8, "imm8 must be in range [0, 7]"); - return _mm_extract_epi16(a, imm8); -} -template -int sse::extract_epi32(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 4, "imm8 must be in range [0, 3]"); #if WJR_HAS_SIMD(SSE4_1) - return _mm_extract_epi32(a, imm8); -#else - if constexpr (imm8 == 0) { - return simd_cast<__m128i_t, uint32_t>(a); - } else if constexpr (imm8 == 1) { - return static_cast(simd_cast<__m128i_t, uint64_t>(a) >> 32); - } else if constexpr (imm8 == 2) { - return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - } else { - return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 3, 3, 3)>(a)); - } -#endif -} template -int64_t sse::extract_epi64(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 2, "imm8 must be in range [0, 1]"); -#if WJR_HAS_SIMD(SSE4_1) - return _mm_extract_epi64(a, imm8); -#else - if constexpr (imm8 == 0) { - return simd_cast<__m128i_t, uint64_t>(a); - } else { - return simd_cast<__m128i_t, 
uint64_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - } -#endif +__m128i sse::blend_epi16(__m128i a, __m128i b) { + return _mm_blend_epi16(a, b, imm8); } -template -int sse::extract(__m128i a, int8_t) { - return extract_epi8(a); -} +__m128i sse::cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); } -template -int sse::extract(__m128i a, int16_t) { - return extract_epi16(a); -} +__m128i sse::cmpeq(__m128i a, __m128i b, int64_t) { return cmpeq_epi64(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint64_t) { return cmpeq_epi64(a, b); } + +__m128i sse::cmpgt_epi64(__m128i a, __m128i b) { return _mm_cmpgt_epi64(a, b); } + +__m128i sse::cmpgt(__m128i a, __m128i b, int64_t) { return cmpgt_epi64(a, b); } template -int sse::extract(__m128i a, int32_t) { - return extract_epi32(a); +__m128i sse::insert_epi8(__m128i a, int i) { + return _mm_insert_epi8(a, i, imm8); } template -int64_t sse::extract(__m128i a, int64_t) { - return extract_epi64(a); +__m128i sse::insert_epi32(__m128i a, int i) { + return _mm_insert_epi32(a, i, imm8); } template -int sse::extract(__m128i a, uint8_t) { - return extract_epi8(a); +__m128i sse::insert_epi64(__m128i a, int64_t i) { + return _mm_insert_epi64(a, i, imm8); } template -int sse::extract(__m128i a, uint16_t) { - return extract_epi16(a); +__m128i sse::insert(__m128i a, int i, int8_t) { + return insert_epi8(a, i); } template -int sse::extract(__m128i a, uint32_t) { - return extract_epi32(a); +__m128i sse::insert(__m128i a, int i, int32_t) { + return insert_epi32(a, i); } template -int64_t sse::extract(__m128i a, uint64_t) { - return extract_epi64(a); +__m128i sse::insert(__m128i a, int64_t i, int64_t) { + return insert_epi64(a, i); } -uint64_t sse::getlow(__m128i v) { return simd_cast<__m128i_t, uint64_t>(v); } -uint64_t sse::gethigh(__m128i v) { return extract_epi64<1>(v); } - template -__m128i sse::insert_epi16(__m128i a, int i) { - return _mm_insert_epi16(a, i, imm8); +__m128i sse::insert(__m128i a, int i, uint8_t) { + return insert_epi8(a, i); } template -__m128i sse::insert(__m128i a, int i, int16_t) { - return insert_epi16(a, i); +__m128i sse::insert(__m128i a, int i, uint32_t) { + return insert_epi32(a, i); } template -__m128i sse::insert(__m128i a, int i, uint16_t) { - return insert_epi16(a, i); +__m128i sse::insert(__m128i a, int64_t i, uint64_t) { + return insert_epi64(a, i); } -void sse::lfence() { _mm_lfence(); } +__m128i sse::minpos_epu16(__m128i a) { return _mm_minpos_epu16(a); } -__m128i sse::load(const void *ptr) { - return _mm_load_si128(static_cast(ptr)); -} -__m128i sse::loadu(const void *ptr) { - return _mm_loadu_si128(static_cast(ptr)); -} -__m128i sse::loadu_si16(const void *ptr) { - return simd_cast(read_memory(ptr)); -} +__m128i sse::mul_epi32(__m128i a, __m128i b) { return _mm_mul_epi32(a, b); } -__m128i sse::loadu_si32(const void *ptr) { - return simd_cast(read_memory(ptr)); -} +__m128i sse::mullo_epi32(__m128i a, __m128i b) { return _mm_mullo_epi32(a, b); } -__m128i sse::loadu_si64(const void *ptr) { - return simd_cast(read_memory(ptr)); -} +__m128i sse::packus_epi32(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); } -template )> -__m128i sse::logical_and(__m128i a, __m128i b, T) { - return Not(Or(logical_not(a, T()), logical_not(b, T()))); +__m128i sse::stream_load(void *p) { + return _mm_stream_load_si128(static_cast<__m128i *>(p)); } -template )> -__m128i sse::logical_not(__m128i v, T) { - auto Zero = zeros(); - return cmpeq(v, Zero, T()); -} +int sse::test_all_ones(__m128i a) { return _mm_test_all_ones(a); } 
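The insert/extract wrappers above turn the immediate-operand intrinsics into ordinary function templates, and the trailing unused tag argument selects the element width so generic code never has to spell out an _epiN suffix. A hypothetical caller (the function name is invented for illustration, and SSE4.1 is assumed to be enabled with this header already included):

    #include <cstdint>

    // Overwrites 32-bit lane 2 of v with x. The uint32_t() argument is a
    // tag whose value is never read; it only selects the overload.
    __m128i set_lane2(__m128i v, uint32_t x) {
        return wjr::sse::insert<2>(v, static_cast<int>(x), uint32_t());
    }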
-template )> -__m128i sse::logical_or(__m128i a, __m128i b, T) { - return Not(logical_not(Or(a, b), T())); -} +int sse::test_all_zeros(__m128i a, __m128i b) { return _mm_test_all_zeros(a, b); } -__m128i sse::madd_epi16(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); } +int sse::test_all_zeros(__m128i a) { return _mm_test_all_zeros(a, a); } -void sse::maskmoveu(__m128i a, __m128i mask, char *mem_addr) { - return _mm_maskmoveu_si128(a, mask, mem_addr); +int sse::test_mix_ones_zeros(__m128i a, __m128i b) { + return _mm_test_mix_ones_zeros(a, b); } -__m128i sse::max_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epi8(a, b); -#else - return blendv_epi8(b, a, cmpgt_epi8(a, b)); -#endif -} +int sse::testc(__m128i a, __m128i b) { return _mm_testc_si128(a, b); } -__m128i sse::max_epi16(__m128i a, __m128i b) { return _mm_max_epi16(a, b); } +int sse::testnzc(__m128i a, __m128i b) { return _mm_testnzc_si128(a, b); } + +int sse::testz(__m128i a, __m128i b) { return _mm_testz_si128(a, b); } -__m128i sse::max_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epi32(a, b); -#else - return blendv_epi8(b, a, cmpgt_epi32(a, b)); #endif -} -__m128i sse::max_epu8(__m128i a, __m128i b) { return _mm_max_epu8(a, b); } +} // namespace wjr -__m128i sse::max_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epu16(a, b); -#else - return add(subs_epu16(b, a), a, uint16_t()); -#endif -} +#endif // WJR_X86_SIMD_SSE_HPP__ -__m128i sse::max_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epu32(a, b); -#else - return blendv_epi8(b, a, cmpgt_epu32(a, b)); -#endif -} +namespace wjr { -__m128i sse::max(__m128i a, __m128i b, int8_t) { return max_epi8(a, b); } -__m128i sse::max(__m128i a, __m128i b, int16_t) { return max_epi16(a, b); } -__m128i sse::max(__m128i a, __m128i b, int32_t) { return max_epi32(a, b); } -__m128i sse::max(__m128i a, __m128i b, uint8_t) { return max_epu8(a, b); } -__m128i sse::max(__m128i a, __m128i b, uint16_t) { return max_epu16(a, b); } -__m128i sse::max(__m128i a, __m128i b, uint32_t) { return max_epu32(a, b); } +struct avx { + using mask_type = uint32_t; -int8_t sse::max_epi8(__m128i a) { return 0x7fu ^ min_epu8(Xor(a, set1_epi8(0x7fu))); } +#if WJR_HAS_SIMD(AVX) -int16_t sse::max_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0x7fffu ^ min_epu16(Xor(a, set1_epi16(0x7fffu))); -#else - a = max_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, int16_t>(a); -#endif -} + using float_type = __m256; + using float_tag_type = __m256_t; + using int_type = __m256i; + using int_tag_type = __m256i_t; + using double_type = __m256d; + using double_tag_type = __m256d_t; -int32_t sse::max_epi32(__m128i a) { - a = max_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, int32_t>(a); -} +#endif // AVX -uint8_t sse::max_epu8(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0xffu ^ min_epu8(Xor(a, ones())); -#else - a = max_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - auto X = simd_cast<__m128i_t, uint32_t>(a); - return std::max((uint8_t)X, (uint8_t)(X >> 8)); -#endif -} + constexpr static size_t width(); + constexpr static 
mask_type mask(); -uint16_t sse::max_epu16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0xffffu ^ min_epu16(Xor(a, ones())); -#else - a = max_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, uint16_t>(a); -#endif -} +#if WJR_HAS_SIMD(AVX) -uint32_t sse::max_epu32(__m128i a) { - a = max_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, uint32_t>(a); -} + WJR_INTRINSIC_INLINE static __m256i concat(__m128i a, __m128i b); + + template + WJR_INTRINSIC_INLINE static int extract_epi32(__m256i v); + template + WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m256i v); + + template + WJR_INTRINSIC_INLINE static int extract(__m256i v, int32_t); + template + WJR_INTRINSIC_INLINE static int64_t extract(__m256i v, int64_t); + + template + WJR_INTRINSIC_INLINE static __m128i extract_si128(__m256i v); + + WJR_INTRINSIC_INLINE static __m128i getlow(__m256i a); + + WJR_INTRINSIC_INLINE static __m128i gethigh(__m256i a); + + template + WJR_INTRINSIC_INLINE static __m256i insert_epi8(__m256i v, int8_t i); + template + WJR_INTRINSIC_INLINE static __m256i insert_epi16(__m256i v, int16_t i); + template + WJR_INTRINSIC_INLINE static __m256i insert_epi32(__m256i v, int32_t i); + template + WJR_INTRINSIC_INLINE static __m256i insert_epi64(__m256i v, int64_t i); + + template + WJR_INTRINSIC_INLINE static __m256i insert_si128(__m256i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m256i load(const void *p); + WJR_INTRINSIC_INLINE static __m256i loadu(const void *p); + + WJR_INTRINSIC_INLINE static __m256i ones(); + + WJR_INTRINSIC_INLINE static __m256i loadu_si16(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si32(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si48(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si64(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si80(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si96(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si112(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si128(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si144(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si160(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si176(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si192(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si208(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si224(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si240(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si256(const void *ptr); + + WJR_INTRINSIC_INLINE static __m256i loadu_si16x(const void *ptr, int n); + + WJR_INTRINSIC_INLINE static __m256i + set_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, + char e24, char e23, char e22, char e21, char e20, char e19, char e18, + char e17, char e16, char e15, char e14, char e13, char e12, char e11, + char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, + char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m256i set_epi16(short e15, short e14, short e13, + short e12, short e11, short e10, + short e9, short e8, short e7, short e6, + short e5, short e4, short e3, short e2, + short e1, short e0); + + 
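// --- Usage sketch (editorial illustration; not part of the patch) ---
// loadu_si16x above generalizes the fixed-width loadu_siN loads: it reads
// n 16-bit units, so a buffer tail can be brought into a __m256i without
// touching bytes past the end. The helper `load_u16_tail` is hypothetical
// and assumes the AVX2 path is compiled in; the caller clamps n because
// 16 units of 16 bits already fill the whole register.
inline __m256i load_u16_tail(const uint16_t *p, size_t n) {
    return wjr::avx::loadu_si16x(p, static_cast<int>(n < 16 ? n : 16));
}
// --- end sketch ---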
WJR_INTRINSIC_INLINE static __m256i set_epi32(int e7, int e6, int e5, int e4, int e3, + int e2, int e1, int e0); + + WJR_INTRINSIC_INLINE static __m256i set_epi64x(long long e3, long long e2, + long long e1, long long e0); + + WJR_INTRINSIC_INLINE static __m256i + setr_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, + char e24, char e23, char e22, char e21, char e20, char e19, char e18, + char e17, char e16, char e15, char e14, char e13, char e12, char e11, + char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, + char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m256i setr_epi16(short e15, short e14, short e13, + short e12, short e11, short e10, + short e9, short e8, short e7, short e6, + short e5, short e4, short e3, short e2, + short e1, short e0); + + WJR_INTRINSIC_INLINE static __m256i setr_epi32(int e7, int e6, int e5, int e4, int e3, + int e2, int e1, int e0); + + WJR_INTRINSIC_INLINE static __m256i setr_epi64x(long long e3, long long e2, + long long e1, long long e0); + + WJR_INTRINSIC_INLINE static __m256i set1_epi8(int8_t a); + WJR_INTRINSIC_INLINE static __m256i set1_epi16(int16_t a); + WJR_INTRINSIC_INLINE static __m256i set1_epi32(int32_t a); + WJR_INTRINSIC_INLINE static __m256i set1_epi64(int64_t a); + + WJR_INTRINSIC_INLINE static __m256i set1(int8_t a, int8_t); + WJR_INTRINSIC_INLINE static __m256i set1(int16_t a, int16_t); + WJR_INTRINSIC_INLINE static __m256i set1(int32_t a, int32_t); + WJR_INTRINSIC_INLINE static __m256i set1(int64_t a, int64_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint8_t a, uint8_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint16_t a, uint16_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint32_t a, uint32_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint64_t a, uint64_t); + + WJR_INTRINSIC_INLINE static __m256i setmin_epi8(); + WJR_INTRINSIC_INLINE static __m256i setmin_epi16(); + WJR_INTRINSIC_INLINE static __m256i setmin_epi32(); + WJR_INTRINSIC_INLINE static __m256i setmin_epi64(); -int8_t sse::max(__m128i a, int8_t) { return max_epi8(a); } -int16_t sse::max(__m128i a, int16_t) { return max_epi16(a); } -int32_t sse::max(__m128i a, int32_t) { return max_epi32(a); } -uint8_t sse::max(__m128i a, uint8_t) { return max_epu8(a); } -uint16_t sse::max(__m128i a, uint16_t) { return max_epu16(a); } -uint32_t sse::max(__m128i a, uint32_t) { return max_epu32(a); } + WJR_INTRINSIC_INLINE static __m256i setmin(int8_t); + WJR_INTRINSIC_INLINE static __m256i setmin(int16_t); + WJR_INTRINSIC_INLINE static __m256i setmin(int32_t); + WJR_INTRINSIC_INLINE static __m256i setmin(int64_t); -void sse::mfence() { _mm_mfence(); } + WJR_INTRINSIC_INLINE static __m256i setmax_epi8(); + WJR_INTRINSIC_INLINE static __m256i setmax_epi16(); + WJR_INTRINSIC_INLINE static __m256i setmax_epi32(); + WJR_INTRINSIC_INLINE static __m256i setmax_epi64(); -__m128i sse::min_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epi8(a, b); -#else - return blendv_epi8(a, b, cmpgt_epi8(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static __m256i setmax(int8_t); + WJR_INTRINSIC_INLINE static __m256i setmax(int16_t); + WJR_INTRINSIC_INLINE static __m256i setmax(int32_t); + WJR_INTRINSIC_INLINE static __m256i setmax(int64_t); -__m128i sse::min_epi16(__m128i a, __m128i b) { return _mm_min_epi16(a, b); } + WJR_INTRINSIC_INLINE static void stream(__m256i *p, __m256i a); -__m128i sse::min_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epi32(a, b); -#else - return blendv_epi8(a, b, 
cmpgt_epi32(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static void store(void *p, __m256i a); + WJR_INTRINSIC_INLINE static void storeu(void *p, __m256i a); -__m128i sse::min_epu8(__m128i a, __m128i b) { return _mm_min_epu8(a, b); } + WJR_INTRINSIC_INLINE static int test_all_zeros(__m256i a); -__m128i sse::min_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epu16(a, b); -#else - return blendv_epi8(a, b, cmpgt_epu16(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static int testc(__m256i a, __m256i b); -__m128i sse::min_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epu32(a, b); -#else - return blendv_epi8(a, b, cmpgt_epu32(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static int testnzc(__m256i a, __m256i b); -__m128i sse::min(__m128i a, __m128i b, int8_t) { return min_epi8(a, b); } -__m128i sse::min(__m128i a, __m128i b, int16_t) { return min_epi16(a, b); } -__m128i sse::min(__m128i a, __m128i b, int32_t) { return min_epi32(a, b); } -__m128i sse::min(__m128i a, __m128i b, uint8_t) { return min_epu8(a, b); } -__m128i sse::min(__m128i a, __m128i b, uint16_t) { return min_epu16(a, b); } -__m128i sse::min(__m128i a, __m128i b, uint32_t) { return min_epu32(a, b); } + WJR_INTRINSIC_INLINE static int testz(__m256i a, __m256i b); -int8_t sse::min_epi8(__m128i a) { return 0x80u ^ min_epu8(Xor(a, setmin_epi8())); } + WJR_INTRINSIC_INLINE static __m256i zeros(); -int16_t sse::min_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0x8000u ^ min_epu16(Xor(a, setmin_epi16())); -#else - a = min_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, int16_t>(a); -#endif -} +#endif // AVX -int32_t sse::min_epi32(__m128i a) { - a = min_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, int32_t>(a); -} +#if WJR_HAS_SIMD(AVX2) -uint8_t sse::min_epu8(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - a = min_epu8(a, srli_epi16(a, 8)); - a = _mm_minpos_epu16(a); - return simd_cast<__m128i_t, uint8_t>(a); -#else - a = min_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - auto X = simd_cast<__m128i_t, uint32_t>(a); - return std::min((uint8_t)X, (uint8_t)(X >> 8)); -#endif -} + WJR_INTRINSIC_INLINE static __m256i And(__m256i a, __m256i b); -uint16_t sse::min_epu16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return simd_cast<__m128i_t, uint16_t>(_mm_minpos_epu16(a)); -#else - a = min_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, uint16_t>(a); -#endif -} + WJR_INTRINSIC_INLINE static __m256i AndNot(__m256i a, __m256i b); -uint32_t sse::min_epu32(__m128i a) { - a = min_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, uint32_t>(a); -} + WJR_INTRINSIC_INLINE static __m256i Or(__m256i a, __m256i b); -int8_t sse::min(__m128i a, int8_t) { return min_epi8(a); } -int16_t sse::min(__m128i a, int16_t) { return min_epi16(a); } -int32_t sse::min(__m128i a, int32_t) { return min_epi32(a); } -uint8_t sse::min(__m128i a, uint8_t) { return min_epu8(a); } 
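// --- Editorial note (illustration; not part of the patch) ---
// The SSE4.1 branch of the min_epu8(__m128i) reduction above relies on a
// trick worth spelling out: srli_epi16(a, 8) zeroes the high byte of every
// 16-bit lane, so after min_epu8 each lane holds min(low byte, high byte)
// as a 16-bit value, and the _mm_minpos_epu16 reduction then returns the
// byte minimum. A standalone sketch of the same idea, using the sse wrappers:
inline uint8_t min_byte_sketch(__m128i a) {
    a = wjr::sse::min_epu8(a, wjr::sse::srli_epi16(a, 8));
    return static_cast<uint8_t>(wjr::sse::min_epu16(a)); // minpos-based reduce
}
// --- end sketch ---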
-uint16_t sse::min(__m128i a, uint16_t) { return min_epu16(a); } -uint32_t sse::min(__m128i a, uint32_t) { return min_epu32(a); } + WJR_INTRINSIC_INLINE static __m256i Xor(__m256i a, __m256i b); -__m128i sse::move_epi64(__m128i a) { return _mm_move_epi64(a); } + WJR_INTRINSIC_INLINE static __m256i Not(__m256i v); -sse::mask_type sse::movemask_epi8(__m128i a) { - return static_cast(_mm_movemask_epi8(a)); -} -sse::mask_type sse::movemask_pd(__m128d v) { - return static_cast(_mm_movemask_pd(v)); -} + WJR_INTRINSIC_INLINE static __m256i abs_epi8(__m256i v); + WJR_INTRINSIC_INLINE static __m256i abs_epi16(__m256i v); + WJR_INTRINSIC_INLINE static __m256i abs_epi32(__m256i v); -sse::mask_type sse::movemask(__m128i v, int8_t) { return movemask_epi8(v); } -sse::mask_type sse::movemask(__m128i v, int32_t) { - return movemask_ps(simd_cast<__m128i_t, __m128_t>(v)); -} -sse::mask_type sse::movemask(__m128i v, int64_t) { - return movemask_pd(simd_cast<__m128i_t, __m128d_t>(v)); -} -sse::mask_type sse::movemask(__m128i v, uint8_t) { return movemask(v, int8_t()); } -sse::mask_type sse::movemask(__m128i v, uint32_t) { return movemask(v, int32_t()); } -sse::mask_type sse::movemask(__m128i v, uint64_t) { return movemask(v, int64_t()); } + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int8_t); + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int16_t); + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int32_t); + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int64_t); -__m128i sse::mul_epu32(__m128i a, __m128i b) { return _mm_mul_epu32(a, b); } + WJR_INTRINSIC_INLINE static __m256i add_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i add_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i add_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i add_epi64(__m256i a, __m256i b); -__m128i sse::mulhi_epi16(__m128i a, __m128i b) { return _mm_mulhi_epi16(a, b); } + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint64_t); -__m128i sse::mulhi_epu16(__m128i a, __m128i b) { return _mm_mulhi_epu16(a, b); } + WJR_INTRINSIC_INLINE static uint8_t add_epu8(__m256i v); + WJR_INTRINSIC_INLINE static uint16_t add_epu16(__m256i v); + WJR_INTRINSIC_INLINE static uint32_t add_epu32(__m256i v); + WJR_INTRINSIC_INLINE static uint64_t add_epu64(__m256i v); -__m128i sse::mullo_epi16(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); } + WJR_INTRINSIC_INLINE static int8_t add_epi8(__m256i v); + WJR_INTRINSIC_INLINE static int16_t add_epi16(__m256i v); + WJR_INTRINSIC_INLINE static int32_t add_epi32(__m256i v); + WJR_INTRINSIC_INLINE static int64_t add_epi64(__m256i v); -__m128i sse::negate_epi8(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) - return sign_epi8(a, ones()); -#else - return sub_epi8(zeros(), a); -#endif -} + WJR_INTRINSIC_INLINE static int8_t add(__m256i v, int8_t); + WJR_INTRINSIC_INLINE static int16_t add(__m256i v, int16_t); + WJR_INTRINSIC_INLINE static int32_t add(__m256i v, int32_t); + WJR_INTRINSIC_INLINE static int64_t add(__m256i v, 
int64_t); + WJR_INTRINSIC_INLINE static uint8_t add(__m256i v, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t add(__m256i v, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t add(__m256i v, uint32_t); + WJR_INTRINSIC_INLINE static uint64_t add(__m256i v, uint64_t); -__m128i sse::negate_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) - return sign_epi16(a, ones()); -#else - return sub_epi16(zeros(), a); -#endif -} + WJR_INTRINSIC_INLINE static __m256i adds_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i adds_epi16(__m256i a, __m256i b); -__m128i sse::negate_epi32(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) - return sign_epi32(a, ones()); -#else - return sub_epi32(zeros(), a); -#endif -} + WJR_INTRINSIC_INLINE static __m256i adds_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i adds_epu16(__m256i a, __m256i b); -__m128i sse::negate_epi64(__m128i a) { return sub_epi64(zeros(), a); } + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint16_t); -__m128i sse::negate(__m128i a, int8_t) { return negate_epi8(a); } -__m128i sse::negate(__m128i a, int16_t) { return negate_epi16(a); } -__m128i sse::negate(__m128i a, int32_t) { return negate_epi32(a); } -__m128i sse::negate(__m128i a, int64_t) { return negate_epi64(a); } -__m128i sse::negate(__m128i a, uint8_t) { return negate_epi8(a); } -__m128i sse::negate(__m128i a, uint16_t) { return negate_epi16(a); } -__m128i sse::negate(__m128i a, uint32_t) { return negate_epi32(a); } -__m128i sse::negate(__m128i a, uint64_t) { return negate_epi64(a); } + template + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b); -__m128i sse::Not(__m128i v) { return Xor(v, ones()); } + WJR_INTRINSIC_INLINE static __m256i alignr_epi16(__m256i a, __m256i b, int c); + WJR_INTRINSIC_INLINE static __m256i alignr_epi32(__m256i a, __m256i b, int c); + WJR_INTRINSIC_INLINE static __m256i alignr_epi64(__m256i a, __m256i b, int c); -__m128i sse::Or(__m128i a, __m128i b) { return _mm_or_si128(a, b); } + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int16_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int32_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int64_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint16_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint32_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint64_t); -__m128i sse::packs_epi16(__m128i a, __m128i b) { return _mm_packs_epi16(a, b); } -__m128i sse::packs_epi32(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i avg_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i avg_epu16(__m256i a, __m256i b); -__m128i sse::packus_epi16(__m128i a, __m128i b) { return _mm_packus_epi16(a, b); } + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint16_t); -__m128i sse::loadu_si48(const void *ptr) { - return insert_epi16<2>(loadu_si32(ptr), reinterpret_cast(ptr)[2]); -} + template + 
WJR_INTRINSIC_INLINE static __m256i blend_epi16(__m256i a, __m256i b); + template + WJR_INTRINSIC_INLINE static __m256i blend_epi32(__m256i a, __m256i b); -__m128i sse::loadu_si80(const void *ptr) { - return insert_epi16<4>(loadu_si64(ptr), reinterpret_cast(ptr)[4]); -} + WJR_INTRINSIC_INLINE static __m256i blendv_epi8(__m256i a, __m256i b, __m256i mask); -__m128i sse::loadu_si96(const void *ptr) { -#if WJR_HAS_SIMD(SSE4_1) - return insert_epi32<2>(loadu_si64(ptr), reinterpret_cast(ptr)[2]); -#else - return insert_epi16<5>(loadu_si80(ptr), reinterpret_cast(ptr)[5]); -#endif -} + template + WJR_INTRINSIC_INLINE static __m256i bslli_epi128(__m256i a); -__m128i sse::loadu_si112(const void *ptr) { - return insert_epi16<6>(loadu_si96(ptr), reinterpret_cast(ptr)[6]); -} + template + WJR_INTRINSIC_INLINE static __m256i bsrli_epi128(__m256i a); -__m128i sse::loadu_si128(const void *ptr) { return loadu(ptr); } + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi64(__m256i a, __m256i b); -__m128i sse::loadu_si16x(const void *ptr, int n) { - switch (n) { - case 0: - return zeros(); - case 1: - return loadu_si16(ptr); - case 2: - return loadu_si32(ptr); - case 3: - return loadu_si48(ptr); - case 4: - return loadu_si64(ptr); - case 5: - return loadu_si80(ptr); - case 6: - return loadu_si96(ptr); - case 7: - return loadu_si112(ptr); - default: - return loadu_si128(ptr); - } -} + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint64_t); -__m128i sse::sad_epu8(__m128i a, __m128i b) { return _mm_sad_epu8(a, b); } + WJR_INTRINSIC_INLINE static __m256i cmpge_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epi32(__m256i a, __m256i b); -__m128i sse::zeros() { return _mm_setzero_si128(); } -__m128i sse::ones() { return _mm_set1_epi32(-1); } + WJR_INTRINSIC_INLINE static __m256i cmpge_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epu32(__m256i a, __m256i b); -__m128i sse::set_epi8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, - char e8, char e7, char e6, char e5, char e4, char e3, char e2, - char e1, char e0) { - return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, - e0); -} + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, 
uint32_t); -__m128i sse::set_epi16(short e7, short e6, short e5, short e4, short e3, short e2, - short e1, short e0) { - return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); -} -__m128i sse::set_epi32(int e3, int e2, int e1, int e0) { - return _mm_set_epi32(e3, e2, e1, e0); -} -__m128i sse::set_epi64x(long long e1, long long e0) { return _mm_set_epi64x(e1, e0); } + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi64(__m256i a, __m256i b); -__m128i sse::setr_epi8(char e15, char e14, char e13, char e12, char e11, char e10, - char e9, char e8, char e7, char e6, char e5, char e4, char e3, - char e2, char e1, char e0) { - return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, - e0); -} + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu64(__m256i a, __m256i b); -__m128i sse::setr_epi16(short e7, short e6, short e5, short e4, short e3, short e2, - short e1, short e0) { - return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); -} -__m128i sse::setr_epi32(int e3, int e2, int e1, int e0) { - return _mm_setr_epi32(e3, e2, e1, e0); -} + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint64_t); -__m128i sse::set1_epi8(int8_t val) { return _mm_set1_epi8(val); } -__m128i sse::set1_epi16(int16_t val) { return _mm_set1_epi16(val); } -__m128i sse::set1_epi32(int32_t val) { return _mm_set1_epi32(val); } -__m128i sse::set1_epi64(int64_t val) { return _mm_set1_epi64x(val); } + WJR_INTRINSIC_INLINE static __m256i cmple_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epi32(__m256i a, __m256i b); -__m128i sse::set1(int8_t val, int8_t) { return set1_epi8(val); } -__m128i sse::set1(int16_t val, int16_t) { return set1_epi16(val); } -__m128i sse::set1(int32_t val, int32_t) { return set1_epi32(val); } -__m128i sse::set1(int64_t val, int64_t) { return set1_epi64(val); } -__m128i sse::set1(uint8_t val, uint8_t) { return set1_epi8(val); } -__m128i sse::set1(uint16_t val, uint16_t) { return set1_epi16(val); } -__m128i sse::set1(uint32_t val, uint32_t) { return set1_epi32(val); } -__m128i sse::set1(uint64_t val, uint64_t) { return set1_epi64(val); } + WJR_INTRINSIC_INLINE static __m256i cmple_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epu32(__m256i a, __m256i b); -__m128i sse::setmin_epi8() { return set1_epi8(0x80u); } -__m128i sse::setmin_epi16() { return set1_epi16(0x8000u); } -__m128i sse::setmin_epi32() { return 
set1_epi32(0x80000000u); } + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint32_t); -__m128i sse::setmin(int8_t) { return setmin_epi8(); } -__m128i sse::setmin(int16_t) { return setmin_epi16(); } -__m128i sse::setmin(int32_t) { return setmin_epi32(); } -__m128i sse::setmin(uint8_t) { return set1_epi32(0); } -__m128i sse::setmin(uint16_t) { return set1_epi32(0); } -__m128i sse::setmin(uint32_t) { return set1_epi32(0); } + WJR_INTRINSIC_INLINE static __m256i cmplt_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epi32(__m256i a, __m256i b); -__m128i sse::setmax_epi8() { return set1_epi8(0x7F); } -__m128i sse::setmax_epi16() { return set1_epi16(0x7FFF); } -__m128i sse::setmax_epi32() { return set1_epi32(0x7FFFFFFF); } + WJR_INTRINSIC_INLINE static __m256i cmplt_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epu32(__m256i a, __m256i b); -__m128i sse::setmax(int8_t) { return setmax_epi8(); } -__m128i sse::setmax(int16_t) { return setmax_epi16(); } -__m128i sse::setmax(int32_t) { return setmax_epi32(); } -__m128i sse::setmax(uint8_t) { return set1_epi32(0xFFFFFFFF); } -__m128i sse::setmax(uint16_t) { return set1_epi32(0xFFFFFFFF); } -__m128i sse::setmax(uint32_t) { return set1_epi32(0xFFFFFFFF); } + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint32_t); -template -__m128i sse::shl(__m128i a) { - if constexpr (imm >= 64) { - a = slli<8>(a); - a = slli_epi64(a, imm - 64); - return a; - } else { - auto b = slli_epi64(a, imm); - auto c = slli<8>(a); - c = srli_epi64(c, 64 - imm); - return Or(b, c); - } -} + WJR_INTRINSIC_INLINE static __m256i cmpne_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpne_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpne_epi32(__m256i a, __m256i b); -template -__m128i sse::shr(__m128i a) { - if constexpr (imm >= 64) { - a = srli<8>(a); - a = srli_epi64(a, imm - 64); - return a; - } else { - auto b = srli_epi64(a, imm); - auto c = srli<8>(a); - c = slli_epi64(c, 64 - imm); - return Or(b, c); - } -} + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint32_t); -template -__m128i sse::shuffle_epi32(__m128i v) { - static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in 
range [0, 255]"); - return _mm_shuffle_epi32(v, imm8); -} + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::not_equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater_equal<>, + T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less_equal<>, T); -template -__m128i sse::shufflehi_epi16(__m128i v) { - return _mm_shufflehi_epi16(v, imm8); -} + template + WJR_INTRINSIC_INLINE static int extract_epi8(__m256i v); + template + WJR_INTRINSIC_INLINE static int extract_epi16(__m256i v); -template -__m128i sse::shufflelo_epi16(__m128i v) { - return _mm_shufflelo_epi16(v, imm8); -} + template + WJR_INTRINSIC_INLINE static int extract(__m256i v, int8_t); + template + WJR_INTRINSIC_INLINE static int extract(__m256i v, int16_t); -__m128i sse::sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } -__m128i sse::sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } -__m128i sse::sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i hadd_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i hadd_epi32(__m256i a, __m256i b); -__m128i sse::sll(__m128i a, __m128i b, int16_t) { return sll_epi16(a, b); } -__m128i sse::sll(__m128i a, __m128i b, int32_t) { return sll_epi32(a, b); } -__m128i sse::sll(__m128i a, __m128i b, int64_t) { return sll_epi64(a, b); } -__m128i sse::sll(__m128i a, __m128i b, uint16_t) { return sll_epi16(a, b); } -__m128i sse::sll(__m128i a, __m128i b, uint32_t) { return sll_epi32(a, b); } -__m128i sse::sll(__m128i a, __m128i b, uint64_t) { return sll_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int32_t); -template -__m128i sse::slli(__m128i v) { - return _mm_slli_si128(v, imm8); -} -__m128i sse::slli_epi16(__m128i a, int imm8) { - if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { - return sse::add_epi16(a, a); - } + WJR_INTRINSIC_INLINE static __m256i hadds_epi16(__m256i a, __m256i b); - return _mm_slli_epi16(a, imm8); -} -__m128i sse::slli_epi32(__m128i a, int imm8) { - if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { - return sse::add_epi32(a, a); - } + WJR_INTRINSIC_INLINE static __m256i hsub_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i hsub_epi32(__m256i a, __m256i b); + + WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int32_t); - return _mm_slli_epi32(a, imm8); -} -__m128i sse::slli_epi64(__m128i a, int imm8) { - if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { - return sse::add_epi64(a, a); - } + WJR_INTRINSIC_INLINE static __m256i hsubs_epi16(__m256i a, __m256i b); - return _mm_slli_epi64(a, imm8); -} + template )> + WJR_INTRINSIC_INLINE static __m256i logical_and(__m256i a, __m256i b, T); -__m128i sse::slli(__m128i a, int imm8, int16_t) { return slli_epi16(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, int32_t) { return slli_epi32(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, int64_t) { return slli_epi64(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, uint16_t) { return slli_epi16(a, imm8); } 
-__m128i sse::slli(__m128i a, int imm8, uint32_t) { return slli_epi32(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, uint64_t) { return slli_epi64(a, imm8); } + template )> + WJR_INTRINSIC_INLINE static __m256i logical_not(__m256i v, T); -__m128i sse::sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } -__m128i sse::sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } + template )> + WJR_INTRINSIC_INLINE static __m256i logical_or(__m256i a, __m256i b, T); -__m128i sse::sra(__m128i a, __m128i b, int16_t) { return sra_epi16(a, b); } -__m128i sse::sra(__m128i a, __m128i b, int32_t) { return sra_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i madd_epi16(__m256i a, __m256i b); -__m128i sse::srai_epi16(__m128i a, int imm8) { return _mm_srai_epi16(a, imm8); } -__m128i sse::srai_epi32(__m128i a, int imm8) { return _mm_srai_epi32(a, imm8); } + WJR_INTRINSIC_INLINE static __m256i max_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epi32(__m256i a, __m256i b); -__m128i sse::srai(__m128i a, int imm8, int16_t) { return srai_epi16(a, imm8); } -__m128i sse::srai(__m128i a, int imm8, int32_t) { return srai_epi32(a, imm8); } + WJR_INTRINSIC_INLINE static __m256i max_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epu32(__m256i a, __m256i b); -__m128i sse::srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); } -__m128i sse::srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); } -__m128i sse::srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint32_t); -__m128i sse::srl(__m128i a, __m128i b, int16_t) { return srl_epi16(a, b); } -__m128i sse::srl(__m128i a, __m128i b, int32_t) { return srl_epi32(a, b); } -__m128i sse::srl(__m128i a, __m128i b, int64_t) { return srl_epi64(a, b); } -__m128i sse::srl(__m128i a, __m128i b, uint16_t) { return srl_epi16(a, b); } -__m128i sse::srl(__m128i a, __m128i b, uint32_t) { return srl_epi32(a, b); } -__m128i sse::srl(__m128i a, __m128i b, uint64_t) { return srl_epi64(a, b); } + WJR_INTRINSIC_INLINE static int8_t max_epi8(__m256i a); + WJR_INTRINSIC_INLINE static int16_t max_epi16(__m256i a); + WJR_INTRINSIC_INLINE static int32_t max_epi32(__m256i a); + WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m256i a); + WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m256i a); + WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m256i a); -template -__m128i sse::srli(__m128i v) { - return _mm_srli_si128(v, imm8); -} -__m128i sse::srli_epi8(__m128i a, int imm8) { - return And(srli_epi16(a, imm8), sse_detail::srli_epi8_mask[imm8]); -} -__m128i sse::srli_epi16(__m128i a, int imm8) { return _mm_srli_epi16(a, imm8); } -__m128i sse::srli_epi32(__m128i a, int imm8) { return _mm_srli_epi32(a, imm8); } -__m128i sse::srli_epi64(__m128i a, int imm8) { return _mm_srli_epi64(a, imm8); } + WJR_INTRINSIC_INLINE static int8_t max(__m256i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t max(__m256i a, int16_t); + 
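// --- Editorial note (illustration; not part of the patch) ---
// x86 has no 8-bit vector shift, so the srli_epi8 above emulates one: shift
// whole 16-bit lanes, then AND with set1_epi8(0xFF >> imm8) to clear the
// bits that crossed in from each lane's high byte; the srli_epi8_mask
// tables cache exactly those masks. Scalar model of one 16-bit lane:
inline uint16_t srli_epi8_model(uint16_t lane, int s) {
    const uint16_t m = static_cast<uint16_t>(((0xFFu >> s) << 8) | (0xFFu >> s));
    return static_cast<uint16_t>((lane >> s) & m); // == (hi >> s, lo >> s)
}
// --- end sketch ---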
WJR_INTRINSIC_INLINE static int32_t max(__m256i a, int32_t); -__m128i sse::srli(__m128i a, int imm8, int8_t) { return srli_epi8(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, int16_t) { return srli_epi16(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, int32_t) { return srli_epi32(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, int64_t) { return srli_epi64(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint8_t) { return srli_epi8(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint16_t) { return srli_epi16(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint32_t) { return srli_epi32(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint64_t) { return srli_epi64(a, imm8); } + WJR_INTRINSIC_INLINE static uint8_t max(__m256i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t max(__m256i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t max(__m256i a, uint32_t); -void sse::stream(__m128i *ptr, __m128i v) { _mm_stream_si128(ptr, v); } + WJR_INTRINSIC_INLINE static __m256i min_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epi32(__m256i a, __m256i b); -void sse::store(void *ptr, __m128i val) { - _mm_store_si128(static_cast<__m128i *>(ptr), val); -} -void sse::storeu(void *ptr, __m128i val) { - _mm_storeu_si128(static_cast<__m128i *>(ptr), val); -} + WJR_INTRINSIC_INLINE static __m256i min_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epu32(__m256i a, __m256i b); -__m128i sse::sub_epi8(__m128i a, __m128i b) { return _mm_sub_epi8(a, b); } -__m128i sse::sub_epi16(__m128i a, __m128i b) { return _mm_sub_epi16(a, b); } -__m128i sse::sub_epi32(__m128i a, __m128i b) { return _mm_sub_epi32(a, b); } -__m128i sse::sub_epi64(__m128i a, __m128i b) { return _mm_sub_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint32_t); -__m128i sse::sub(__m128i a, __m128i b, int8_t) { return sub_epi8(a, b); } -__m128i sse::sub(__m128i a, __m128i b, int16_t) { return sub_epi16(a, b); } -__m128i sse::sub(__m128i a, __m128i b, int32_t) { return sub_epi32(a, b); } -__m128i sse::sub(__m128i a, __m128i b, int64_t) { return sub_epi64(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint8_t) { return sub_epi8(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint16_t) { return sub_epi16(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint32_t) { return sub_epi32(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint64_t) { return sub_epi64(a, b); } + WJR_INTRINSIC_INLINE static int8_t min_epi8(__m256i a); + WJR_INTRINSIC_INLINE static int16_t min_epi16(__m256i a); + WJR_INTRINSIC_INLINE static int32_t min_epi32(__m256i a); -__m128i sse::subs_epi8(__m128i a, __m128i b) { return _mm_subs_epi8(a, b); } -__m128i sse::subs_epi16(__m128i a, __m128i b) { return _mm_subs_epi16(a, b); } + WJR_INTRINSIC_INLINE static uint8_t min_epu8(__m256i a); + WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m256i a); + WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m256i a); -__m128i sse::subs_epu8(__m128i a, __m128i b) { return 
_mm_subs_epu8(a, b); } -__m128i sse::subs_epu16(__m128i a, __m128i b) { return _mm_subs_epu16(a, b); } + WJR_INTRINSIC_INLINE static int8_t min(__m256i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t min(__m256i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t min(__m256i a, int32_t); + WJR_INTRINSIC_INLINE static uint8_t min(__m256i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t min(__m256i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t min(__m256i a, uint32_t); -__m128i sse::subs(__m128i a, __m128i b, int8_t) { return subs_epi8(a, b); } -__m128i sse::subs(__m128i a, __m128i b, int16_t) { return subs_epi16(a, b); } -__m128i sse::subs(__m128i a, __m128i b, uint8_t) { return subs_epu8(a, b); } -__m128i sse::subs(__m128i a, __m128i b, uint16_t) { return subs_epu16(a, b); } + WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m256i a); -__m128i sse::unpackhi_epi8(__m128i a, __m128i b) { return _mm_unpackhi_epi8(a, b); } -__m128i sse::unpackhi_epi16(__m128i a, __m128i b) { return _mm_unpackhi_epi16(a, b); } -__m128i sse::unpackhi_epi32(__m128i a, __m128i b) { return _mm_unpackhi_epi32(a, b); } -__m128i sse::unpackhi_epi64(__m128i a, __m128i b) { return _mm_unpackhi_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i mul_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i mul_epu32(__m256i a, __m256i b); -__m128i sse::unpackhi(__m128i a, __m128i b, int8_t) { return unpackhi_epi8(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, int16_t) { return unpackhi_epi16(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, int32_t) { return unpackhi_epi32(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, int64_t) { return unpackhi_epi64(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint8_t) { return unpackhi_epi8(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint16_t) { return unpackhi_epi16(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint32_t) { return unpackhi_epi32(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint64_t) { return unpackhi_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i mulhi_epi16(__m256i a, __m256i b); -__m128i sse::unpacklo_epi8(__m128i a, __m128i b) { return _mm_unpacklo_epi8(a, b); } -__m128i sse::unpacklo_epi16(__m128i a, __m128i b) { return _mm_unpacklo_epi16(a, b); } -__m128i sse::unpacklo_epi32(__m128i a, __m128i b) { return _mm_unpacklo_epi32(a, b); } -__m128i sse::unpacklo_epi64(__m128i a, __m128i b) { return _mm_unpacklo_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i mulhi_epu16(__m256i a, __m256i b); -__m128i sse::unpacklo(__m128i a, __m128i b, int8_t) { return unpacklo_epi8(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, int16_t) { return unpacklo_epi16(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, int32_t) { return unpacklo_epi32(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, int64_t) { return unpacklo_epi64(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, uint8_t) { return unpacklo_epi8(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, uint16_t) { return unpacklo_epi16(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, uint32_t) { return unpacklo_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i mullo_epi16(__m256i a, __m256i b); -__m128i sse::Xor(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + WJR_INTRINSIC_INLINE static __m256i packs_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i packs_epi32(__m256i a, __m256i b); -#endif + WJR_INTRINSIC_INLINE static __m256i packus_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE 
static __m256i packus_epi32(__m256i a, __m256i b); -#if WJR_HAS_SIMD(SSE3) + template + WJR_INTRINSIC_INLINE static __m256i shl(__m256i a); -__m128i sse::lddqu(const __m128i *ptr) { return _mm_lddqu_si128(ptr); } + template + WJR_INTRINSIC_INLINE static __m256i shr(__m256i a); -#endif + WJR_INTRINSIC_INLINE static __m256i shuffle_epi8(__m256i a, __m256i b); + template + WJR_INTRINSIC_INLINE static __m256i shuffle_epi32(__m256i a); -#if WJR_HAS_SIMD(SSSE3) + template + WJR_INTRINSIC_INLINE static __m256i shufflehi_epi16(__m256i a); -__m128i sse::abs_epi8(__m128i val) { return _mm_abs_epi8(val); } -__m128i sse::abs_epi16(__m128i val) { return _mm_abs_epi16(val); } -__m128i sse::abs_epi32(__m128i val) { return _mm_abs_epi32(val); } + template + WJR_INTRINSIC_INLINE static __m256i shufflelo_epi16(__m256i a); -__m128i sse::abs(__m128i val, int8_t) { return abs_epi8(val); } -__m128i sse::abs(__m128i val, int16_t) { return abs_epi16(val); } -__m128i sse::abs(__m128i val, int32_t) { return abs_epi32(val); } -__m128i sse::abs(__m128i val, uint8_t) { return val; } -__m128i sse::abs(__m128i val, uint16_t) { return val; } -__m128i sse::abs(__m128i val, uint32_t) { return val; } + WJR_INTRINSIC_INLINE static __m256i sll_epi16(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i sll_epi32(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i sll_epi64(__m256i a, __m128i b); -__m128i sse::shuffle_epi8(__m128i v, __m128i imm8) { return _mm_shuffle_epi8(v, imm8); } + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint64_t); -__m128i sse::sign_epi8(__m128i a, __m128i b) { return _mm_sign_epi8(a, b); } -__m128i sse::sign_epi16(__m128i a, __m128i b) { return _mm_sign_epi16(a, b); } -__m128i sse::sign_epi32(__m128i a, __m128i b) { return _mm_sign_epi32(a, b); } + template + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a); + WJR_INTRINSIC_INLINE static __m256i slli_epi16(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i slli_epi32(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i slli_epi64(__m256i a, int imm8); -__m128i sse::sign(__m128i a, __m128i b, int8_t) { return sign_epi8(a, b); } -__m128i sse::sign(__m128i a, __m128i b, int16_t) { return sign_epi16(a, b); } -__m128i sse::sign(__m128i a, __m128i b, int32_t) { return sign_epi32(a, b); } -__m128i sse::sign(__m128i a, __m128i b, uint8_t) { return sign_epi8(a, b); } -__m128i sse::sign(__m128i a, __m128i b, uint16_t) { return sign_epi16(a, b); } -__m128i sse::sign(__m128i a, __m128i b, uint32_t) { return sign_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int16_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int32_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int64_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint16_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint32_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint64_t); -#endif + WJR_INTRINSIC_INLINE static __m256i sra_epi16(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i sra_epi32(__m256i a, __m128i b); -#if WJR_HAS_SIMD(SSE4_1) + 
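// --- Editorial note (illustration; not part of the patch) ---
// The shl/shr templates above are whole-register shifts. The removed
// 128-bit versions show the recipe: for a shift below 64, shift each 64-bit
// half, then OR in the bits carried across the half boundary via a
// byte-wise register shift; for shifts of 64 or more, move the low half
// into the high half and shift by the remainder. Scalar model over two
// 64-bit halves (hi:lo), hypothetical helper, imm restricted to 1..63:
inline void shl128_model(uint64_t &hi, uint64_t &lo, unsigned imm) {
    hi = (hi << imm) | (lo >> (64 - imm)); // carry crosses the boundary
    lo = lo << imm;
}
// --- end sketch ---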
WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int32_t); -template -__m128i sse::blend_epi16(__m128i a, __m128i b) { - return _mm_blend_epi16(a, b, imm8); -} + WJR_INTRINSIC_INLINE static __m256i srai_epi16(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srai_epi32(__m256i a, int imm8); -__m128i sse::cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int16_t); + WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int32_t); -__m128i sse::cmpeq(__m128i a, __m128i b, int64_t) { return cmpeq_epi64(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, uint64_t) { return cmpeq_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i stream_load(const void *p); -__m128i sse::cmpgt_epi64(__m128i a, __m128i b) { return _mm_cmpgt_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i srl_epi16(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i srl_epi32(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i srl_epi64(__m256i a, __m128i b); -__m128i sse::cmpgt(__m128i a, __m128i b, int64_t) { return cmpgt_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint64_t); -template -__m128i sse::insert_epi8(__m128i a, int i) { - return _mm_insert_epi8(a, i, imm8); -} + template + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a); + WJR_INTRINSIC_INLINE static __m256i srli_epi8(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srli_epi16(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srli_epi32(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srli_epi64(__m256i a, int imm8); -template -__m128i sse::insert_epi32(__m128i a, int i) { - return _mm_insert_epi32(a, i, imm8); -} + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int8_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int16_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int32_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int64_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint8_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint16_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint32_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint64_t); -template -__m128i sse::insert_epi64(__m128i a, int64_t i) { - return _mm_insert_epi64(a, i, imm8); -} + WJR_INTRINSIC_INLINE static __m256i sub_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i sub_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i sub_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i sub_epi64(__m256i a, __m256i b); -template -__m128i sse::insert(__m128i a, int i, int8_t) { - return insert_epi8(a, i); -} + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i 
sub(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint64_t); -template -__m128i sse::insert(__m128i a, int i, int32_t) { - return insert_epi32(a, i); -} + WJR_INTRINSIC_INLINE static __m256i subs_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i subs_epi16(__m256i a, __m256i b); -template -__m128i sse::insert(__m128i a, int64_t i, int64_t) { - return insert_epi64(a, i); -} + WJR_INTRINSIC_INLINE static __m256i subs_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i subs_epu16(__m256i a, __m256i b); -template -__m128i sse::insert(__m128i a, int i, uint8_t) { - return insert_epi8(a, i); -} + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint16_t); -template -__m128i sse::insert(__m128i a, int i, uint32_t) { - return insert_epi32(a, i); -} + WJR_INTRINSIC_INLINE static int test_all_ones(__m256i a); -template -__m128i sse::insert(__m128i a, int64_t i, uint64_t) { - return insert_epi64(a, i); -} + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi64(__m256i a, __m256i b); -__m128i sse::minpos_epu16(__m128i a) { return _mm_minpos_epu16(a); } + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint64_t); -__m128i sse::mul_epi32(__m128i a, __m128i b) { return _mm_mul_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi64(__m256i a, __m256i b); -__m128i sse::mullo_epi32(__m128i a, __m128i b) { return _mm_mullo_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint32_t); -__m128i sse::packus_epi32(__m128i a, __m128i b) { return _mm_packus_epi32(a, 
b); } +#endif // AVX2 +}; -__m128i sse::stream_load(void *p) { - return _mm_stream_load_si128(static_cast<__m128i *>(p)); -} +namespace avx_detail { +#if WJR_HAS_SIMD(AVX2) -int sse::test_all_ones(__m128i a) { return _mm_test_all_ones(a); } +const static __m256i srli_epi8_mask[8] = { + avx::set1_epi16(0xFFFF), avx::set1_epi16(0x7F7F), avx::set1_epi16(0x3F3F), + avx::set1_epi16(0x1F1F), avx::set1_epi16(0xF0F), avx::set1_epi16(0x707), + avx::set1_epi16(0x303), avx::set1_epi16(0x101), +}; -int sse::test_all_zeros(__m128i a, __m128i b) { return _mm_test_all_zeros(a, b); } +#endif +} // namespace avx_detail -int sse::test_all_zeros(__m128i a) { return _mm_test_all_zeros(a, a); } +#if WJR_HAS_SIMD(AVX) -int sse::test_mix_ones_zeros(__m128i a, __m128i b) { - return _mm_test_mix_ones_zeros(a, b); -} +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { + return _mm256_set1_epi8(v); + } +}; -int sse::testc(__m128i a, __m128i b) { return _mm_testc_si128(a, b); } +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint16_t v) const { + return _mm256_set1_epi16(v); + } +}; -int sse::testnzc(__m128i a, __m128i b) { return _mm_testnzc_si128(a, b); } +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint32_t v) const { + return _mm256_set1_epi32(v); + } +}; -int sse::testz(__m128i a, __m128i b) { return _mm_testz_si128(a, b); } +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint64_t v) const { + return _mm256_set1_epi64x(v); + } +}; + +template <> +struct broadcast_fn<__m256i_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256i v) const { return v; } +}; +template <> +struct broadcast_fn<__m128i_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { +#if WJR_HAS_SIMD(AVX2) + return _mm256_broadcastsi128_si256(v); +#else + return _mm256_insertf128_si256(_mm256_castsi128_si256(v), v, 1); #endif + } +}; + +#endif // AVX /*------------------------avx------------------------*/ @@ -7647,8 +8346,14 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, #endif -#define WJR_REGISTER_NORMAL_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, ADVANCE, \ - INIT, RET) \ +} // namespace wjr + +#endif // WJR_X86_SIMD_AVX_HPP__ + +namespace wjr { + +#define WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, ADVANCE, \ + INIT, RET) \ if (WJR_UNLIKELY(N <= 16)) { \ if (WJR_UNLIKELY(N <= 4)) { \ UNROLL2(N - 2); \ @@ -7698,8 +8403,8 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, WJR_PP_BOOL_IF(IS_UNROLL_8, \ }, ) -#define WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, \ - ADVANCE, INIT, RET) \ +#define WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, \ + ADVANCE, INIT, RET) \ if (WJR_UNLIKELY(N <= 16)) { \ if (WJR_UNLIKELY(N <= 4)) { \ UNROLL2(0); \ @@ -7749,6 +8454,151 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, WJR_PP_BOOL_IF(IS_UNROLL_8, \ }, ) +template +class __x86_simd_base { + static constexpr size_t BitWidth = Simd::width(); + using int_type = typename Simd::int_type; + using Mybase = fixed_size_simd; + +public: + using mask_type = simd_detail::basic_simd_mask; + + WJR_ENABLE_DEFAULT_SPECIAL_MEMBERS(__x86_simd_base); + + template )> + __x86_simd_base(U value) noexcept : m_data(Simd::set1(value, U())) {} + + template + 
__x86_simd_base(const T *mem, Flags flags = {}) noexcept { + copy_from(mem, flags); + } + + void copy_from(const T *mem, element_aligned_t = {}) noexcept { + m_data = Simd::loadu(mem); + } + + void copy_from(const T *mem, vector_aligned_t) noexcept { m_data = Simd::load(mem); } + + void copy_to(T *mem, element_aligned_t = {}) noexcept { Simd::storeu(mem, m_data); } + + void copy_to(T *mem, vector_aligned_t) noexcept { Simd::store(mem, m_data); } + + Mybase &operator&=(const Mybase &other) noexcept { + m_data = Simd::And(m_data, other.m_data); + return static_cast(*this); + } + + friend Mybase operator&(const Mybase &lhs, const Mybase &rhs) noexcept { + Mybase ret(lhs); + ret &= rhs; + return ret; + } + + Mybase &operator|=(const Mybase &other) noexcept { + m_data = Simd::Or(m_data, other.m_data); + return static_cast(*this); + } + + friend Mybase operator|(const Mybase &lhs, const Mybase &rhs) noexcept { + Mybase ret(lhs); + ret |= rhs; + return ret; + } + + Mybase &operator^=(const Mybase &other) noexcept { + m_data = Simd::Xor(m_data, other.m_data); + return static_cast(*this); + } + + friend Mybase operator^(const Mybase &lhs, const Mybase &rhs) noexcept { + Mybase ret(lhs); + ret ^= rhs; + return ret; + } + + friend constexpr mask_type operator==(const Mybase &lhs, const Mybase &rhs) noexcept { + return Simd::movemask_epi8(Simd::cmpeq(lhs.m_data, rhs.m_data, T())); + } + +private: + int_type m_data; +}; + +#if WJR_HAS_SIMD(SSE2) +#define WJR_HAS_SIMD_NATIVE_128BIT WJR_HAS_DEF + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +#endif + +#if WJR_HAS_SIMD(AVX2) +#define WJR_HAS_SIMD_NATIVE_256BIT WJR_HAS_DEF + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> + : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +#endif + } // namespace wjr #endif // WJR_X86_SIMD_SIMD_HPP__ @@ -7921,121 +8771,7 @@ std::basic_ostream &__ostream_insert(std::basic_ostream -#ifndef WJR_ASSERT_HPP__ -#define WJR_ASSERT_HPP__ - -/** - * @file assert.hpp - * @author wjr - * @brief Assertion utilities - * - * @details WJR_DEBUG_LEVEL : 0 ~ 3 \n - * 0 : Release \n - * 1 : Beta \n - * 2 : Runtime detect \n - * 3 : Maximize runtime detect, for debug \n - * If WJR_DEBUG_LEVEL is not defined, \n - * If NDEBUG is defined, WJR_DEBUG_LEVEL is set to 0 by default. \n - * Otherwise, WJR_DEBUG_LEVEL is set to 1 by default. \n - * WJR_ASSERT_L(level, expr) : Specify the level of assertion, \n - * if the WJR_DEBUG_LEVEL is greater than or equal to the level, \n - * the assertion is executed. 
\n - * WJR_ASSERT(expr) : Equivalent to WJR_ASSERT_L(1, expr) \n - * WJR_ASSERT_0(expr) : Always execute the assertion \n - * - * @version 0.1 - * @date 2024-06-01 - * - * @copyright Copyright (c) 2024 - * - */ - -#include - // Already included - -#ifndef WJR_DEBUG_LEVEL -#if defined(NDEBUG) -#define WJR_DEBUG_LEVEL 0 -#else -#define WJR_DEBUG_LEVEL 1 -#endif -#endif - -#if WJR_DEBUG_LEVEL < 0 || WJR_DEBUG_LEVEL > 3 -#error "WJR_DEBUG_LEVEL must be 0 ~ 3" -#endif - -namespace wjr { - -#define WJR_DEBUG_IF(level, expr0, expr1) \ - WJR_PP_BOOL_IF(WJR_PP_GT(WJR_DEBUG_LEVEL, level), expr0, expr1) - -WJR_NORETURN extern void __assert_failed(const char *expr, const char *file, - const char *func, int line) noexcept; - -// LCOV_EXCL_START - -/// @private -template -WJR_NOINLINE void __assert_handler(const char *expr, const char *file, const char *func, - int line, Args &&...args) noexcept { - std::cerr << "Additional information: "; - (void)(std::cerr << ... << std::forward(args)); - std::cerr << '\n'; - __assert_failed(expr, file, func, line); -} - -/// @private -inline void __assert_handler(const char *expr, const char *file, const char *func, - int line) noexcept { - __assert_failed(expr, file, func, line); -} - -// LCOV_EXCL_STOP - -#define WJR_ASSERT_CHECK_I(expr, ...) \ - do { \ - if (WJR_UNLIKELY(!(expr))) { \ - ::wjr::__assert_handler(#expr, WJR_FILE, WJR_CURRENT_FUNCTION, WJR_LINE, \ - ##__VA_ARGS__); \ - } \ - } while (0) - -// do nothing -#define WJR_ASSERT_UNCHECK_I(expr, ...) \ - do { \ - } while (0) - -// level = [0, 2] -// The higher the level, the less likely it is to be detected -// Runtime detect : 1 -// Maximize detect : 2 -#define WJR_ASSERT_L(level, ...) \ - WJR_DEBUG_IF(level, WJR_ASSERT_CHECK_I, WJR_ASSERT_UNCHECK_I) \ - (__VA_ARGS__) - -// level of assert is zero at default. -#define WJR_ASSERT_L0(...) WJR_ASSERT_CHECK_I(__VA_ARGS__) -#define WJR_ASSERT_L1(...) WJR_ASSERT_L(1, __VA_ARGS__) -#define WJR_ASSERT_L2(...) WJR_ASSERT_L(2, __VA_ARGS__) -#define WJR_ASSERT_L3(...) WJR_ASSERT_L(3, __VA_ARGS__) -#define WJR_ASSERT(...) WJR_ASSERT_L1(__VA_ARGS__) - -#define WJR_ASSERT_ASSUME_L(level, ...) \ - WJR_ASSERT_L(level, __VA_ARGS__); \ - __WJR_ASSERT_ASSUME_L_ASSUME(__VA_ARGS__) -#define __WJR_ASSERT_ASSUME_L_ASSUME(expr, ...) WJR_ASSUME(expr) - -#define WJR_ASSERT_ASSUME_L0(...) WJR_ASSERT_ASSUME_L(0, __VA_ARGS__) -#define WJR_ASSERT_ASSUME_L1(...) WJR_ASSERT_ASSUME_L(1, __VA_ARGS__) -#define WJR_ASSERT_ASSUME_L2(...) WJR_ASSERT_ASSUME_L(2, __VA_ARGS__) -#define WJR_ASSERT_ASSUME_L3(...) WJR_ASSERT_ASSUME_L(3, __VA_ARGS__) -#define WJR_ASSERT_ASSUME(...) WJR_ASSERT_ASSUME_L1(__VA_ARGS__) - -} // namespace wjr - -#endif // WJR_ASSERT_HPP__ #ifndef WJR_CONTAINER_GENERIC_TYPE_TRAITS_HPP__ #define WJR_CONTAINER_GENERIC_TYPE_TRAITS_HPP__ @@ -11344,162 +12080,7 @@ struct pointer_traits> { } // namespace std #endif // WJR_ITERATOR_CONTIGUOUS_ITERATOR_ADAPTER_HPP__ -#ifndef WJR_MATH_DETAIL_HPP__ -#define WJR_MATH_DETAIL_HPP__ - // Already included - -namespace wjr { - -#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) - -namespace math_detail { - -template -class de_bruijn { -public: - constexpr static uint8_t digits = std::numeric_limits::digits; - constexpr static uint8_t mv = digits == 32 ? 
27 : 58; - constexpr de_bruijn() noexcept : lookup(), lookupr() { initialize(); } - - constexpr int get(T idx) const noexcept { return lookup[(idx * seed) >> mv]; } - constexpr int getr(T idx) const noexcept { return lookupr[(idx * seed) >> mv]; } - -private: - constexpr void initialize() noexcept { - for (uint8_t i = 0; i < digits; ++i) { - const auto idx = (seed << i) >> mv; - lookup[idx] = i; - lookupr[idx] = i == 0 ? 0 : digits - i; - } - } - - uint8_t lookup[digits]; - uint8_t lookupr[digits]; -}; - -inline constexpr de_bruijn de_bruijn32 = {}; -inline constexpr de_bruijn de_bruijn64 = {}; - -} // namespace math_detail - -#endif - -/** - * @brief - * - * @note `n & -n` is the lowest bit of n. - */ -template )> -WJR_CONST constexpr T lowbit(T n) noexcept { - return n & -n; -} - -template )> -WJR_CONST constexpr T clear_lowbit(T n) noexcept { - return n & (n - 1); -} - -// preview : - -template )> -WJR_CONST constexpr bool is_zero_or_single_bit(T n) noexcept { - return (n & (n - 1)) == 0; -} - -template )> -WJR_CONST constexpr bool __has_high_bit(T n) noexcept { - return n >> (std::numeric_limits::digits - 1); -} - -template )> -WJR_CONST constexpr T __ceil_div(T n, type_identity_t div) noexcept { - return (n + div - 1) / div; -} - -template )> -WJR_CONST constexpr T __align_down(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return n & (-alignment); -} - -template )> -WJR_CONST constexpr T __align_down_offset(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return n & (alignment - 1); -} - -template )> -WJR_CONST constexpr T __align_up(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return (n + alignment - 1) & (-alignment); -} - -template )> -WJR_CONST constexpr T __align_up_offset(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return (-n) & (alignment - 1); -} - -template )> -WJR_CONST constexpr std::make_signed_t __fasts_from_unsigned(T x) noexcept { - const std::make_signed_t ret = x; - WJR_ASSERT_ASSUME_L2(ret >= 0, "overflow"); - return ret; -} - -template , - WJR_REQUIRES(is_nonbool_signed_integral_v)> -WJR_CONST constexpr U __fasts_abs(T x) noexcept { - return static_cast(x < 0 ? -x : x); -} - -template )> -WJR_CONST constexpr T __fasts_negate(T x) noexcept { - return -x; -} - -template , - WJR_REQUIRES(is_nonbool_signed_integral_v)> -WJR_CONST constexpr T __fasts_conditional_negate(bool condition, T x) noexcept { - return condition ? -x : x; -} - -template , - WJR_REQUIRES(is_nonbool_signed_integral_v)> -WJR_CONST constexpr T __fasts_negate_with(T condition, T x) noexcept { - return __fasts_conditional_negate(condition < 0, x); -} - -template )> -WJR_CONST constexpr T __fasts_increment(T x) noexcept { - WJR_ASSERT_L2(x != std::numeric_limits::min() && - x != std::numeric_limits::max(), - "overflow"); - - return x < 0 ? x - 1 : x + 1; -} - -template )> -WJR_CONST constexpr T __fasts_decrement(T x) noexcept { - WJR_ASSERT_L2(x != 0 && x + 1 != T(0), "overflow"); - - return x < 0 ? x + 1 : x - 1; -} - -template )> -WJR_CONST constexpr T __fasts_add(T x, std::make_unsigned_t y) noexcept { - return x < 0 ? x - y : x + y; -} - -template )> -WJR_CONST constexpr T __fasts_sub(T x, std::make_unsigned_t y) noexcept { - return x < 0 ? 
x + y : x - y; -} - -} // namespace wjr - -#endif // WJR_MATH_DETAIL_HPP__ #ifndef WJR_MEMORY_COPY_HPP__ #define WJR_MEMORY_COPY_HPP__ @@ -12375,334 +12956,8 @@ constexpr void replace_uninit(list_node *from, list_node *to) noexcept { #ifndef WJR_MATH_BIT_HPP__ #define WJR_MATH_BIT_HPP__ -#ifndef WJR_MATH_CLZ_HPP__ -#define WJR_MATH_CLZ_HPP__ - // Already included -#ifndef WJR_MATH_POPCOUNT_HPP__ -#define WJR_MATH_POPCOUNT_HPP__ - // Already included - -namespace wjr { - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR int fallback_popcount(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - if constexpr (nd < 32) { - return fallback_popcount(static_cast(x)); - } else { - if constexpr (nd == 32) { - x -= (x >> 1) & 0x5555'5555; - x = (x & 0x3333'3333) + ((x >> 2) & 0x3333'3333); - x = (x + (x >> 4)) & 0x0f0f'0f0f; - return (x * 0x0101'0101) >> 24; - } else { - x -= (x >> 1) & 0x5555'5555'5555'5555; - x = (x & 0x3333'3333'3333'3333) + ((x >> 2) & 0x3333'3333'3333'3333); - x = (x + (x >> 4)) & 0x0f0f'0f0f'0f0f'0f0f; - return (x * 0x0101'0101'0101'0101) >> 56; - } - } -} - -#if WJR_HAS_BUILTIN(POPCOUNT) - -template -WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - if constexpr (nd < 32) { - return builtin_popcount(static_cast(x)); - } else { - if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_popcount(x); - } else if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_popcountl(x); - } - if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_popcountll(x); - } else { - static_assert(nd <= 64, "not support yet"); - } - } -} - -#endif // WJR_HAS_BUILTIN(POPCOUNT) - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount_impl(T x) noexcept { - if (WJR_BUILTIN_CONSTANT_P_TRUE(is_zero_or_single_bit(x))) { - return x != 0; - } - -#if WJR_HAS_BUILTIN(POPCOUNT) - if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) { - return fallback_popcount(x); - } - - return builtin_popcount(x); -#else - return fallback_popcount(x); -#endif -} - -template )> -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount(T x) noexcept { - const int ret = popcount_impl(x); - WJR_ASSUME(0 <= ret && ret <= std::numeric_limits::digits); - return ret; -} - -} // namespace wjr - -#endif // WJR_MATH_POPCOUNT_HPP__ - -#if WJR_HAS_BUILTIN(__builtin_clz) -#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF -#elif defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF_VAR(2) -#endif - -#if WJR_HAS_BUILTIN(CLZ) == 2 -// Already included -#endif - -namespace wjr { - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_clz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - - if constexpr (nd >= 16) { - x |= (x >> 8); - } - - if constexpr (nd >= 32) { - x |= (x >> 16); - } - - if constexpr (nd >= 64) { - x |= (x >> 32); - } - - return fallback_popcount(~x); -} - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_clz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - -#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) - if constexpr (nd >= 32) { -#endif - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - - if constexpr (nd >= 16) { - x |= (x >> 8); - } - - if constexpr (nd >= 32) { - x |= (x >> 16); - } - - if constexpr (nd >= 64) { - x |= (x >> 32); - } -#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) - } -#endif - -#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT) - 
return popcount(~x); -#else - if constexpr (nd < 32) { - return fallback_clz(static_cast(x)) - (32 - nd); - } else { - ++x; - - if constexpr (nd <= 32) { - return math_detail::de_bruijn32.getr(x); - } else if constexpr (nd <= 64) { - return math_detail::de_bruijn64.getr(x); - } else { - static_assert(nd <= 64, "not support yet"); - } - } -#endif -} - -#if WJR_HAS_BUILTIN(CLZ) - -template -WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - if constexpr (nd < 32) { - return builtin_clz(static_cast(x)) - (32 - nd); - } else { -#if WJR_HAS_BUILTIN(CLZ) == 1 - if constexpr (nd <= std::numeric_limits::digits) { - constexpr auto delta = std::numeric_limits::digits - nd; - return __builtin_clz(static_cast(x)) - delta; - } else if constexpr (nd <= std::numeric_limits::digits) { - constexpr auto delta = std::numeric_limits::digits - nd; - return __builtin_clzl(static_cast(x)) - delta; - } else if constexpr (nd <= std::numeric_limits::digits) { - constexpr auto delta = std::numeric_limits::digits - nd; - return __builtin_clzll(static_cast(x)) - delta; - } else { - static_assert(nd <= 64, "not supported yet"); - } -#else - if constexpr (nd == 32) { - unsigned long result; - (void)_BitScanReverse(&result, x); - return 31 - result; - } else { - unsigned long result; - (void)_BitScanReverse64(&result, x); - return 63 - result; - } -#endif - } -} - -#endif - -/** - * @brief Fast count leading zeros - * - * @tparam T Must be an unsigned integral type - */ -template )> -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int clz(T x) noexcept { -#if WJR_HAS_BUILTIN(CLZ) - if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) { - return fallback_clz(x); - } - - return builtin_clz(x); -#else - return fallback_clz(x); -#endif -} - -} // namespace wjr - -#endif // WJR_MATH_CLZ_HPP__ -#ifndef WJR_MATH_CTZ_HPP__ -#define WJR_MATH_CTZ_HPP__ - -// Already included -// Already included - -#if WJR_HAS_BUILTIN(__builtin_ctz) -#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF -#elif defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF_VAR(2) -#endif - -#if WJR_HAS_BUILTIN(CTZ) == 2 -// Already included -#endif - -namespace wjr { - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_ctz(T x) noexcept { - return fallback_popcount(lowbit(x) - 1); -} - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_ctz(T x) noexcept { -#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT) - return popcount(lowbit(x) - 1); -#else - constexpr auto nd = std::numeric_limits::digits; - - if constexpr (nd < 32) { - return fallback_ctz(static_cast(x)); - } else { - x = lowbit(x); - - if constexpr (nd <= 32) { - return math_detail::de_bruijn32.get(x); - } else if constexpr (nd <= 64) { - return math_detail::de_bruijn64.get(x); - } else { - static_assert(nd <= 64, "not support yet"); - } - } -#endif // -} - -#if WJR_HAS_BUILTIN(CTZ) - -template -WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - - if constexpr (nd < 32) { - return builtin_ctz(static_cast(x)); - } else { -#if WJR_HAS_BUILTIN(CTZ) == 1 - if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_ctz(static_cast(x)); - } else if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_ctzl(static_cast(x)); - } else if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_ctzll(static_cast(x)); - } else { - static_assert(nd <= 64, "not supported yet"); - } -#else - if constexpr (nd == 32) { - unsigned 
long result;
- (void)_BitScanForward(&result, x);
- return result;
- } else {
- unsigned long result;
- (void)_BitScanForward64(&result, x);
- return result;
- }
-#endif
- }
-}
-
-#endif
-
-/**
- * @brief Fast count trailing zeros
- *
- * @details Very fast even on non-optimized platforms by using a De Bruijn sequence. \n
- * Try __builtin_clz if available, otherwise fallback to a portable implementation. \n
- * In fallback_clz, use popcount and lowbit if POPCOUNT and POPCNT are available, make
- * sure popcount is fast. \n
- * Then use De Bruijn sequence, just a bit slower than popcount + lowbit.
- *
- * @tparam T Must be an unsigned integral type
- */
-template )>
-WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int ctz(T x) noexcept {
-#if WJR_HAS_BUILTIN(CTZ)
- if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
- return fallback_ctz(x);
- }
-
- return builtin_ctz(x);
-#else
- return fallback_ctz(x);
-#endif
-}
-
-} // namespace wjr
-
-#endif // WJR_MATH_CTZ_HPP__
// Already included

namespace wjr {
@@ -17895,7 +18150,10 @@ enum class chars_format : uint8_t {
 scientific = 0x01,
 fixed = 0x02,
 hex = 0x04,
- general = fixed | scientific
+ general = fixed | scientific,
+ // only used in integral_constant
+ __json_format = 0x08,
+ json = general | __json_format,
};

template
@@ -18080,7 +18338,7 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src0, const T *src1,

#define WJR_REGISTER_FIND_NOT_N_RET(index) index

- WJR_REGISTER_NORMAL_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION(
 n, WJR_REGISTER_FIND_NOT_N_2, WJR_REGISTER_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2),
 WJR_REGISTER_FIND_NOT_N_ADVNCE, const auto __src0 = src0,
 WJR_REGISTER_FIND_NOT_N_RET);
@@ -18202,7 +18460,7 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src, T val, size_t n) noexcept
 const auto y4 = broadcast<__m128i_t, __m256i_t>(y2);
#endif

- WJR_REGISTER_NORMAL_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION(
 n, WJR_REGISTER_FIND_NOT_N_2, WJR_REGISTER_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2),
 WJR_REGISTER_FIND_NOT_N_ADVANCE, const auto __src = src,
 WJR_REGISTER_FIND_NOT_N_RET);
@@ -18328,7 +18586,7 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src0, const T *src1,

#define WJR_REGISTER_REVERSE_FIND_NOT_N_RET(index) 0

- WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION(
 n, WJR_REGISTER_REVERSE_FIND_NOT_N_2, WJR_REGISTER_REVERSE_FIND_NOT_N_4,
 WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE, ,
 WJR_REGISTER_REVERSE_FIND_NOT_N_RET);
@@ -18451,7 +18709,7 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src, T val, size_t n)
 const auto y4 = broadcast<__m128i_t, __m256i_t>(y2);
#endif

- WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION(
 n, WJR_REGISTER_REVERSE_FIND_NOT_N_2, WJR_REGISTER_REVERSE_FIND_NOT_N_4,
 WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE, ,
 WJR_REGISTER_REVERSE_FIND_NOT_N_RET);
@@ -25188,45 +25446,55 @@ constexpr int fallback_count_digits10(UnsignedValue n) noexcept {
 return count + 3;
}

-inline int builtin_count_digits10_u32(uint32_t n) noexcept {
+namespace charconv_detail {
+
#define WJR_INC(T) (((sizeof(#T) - 1ull) << 32) - T)
- static constexpr uint64_t table[] = {
- WJR_INC(0), WJR_INC(0), WJR_INC(0), // 8
- WJR_INC(10), WJR_INC(10), WJR_INC(10), // 64
- WJR_INC(100), WJR_INC(100), WJR_INC(100), // 512
- WJR_INC(1000), WJR_INC(1000), WJR_INC(1000), // 4096
- WJR_INC(10000), WJR_INC(10000), WJR_INC(10000), // 32k
- WJR_INC(100000), WJR_INC(100000), WJR_INC(100000), // 256k
- 
WJR_INC(1000000), WJR_INC(1000000), WJR_INC(1000000), // 2048k - WJR_INC(10000000), WJR_INC(10000000), WJR_INC(10000000), // 16M - WJR_INC(100000000), WJR_INC(100000000), WJR_INC(100000000), // 128M - WJR_INC(1000000000), WJR_INC(1000000000), WJR_INC(1000000000), // 1024M - WJR_INC(1000000000), WJR_INC(1000000000) // 4B - }; - const auto inc = table[clz(n | 1) ^ 31]; - return static_cast((n + inc) >> 32); + +static constexpr uint64_t __count_digits10_u32_table[] = { + WJR_INC(0), WJR_INC(0), WJR_INC(0), // 8 + WJR_INC(10), WJR_INC(10), WJR_INC(10), // 64 + WJR_INC(100), WJR_INC(100), WJR_INC(100), // 512 + WJR_INC(1000), WJR_INC(1000), WJR_INC(1000), // 4096 + WJR_INC(10000), WJR_INC(10000), WJR_INC(10000), // 32k + WJR_INC(100000), WJR_INC(100000), WJR_INC(100000), // 256k + WJR_INC(1000000), WJR_INC(1000000), WJR_INC(1000000), // 2048k + WJR_INC(10000000), WJR_INC(10000000), WJR_INC(10000000), // 16M + WJR_INC(100000000), WJR_INC(100000000), WJR_INC(100000000), // 128M + WJR_INC(1000000000), WJR_INC(1000000000), WJR_INC(1000000000), // 1024M + WJR_INC(1000000000), WJR_INC(1000000000) // 4B +}; + #undef WJR_INC -} -inline int builtin_count_digits10_u64(uint64_t n) noexcept { #define WJR_POWERS_OF_10(factor) \ factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \ (factor)*1000000, (factor)*10000000, (factor)*100000000, (factor)*1000000000 - static constexpr uint8_t bsr2log10[] = { - 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, - 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, - 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, - 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; - const auto t = bsr2log10[clz(n | 1) ^ 63]; - static constexpr const uint64_t zero_or_powers_of_10[] = { - 0, 0, WJR_POWERS_OF_10(1U), WJR_POWERS_OF_10(1000000000ull), - 10000000000000000000ull}; - return t - (n < zero_or_powers_of_10[t]); + +static constexpr uint8_t __count_digits10_u64_bsr2log10[] = { + 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, + 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, + 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, + 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; + +static constexpr const uint64_t __count_digits10_u64_zero_or_powers_of_10[] = { + 0, 0, WJR_POWERS_OF_10(1U), WJR_POWERS_OF_10(1000000000ull), 10000000000000000000ull}; + #undef WJR_POWERS_OF_10 + +} // namespace charconv_detail + +WJR_INTRINSIC_CONSTEXPR20 int builtin_count_digits10_u32(uint32_t n) noexcept { + const auto inc = charconv_detail::__count_digits10_u32_table[clz(n | 1) ^ 31]; + return static_cast((n + inc) >> 32); +} + +WJR_INTRINSIC_CONSTEXPR20 int builtin_count_digits10_u64(uint64_t n) noexcept { + const auto t = charconv_detail::__count_digits10_u64_bsr2log10[clz(n | 1) ^ 63]; + return t - (n < charconv_detail::__count_digits10_u64_zero_or_powers_of_10[t]); } template -WJR_CONSTEXPR20 int count_digits10_impl(T n) noexcept { +WJR_INTRINSIC_CONSTEXPR20 int count_digits10_impl(T n) noexcept { if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(n)) { return fallback_count_digits10(n); } @@ -27163,7 +27431,7 @@ WJR_PURE int large_builtin_compare_n(const T *src0, const T *src1, size_t n) noe WJR_ASSUME(n > 2); - WJR_REGISTER_NORMAL_SIMD_FUNCTION( + WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION( n, WJR_REGISTER_COMPARE_NOT_N_2, WJR_REGISTER_COMPARE_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_COMPARE_NOT_N_ADVANCE, , WJR_REGISTER_COMPARE_NOT_N_RET); @@ -27318,7 +27586,7 @@ WJR_PURE int large_builtin_reverse_compare_n(const 
T *src0, const T *src1, WJR_ASSUME(n > 2); - WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION( + WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION( n, WJR_REGISTER_REVERSE_COMPARE_NOT_N_2, WJR_REGISTER_REVERSE_COMPARE_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE, , WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET); @@ -36599,6 +36867,44 @@ class reader { namespace wjr::fastfloat { +template +struct default_writer { + using float_type = T; + using support_integral = std::false_type; + + WJR_INTRINSIC_CONSTEXPR T &get_float() noexcept { return value; } + + T &value; +}; + +template +WJR_NOINLINE from_chars_result<> __from_chars_impl(const char *first, const char *last, + Writer wr, Op options) noexcept; + +extern template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +extern template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +extern template from_chars_result<> +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; + +extern template from_chars_result<> +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; + /** * This function parses the character sequence [first,last) for a number. It parses * floating-point numbers expecting a locale-indepent format equivalent to what is used by @@ -36622,16 +36928,30 @@ namespace wjr::fastfloat { * point and scientific notation respectively. The default is * `fast_float::chars_format::general` which allows both `fixed` and `scientific`. */ -template +template +from_chars_result<> from_chars(const char *first, const char *last, float &value, + integral_constant fmt = {}) noexcept { + return __from_chars_impl(first, last, default_writer{value}, fmt); +} + +template +from_chars_result<> from_chars(const char *first, const char *last, double &value, + integral_constant fmt = {}) noexcept { + return __from_chars_impl(first, last, default_writer{value}, fmt); +} + +template )> from_chars_result<> from_chars(const char *first, const char *last, T &value, - chars_format fmt = chars_format::general) noexcept; + chars_format fmt) noexcept { + if (WJR_BUILTIN_CONSTANT_P(fmt)) { + if (fmt == chars_format::general) { + return from_chars(first, last, value); + } + } -/** - * Like from_chars, but accepts an `options` argument to govern number parsing. - */ -template -from_chars_result<> from_chars_advanced(const char *first, const char *last, T &value, - chars_format options) noexcept; + WJR_ASSERT(!(to_underlying(fmt) & to_underlying(chars_format::__json_format))); + return __from_chars_impl(first, last, default_writer{value}, fmt); +} // Compares two ASCII strings in a case insensitive manner. WJR_PURE WJR_INTRINSIC_CONSTEXPR bool @@ -37813,6 +38133,89 @@ WJR_CONST WJR_INTRINSIC_INLINE adjusted_mantissa compute_float(int64_t q, return answer; } +/// @brief special case of compute_float when q = 0. +template +WJR_CONST WJR_INTRINSIC_INLINE adjusted_mantissa compute_integer(uint64_t w) noexcept { + adjusted_mantissa answer; + // We want the most significant bit of i to be 1. Shift if needed. + const int lz = clz(w); + w <<= lz; + + // The required precision is binary::mantissa_explicit_bits() + 3 because + // 1. We need the implicit bit + // 2. We need an extra bit for rounding purposes + // 3. 
We might lose a bit due to the "upperbit" routine (result too small, requiring a
+ // shift)
+
+ const uint128_t product =
+ compute_product_approximation(0, w);
+ // The "compute_product_approximation" function can be slightly slower than a
+ // branchless approach: uint128_t product = compute_product(q, w); but in practice, we
+ // can win big with the compute_product_approximation if its additional branch is
+ // easily predicted. Which is best is data specific.
+ const int upperbit = int(product.high >> 63);
+
+ answer.mantissa =
+ product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3);
+
+ answer.power2 = int32_t(63 + upperbit - lz - binary::minimum_exponent());
+ if (answer.power2 <= 0) { // we have a subnormal?
+ // Here we have that answer.power2 <= 0 so -answer.power2 >= 0
+ if (-answer.power2 + 1 >= 64) { // if we have more than 64 bits below the minimum
+ // exponent, you have a zero for sure.
+ answer.power2 = 0;
+ answer.mantissa = 0;
+ // result should be zero
+ return answer;
+ }
+ // next line is safe because -answer.power2 + 1 < 64
+ answer.mantissa >>= -answer.power2 + 1;
+ // Thankfully, we can't have both "round-to-even" and subnormals because
+ // "round-to-even" only occurs for powers close to 0.
+ answer.mantissa += (answer.mantissa & 1); // round up
+ answer.mantissa >>= 1;
+ // There is a weird scenario where we don't have a subnormal but just barely.
+ // Suppose we start with 2.2250738585072013e-308, we end up
+ // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
+ // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
+ // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
+ // subnormal, but we can only know this after rounding.
+ // So we only declare a subnormal if we are smaller than the threshold.
+ answer.power2 =
+ (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) ? 0 : 1;
+ return answer;
+ }
+
+ // usually, we round *up*, but if we fall right in between and we have an
+ // even basis, we need to round down
+ // We are only concerned with the cases where 5**q fits in a single 64-bit word.
+ if (product.low <= 1 &&
+ (answer.mantissa & 3) == 1) { // we may fall between two floats!
+ // To be in-between two floats we need that in doing
+ // answer.mantissa = product.high >> (upperbit + 64 -
+ // binary::mantissa_explicit_bits() - 3);
+ // ... we dropped out only zeroes. But if this happened, then we can go back!!!
+ if ((answer.mantissa << (upperbit + 64 - binary::mantissa_explicit_bits() - 3)) ==
+ product.high) {
+ answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up
+ }
+ }
+
+ answer.mantissa += (answer.mantissa & 1); // round up
+ answer.mantissa >>= 1;
+ if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) {
+ answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits());
+ answer.power2++; // undo previous addition
+ }
+
+ answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits());
+ if (answer.power2 >= binary::infinite_power()) { // infinity
+ answer.power2 = binary::infinite_power();
+ answer.mantissa = 0;
+ }
+ return answer;
+}
+
// 1e0 to 1e19
constexpr static uint64_t powers_of_ten_uint64[] = {1UL,
 10UL,
@@ -38251,8 +38654,8 @@ inline adjusted_mantissa negative_digit_comp(biginteger &bigmant, adjusted_manti

 // get the value of `b`, rounded down, and get a biginteger representation of b+h
 adjusted_mantissa am_b = am;
- // gcc7 buf: use a lambda to remove the noexcept qualifier bug with -Wnoexcept-type. 
+ // gcc7 bug: use a lambda to remove the noexcept qualifier bug with
+ // -Wnoexcept-type.
 round(am_b, [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); });
 T b;
 to_float(false, am_b, b);
@@ -38327,8 +38731,8 @@ from_chars_result<> parse_infnan(const char *first, const char *last, T &value)
 answer.ptr = (first += 3);
 value = minusSign ? -std::numeric_limits::quiet_NaN()
 : std::numeric_limits::quiet_NaN();
- // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, C11 7.20.1.3.3. At
- // least MSVC produces nan(ind) and nan(snan).
+ // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7,
+ // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan).
 if (first != last && *first == '(') {
 for (const char *ptr = first + 1; ptr != last; ++ptr) {
 if (*ptr == ')') {
@@ -38426,21 +38830,23 @@ WJR_INTRINSIC_INLINE bool rounds_to_nearest() noexcept {

struct parsed_number_string {
 int64_t exponent{0};
- uint64_t mantissa{0};
 bool negative{false};
 // contains the range of the significant digits
 span integer{}; // non-nullable
 span fraction{}; // nullable
};

-template
-WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const char *last,
- T &value,
- chars_format options) noexcept {
- static_assert(std::is_same::value || std::is_same::value,
- "only float and double are supported");
+template
+from_chars_result<> __from_chars_impl(const char *first, const char *last, Writer wr,
+ Op options) noexcept {
+ static_assert(!std::is_reference_v, "");
+
+ using T = typename Writer::float_type;
+ constexpr bool is_support_integral = Writer::support_integral::value;
+ constexpr bool is_constant_options = !std::is_same_v;

 from_chars_result<> answer;
+
 if (WJR_UNLIKELY(first == last)) {
 answer.ec = std::errc::invalid_argument;
 answer.ptr = first;
@@ -38448,7 +38854,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 }

 const char *p = first;
- const auto fmt = to_underlying(options);
+ const auto fmt = to_underlying(static_cast(options));

 parsed_number_string pns;
 pns.negative = (*p == '-');
@@ -38462,7 +38868,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 }

 const char *const start_digits = p;
- uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
+ uint64_t uval = 0; // an unsigned int avoids signed overflows (which are bad)

 const char *end_of_integer_part;
 int64_t digit_count;
@@ -38477,13 +38883,21 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 do {
 uint8_t ch = *p;
 if (!__try_match(ch)) { // This situation rarely occurs
+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ answer.ec = std::errc{};
+ answer.ptr = first;
+ return answer;
+ }
+ }
+
 break;
 }

 do {
 // a multiplication by 10 is cheaper than an arbitrary integer
 // multiplication
- i = 10 * i + ch; // might overflow, we will handle the overflow later
+ uval = 10 * uval + ch; // might overflow, we will handle the overflow later

 if (++p == last) {
 goto INTEGER_AT_END;
@@ -38498,6 +38912,15 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 digit_count = static_cast(p - start_digits);
 pns.integer = span(start_digits, static_cast(digit_count));

+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ // at least 1 digit in integer part, without leading zeros
+ if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) {
+ return answer;
+ }
+ }
+ }
+ 
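+ // In json mode the check above enforces the RFC 8259 integer grammar:
+ // "0", "7", and "123" are accepted, while an empty integer part (".5")
+ // and a redundant leading zero ("0123") are rejected.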
if (*p != '.') {
 exponent = 0;
 if (*p == 'e' || *p == 'E') {
@@ -38512,24 +38935,37 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 // can occur at most twice without overflowing, but let it occur more, since
 // for integers with many digits, digit parsing is the primary bottleneck.
 while ((std::distance(p, last) >= 8) && is_made_of_eight_digits_fast(p)) {
- i = i * 100000000 +
- parse_eight_digits_unrolled(
- p); // in rare cases, this will overflow, but that's ok
+ uval = uval * 100000000 +
+ parse_eight_digits_unrolled(
+ p); // in rare cases, this will overflow, but that's ok
 p += 8;
 }
 while ((p != last) && is_integer(*p)) {
 const auto digit = uint32_t(*p - '0');
 ++p;
- i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+ uval = uval * 10 + digit; // in rare cases, this will overflow, but that's ok
 }
 exponent = before - p;
 pns.fraction = span(before, size_t(p - before));
 digit_count -= exponent;
- if (WJR_UNLIKELY(digit_count == 0)) {
- return detail::parse_infnan(first, last, value);
+ auto &float_v = wr.get_float();
+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ if (WJR_UNLIKELY(exponent == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
 }
 } while (0);
@@ -38550,7 +38986,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 if ((p == last) || !is_integer(*p)) {
 if (!bool(fmt & to_underlying(chars_format::fixed))) {
 // We are in error.
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 // Otherwise, we will be ignoring the 'e'.
 p = location_of_e;
@@ -38571,7 +39007,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 // If it scientific and not fixed, we have to bail out.
 if (bool(fmt & to_underlying(chars_format::scientific)) &&
 !bool(fmt & to_underlying(chars_format::fixed))) {
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 }
@@ -38603,38 +39039,39 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 // Let us start again, this time, avoiding overflows.
 // We don't need to check if is_integer, since we use the
 // pre-tokenized spans from above.
- i = 0;
+ uval = 0;
 p = pns.integer.data();
 const char *int_end = p + pns.integer.size();
 constexpr uint64_t minimal_nineteen_digit_integer = 1000000000000000000ull;
- while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
- i = i * 10 + uint64_t(*p - '0');
+ while ((uval < minimal_nineteen_digit_integer) && (p != int_end)) {
+ uval = uval * 10 + uint64_t(*p - '0');
 ++p;
 }
- if (i >= minimal_nineteen_digit_integer) { // We have a big integers
+ if (uval >= minimal_nineteen_digit_integer) { // We have a big integer
 exponent = end_of_integer_part - p + exp_number;
 } else { // We have a value with a fractional component. 
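+ // The integer part alone held fewer than 19 significant digits, so keep
+ // filling uval from the fraction; every fraction digit consumed here
+ // lowers the final decimal exponent by one.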
p = pns.fraction.data(); const char *frac_end = p + pns.fraction.size(); - while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { - i = i * 10 + uint64_t(*p - '0'); + while ((uval < minimal_nineteen_digit_integer) && (p != frac_end)) { + uval = uval * 10 + uint64_t(*p - '0'); ++p; } exponent = pns.fraction.data() - p + exp_number; } - // We have now corrected both exponent and i, to a truncated value + // We have now corrected both exponent and uval, to a truncated value } } pns.exponent = exponent; - pns.mantissa = i; + + T &float_v = wr.get_float(); // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode // selected on the thread. - // We proceed optimistically, assuming that detail::rounds_to_nearest() returns - // true. + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. if (binary_format::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format::max_exponent_fast_path() && !too_many_digits) { @@ -38648,17 +39085,17 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch if (detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(pns.mantissa); + if (uval <= binary_format::max_mantissa_fast_path()) { + float_v = T(uval); if (pns.exponent < 0) { - value = - value / binary_format::exact_power_of_ten(-pns.exponent); + float_v = + float_v / binary_format::exact_power_of_ten(-pns.exponent); } else { - value = - value * binary_format::exact_power_of_ten(pns.exponent); + float_v = + float_v * binary_format::exact_power_of_ten(pns.exponent); } if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } @@ -38667,46 +39104,44 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's // proposal if (pns.exponent >= 0 && - pns.mantissa <= - binary_format::max_mantissa_fast_path(pns.exponent)) { + uval <= binary_format::max_mantissa_fast_path(pns.exponent)) { #if defined(__clang__) // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa) * - binary_format::exact_power_of_ten(pns.exponent); + float_v = + T(uval) * binary_format::exact_power_of_ten(pns.exponent); if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } } } - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (too_many_digits && am.power2 >= 0) { - if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } - // If we called compute_float>(pns.exponent, pns.mantissa) and we - // have an invalid power (am.power2 < 0), then we need to go the long way around - // again. This is very uncommon. + // If we called compute_float>(pns.exponent, uval) + // and we have an invalid power (am.power2 < 0), then we need to go the long + // way around again. This is very uncommon. 
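+ // In that case digit_comp() settles the rounding with exact big-integer
+ // arithmetic on the saved integer/fraction spans instead of the 128-bit
+ // product approximation.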
if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. - if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -38719,6 +39154,15 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch digit_count = static_cast(p - start_digits); pns.integer = span(start_digits, static_cast(digit_count)); + if constexpr (is_constant_options) { + if (fmt & to_underlying(chars_format::__json_format)) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) { + return answer; + } + } + } + INTEGER: answer.ec = std::errc(); // be optimistic answer.ptr = p; @@ -38742,41 +39186,53 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch if (digit_count > 19) { p = start; - i = __from_chars_unroll_16<10>(reinterpret_cast(p), - char_converter); + uval = __from_chars_unroll_16<10>(reinterpret_cast(p), + char_converter); p += 16; - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); exponent = end_of_integer_part - p; pns.exponent = exponent; - pns.mantissa = i; WJR_ASSUME(exponent >= 0); - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + if constexpr (is_support_integral) { + constexpr uint64_t max_quot = std::numeric_limits::max() / 10; + constexpr uint32_t max_rem = std::numeric_limits::max() % 10; + + if (!pns.negative && digit_count == 20 && + (uval < max_quot || + (uval == max_quot && static_cast(*p - '0') <= max_rem))) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; + return answer; + } + } + + T &float_v = wr.get_float(); + + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (am.power2 >= 0) { - if (am != - compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } - // If we called compute_float>(pns.exponent, pns.mantissa) - // and we have an invalid power (am.power2 < 0), then we need to go the long - // way around again. This is very uncommon. + // If we called compute_float>(pns.exponent, + // uval) and we have an invalid power (am.power2 < 0), then we + // need to go the long way around again. This is very uncommon. if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -38786,60 +39242,44 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch } pns.exponent = 0; - pns.mantissa = i; - // Unfortunately, the conventional Clinger's fast path is only possible - // when the system rounds to the nearest float. - // - // We expect the next branch to almost always be selected. - // We could check it first (before the previous branch), but - // there might be performance advantages at having the check - // be last. - if (detail::rounds_to_nearest()) { - // We have that fegetround() == FE_TONEAREST. - // Next is Clinger's fast path. - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(pns.mantissa); - if (pns.negative) { - value = -value; - } + if constexpr (is_support_integral) { + if (!pns.negative) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; return answer; - } - } else { - // We do not have that fegetround() == FE_TONEAREST. - // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's - // proposal - if (pns.mantissa <= binary_format::max_mantissa_fast_path(0)) { -#if defined(__clang__) - // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); - return answer; - } -#endif - value = T(pns.mantissa); - if (pns.negative) { - value = -value; - } + } else if (uval <= static_cast(-std::numeric_limits::min())) { + int64_t &i64_v = wr.get_i64(); + i64_v = static_cast(-uval); return answer; } } - adjusted_mantissa am = compute_float>(0, pns.mantissa); + auto &float_v = wr.get_float(); - // If we called compute_float>(pns.exponent, pns.mantissa) and we - // have an invalid power (am.power2 < 0), then we need to go the long way around - // again. This is very uncommon. - if (am.power2 < 0) { - am.power2 -= invalid_am_bias; + if (WJR_LIKELY(uval <= binary_format::max_mantissa_fast_path())) { +#if defined(__clang__) + // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); + return answer; + } +#endif - const int32_t sci_exp = scientific_exponent(0, pns.mantissa); - am = digit_comp(am, pns.integer, pns.fraction, sci_exp); + float_v = T(uval); + if (pns.negative) { + float_v = -float_v; + } + + return answer; } - to_float(pns.negative, am, value); + adjusted_mantissa am = compute_integer>(uval); + WJR_ASSERT_ASSUME(am.power2 >= 0); + + to_float(pns.negative, am, float_v); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -38847,12 +39287,6 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch return answer; } -template -from_chars_result<> from_chars(const char *first, const char *last, T &value, - chars_format fmt /*= chars_format::general*/) noexcept { - return from_chars_advanced(first, last, value, fmt); -} - } // namespace wjr::fastfloat namespace wjr { diff --git a/include/wjr/format/fastfloat.hpp b/include/wjr/format/fastfloat.hpp index d23eaf5a..cabceb99 100644 --- a/include/wjr/format/fastfloat.hpp +++ b/include/wjr/format/fastfloat.hpp @@ -6,27 +6,43 @@ namespace wjr::fastfloat { -template +template +struct default_writer { + using float_type = T; + using support_integral = std::false_type; + + WJR_INTRINSIC_CONSTEXPR T &get_float() noexcept { return value; } + + T &value; +}; + +template WJR_NOINLINE from_chars_result<> __from_chars_impl(const char *first, const char *last, - T &value, Op options) noexcept; + Writer wr, Op options) noexcept; extern template from_chars_result<> -__from_chars_impl>( - const char *first, const char *last, float &value, +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, integral_constant options) noexcept; extern template from_chars_result<> -__from_chars_impl>( - const char *first, const char *last, double &value, +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, integral_constant options) noexcept; extern template from_chars_result<> -__from_chars_impl(const char *first, const char *last, float &value, - chars_format fmt) noexcept; +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; extern template from_chars_result<> -__from_chars_impl(const char *first, const char *last, - double &value, chars_format fmt) noexcept; +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; /** * This function parses the character sequence [first,last) for a number. It parses @@ -54,13 +70,13 @@ __from_chars_impl(const char *first, const char *last, template from_chars_result<> from_chars(const char *first, const char *last, float &value, integral_constant fmt = {}) noexcept { - return __from_chars_impl(first, last, value, fmt); + return __from_chars_impl(first, last, default_writer{value}, fmt); } template from_chars_result<> from_chars(const char *first, const char *last, double &value, integral_constant fmt = {}) noexcept { - return __from_chars_impl(first, last, value, fmt); + return __from_chars_impl(first, last, default_writer{value}, fmt); } template )> @@ -69,13 +85,11 @@ from_chars_result<> from_chars(const char *first, const char *last, T &value, if (WJR_BUILTIN_CONSTANT_P(fmt)) { if (fmt == chars_format::general) { return from_chars(first, last, value); - } else if (fmt == chars_format::json) { - return from_chars(first, last, value, - integral_constant{}); } } - return __from_chars_impl(first, last, value, fmt); + WJR_ASSERT(!(to_underlying(fmt) & to_underlying(chars_format::__json_format))); + return __from_chars_impl(first, last, default_writer{value}, fmt); } // Compares two ASCII strings in a case insensitive manner. 
@@ -1955,17 +1969,21 @@ WJR_INTRINSIC_INLINE bool rounds_to_nearest() noexcept { struct parsed_number_string { int64_t exponent{0}; - uint64_t mantissa{0}; bool negative{false}; // contains the range of the significant digits span integer{}; // non-nullable span fraction{}; // nullable }; -template -from_chars_result<> __from_chars_impl(const char *first, const char *last, T &value, +template +from_chars_result<> __from_chars_impl(const char *first, const char *last, Writer wr, Op options) noexcept { + static_assert(!std::is_reference_v, ""); + + using T = typename Writer::float_type; + constexpr bool is_support_integral = Writer::support_integral::value; constexpr bool is_constant_options = !std::is_same_v; + from_chars_result<> answer; if (WJR_UNLIKELY(first == last)) { @@ -1977,10 +1995,6 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va const char *p = first; const auto fmt = to_underlying(static_cast(options)); - if constexpr (!is_constant_options) { - WJR_ASSERT(!(fmt & to_underlying(chars_format::__json_format))); - } - parsed_number_string pns; pns.negative = (*p == '-'); @@ -1993,7 +2007,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va } const char *const start_digits = p; - uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) + uint64_t uval = 0; // an unsigned int avoids signed overflows (which are bad) const char *end_of_integer_part; int64_t digit_count; @@ -2022,7 +2036,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va do { // a multiplication by 10 is cheaper than an arbitrary integer // multiplication - i = 10 * i + ch; // might overflow, we will handle the overflow later + uval = 10 * uval + ch; // might overflow, we will handle the overflow later if (++p == last) { goto INTEGER_AT_END; @@ -2037,6 +2051,15 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va digit_count = static_cast(p - start_digits); pns.integer = span(start_digits, static_cast(digit_count)); + if constexpr (is_constant_options) { + if (fmt & to_underlying(chars_format::__json_format)) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) { + return answer; + } + } + } + if (*p != '.') { exponent = 0; if (*p == 'e' || *p == 'E') { @@ -2051,24 +2074,37 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va // can occur at most twice without overflowing, but let it occur more, since // for integers with many digits, digit parsing is the primary bottleneck. 
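+ // is_made_of_eight_digits_fast screens eight consecutive bytes at once and
+ // parse_eight_digits_unrolled converts them with word-sized (SWAR)
+ // arithmetic rather than eight serial multiply-adds.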
while ((std::distance(p, last) >= 8) && is_made_of_eight_digits_fast(p)) {
- i = i * 100000000 +
- parse_eight_digits_unrolled(
- p); // in rare cases, this will overflow, but that's ok
+ uval = uval * 100000000 +
+ parse_eight_digits_unrolled(
+ p); // in rare cases, this will overflow, but that's ok
 p += 8;
 }
 while ((p != last) && is_integer(*p)) {
 const auto digit = uint32_t(*p - '0');
 ++p;
- i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+ uval = uval * 10 + digit; // in rare cases, this will overflow, but that's ok
 }
 exponent = before - p;
 pns.fraction = span(before, size_t(p - before));
 digit_count -= exponent;
- if (WJR_UNLIKELY(digit_count == 0)) {
- return detail::parse_infnan(first, last, value);
+ auto &float_v = wr.get_float();
+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ if (WJR_UNLIKELY(exponent == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
 }
 } while (0);
@@ -2089,7 +2125,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va
 if ((p == last) || !is_integer(*p)) {
 if (!bool(fmt & to_underlying(chars_format::fixed))) {
 // We are in error.
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 // Otherwise, we will be ignoring the 'e'.
 p = location_of_e;
@@ -2110,7 +2146,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va
 // If it scientific and not fixed, we have to bail out.
 if (bool(fmt & to_underlying(chars_format::scientific)) &&
 !bool(fmt & to_underlying(chars_format::fixed))) {
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 }
@@ -2142,32 +2178,33 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va
 // Let us start again, this time, avoiding overflows.
 // We don't need to check if is_integer, since we use the
 // pre-tokenized spans from above.
- i = 0;
+ uval = 0;
 p = pns.integer.data();
 const char *int_end = p + pns.integer.size();
 constexpr uint64_t minimal_nineteen_digit_integer = 1000000000000000000ull;
- while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
- i = i * 10 + uint64_t(*p - '0');
+ while ((uval < minimal_nineteen_digit_integer) && (p != int_end)) {
+ uval = uval * 10 + uint64_t(*p - '0');
 ++p;
 }
- if (i >= minimal_nineteen_digit_integer) { // We have a big integers
+ if (uval >= minimal_nineteen_digit_integer) { // We have a big integer
 exponent = end_of_integer_part - p + exp_number;
 } else { // We have a value with a fractional component. 
p = pns.fraction.data(); const char *frac_end = p + pns.fraction.size(); - while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { - i = i * 10 + uint64_t(*p - '0'); + while ((uval < minimal_nineteen_digit_integer) && (p != frac_end)) { + uval = uval * 10 + uint64_t(*p - '0'); ++p; } exponent = pns.fraction.data() - p + exp_number; } - // We have now corrected both exponent and i, to a truncated value + // We have now corrected both exponent and uval, to a truncated value } } pns.exponent = exponent; - pns.mantissa = i; + + T &float_v = wr.get_float(); // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode @@ -2187,17 +2224,17 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va if (detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(pns.mantissa); + if (uval <= binary_format::max_mantissa_fast_path()) { + float_v = T(uval); if (pns.exponent < 0) { - value = - value / binary_format::exact_power_of_ten(-pns.exponent); + float_v = + float_v / binary_format::exact_power_of_ten(-pns.exponent); } else { - value = - value * binary_format::exact_power_of_ten(pns.exponent); + float_v = + float_v * binary_format::exact_power_of_ten(pns.exponent); } if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } @@ -2206,46 +2243,44 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's // proposal if (pns.exponent >= 0 && - pns.mantissa <= - binary_format::max_mantissa_fast_path(pns.exponent)) { + uval <= binary_format::max_mantissa_fast_path(pns.exponent)) { #if defined(__clang__) // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa) * - binary_format::exact_power_of_ten(pns.exponent); + float_v = + T(uval) * binary_format::exact_power_of_ten(pns.exponent); if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } } } - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (too_many_digits && am.power2 >= 0) { - if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } - // If we called compute_float>(pns.exponent, pns.mantissa) + // If we called compute_float>(pns.exponent, uval) // and we have an invalid power (am.power2 < 0), then we need to go the long // way around again. This is very uncommon. if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -2258,6 +2293,15 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va digit_count = static_cast(p - start_digits); pns.integer = span(start_digits, static_cast(digit_count)); + if constexpr (is_constant_options) { + if (fmt & to_underlying(chars_format::__json_format)) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) { + return answer; + } + } + } + INTEGER: answer.ec = std::errc(); // be optimistic answer.ptr = p; @@ -2281,41 +2325,53 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va if (digit_count > 19) { p = start; - i = __from_chars_unroll_16<10>(reinterpret_cast(p), - char_converter); + uval = __from_chars_unroll_16<10>(reinterpret_cast(p), + char_converter); p += 16; - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); exponent = end_of_integer_part - p; pns.exponent = exponent; - pns.mantissa = i; WJR_ASSUME(exponent >= 0); - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + if constexpr (is_support_integral) { + constexpr uint64_t max_quot = std::numeric_limits::max() / 10; + constexpr uint32_t max_rem = std::numeric_limits::max() % 10; + + if (!pns.negative && digit_count == 20 && + (uval < max_quot || + (uval == max_quot && static_cast(*p - '0') <= max_rem))) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; + return answer; + } + } + + T &float_v = wr.get_float(); + + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (am.power2 >= 0) { - if (am != - compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } // If we called compute_float>(pns.exponent, - // pns.mantissa) and we have an invalid power (am.power2 < 0), then we + // uval) and we have an invalid power (am.power2 < 0), then we // need to go the long way around again. This is very uncommon. if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. 
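+// Explicit instantiation definitions matching the extern template
+// declarations in include/wjr/format/fastfloat.hpp, so the out-of-line
+// parser is compiled only once, here.
+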
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -2325,31 +2381,44 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va } pns.exponent = 0; - pns.mantissa = i; - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { + if constexpr (is_support_integral) { + if (!pns.negative) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; + return answer; + } else if (uval <= static_cast(-std::numeric_limits::min())) { + int64_t &i64_v = wr.get_i64(); + i64_v = static_cast(-uval); + return answer; + } + } + + auto &float_v = wr.get_float(); + + if (WJR_LIKELY(uval <= binary_format::max_mantissa_fast_path())) { #if defined(__clang__) // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa); + float_v = T(uval); if (pns.negative) { - value = -value; + float_v = -float_v; } - + return answer; } - adjusted_mantissa am = compute_integer>(pns.mantissa); + adjusted_mantissa am = compute_integer>(uval); WJR_ASSERT_ASSUME(am.power2 >= 0); - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. - if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } diff --git a/src/wjr/format/fastfloat.cpp b/src/wjr/format/fastfloat.cpp index 4893a551..f97409de 100644 --- a/src/wjr/format/fastfloat.cpp +++ b/src/wjr/format/fastfloat.cpp @@ -1,4 +1,25 @@ #include -namespace wjr { -} \ No newline at end of file +namespace wjr::fastfloat { + +template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +template from_chars_result<> __from_chars_impl, chars_format>( + const char *first, const char *last, default_writer wr, + chars_format fmt) noexcept; + +template from_chars_result<> __from_chars_impl, chars_format>( + const char *first, const char *last, default_writer wr, + chars_format fmt) noexcept; + +} // namespace wjr::fastfloat \ No newline at end of file