From b65566971617767293031b7fb457257b65624a5a Mon Sep 17 00:00:00 2001
From: wjr <1966336874@qq.com>
Date: Mon, 5 Aug 2024 15:31:27 +0800
Subject: [PATCH] update

---
 godbolt/wjr.hpp                  | 6502 ++++++++++++++++--------------
 include/wjr/format/fastfloat.hpp |  245 +-
 src/wjr/format/fastfloat.cpp     |   25 +-
 3 files changed, 3648 insertions(+), 3124 deletions(-)

diff --git a/godbolt/wjr.hpp b/godbolt/wjr.hpp
index 4568adfb..1a07fca5 100644
--- a/godbolt/wjr.hpp
+++ b/godbolt/wjr.hpp
@@ -2921,11 +2921,17 @@ WJR_CONST constexpr bool in_range(U value) noexcept {
     }
 }
 
+template <typename T, typename U, typename Enable = void>
+struct __is_value_preserving_impl : std::false_type {};
+
 template <typename T, typename U>
-struct is_value_preserving
+struct __is_value_preserving_impl<T, U, std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
     : std::bool_constant<in_range<T>(std::numeric_limits<U>::min()) &&
                          in_range<T>(std::numeric_limits<U>::max())> {};
 
+template <typename T, typename U>
+struct is_value_preserving : __is_value_preserving_impl<T, U> {};
+
 template <typename T>
 struct is_value_preserving<T, T> : std::true_type {};
 
@@ -3567,9 +3573,20 @@ WJR_NODISCARD auto allocate_at_least(Allocator &alloc, SizeType count) {
 
 #ifndef WJR_X86_SIMD_SIMD_HPP__
 #define WJR_X86_SIMD_SIMD_HPP__
 
+#ifndef WJR_X86_SIMD_AVX_HPP__
+#define WJR_X86_SIMD_AVX_HPP__
+
+#ifndef WJR_X86_SIMD_SSE_HPP__
+#define WJR_X86_SIMD_SSE_HPP__
+
 #ifndef WJR_X86_SIMD_SIMD_CAST_HPP__
 #define WJR_X86_SIMD_SIMD_CAST_HPP__
 
+#include
+
+#ifndef WJR_SIMD_DETAIL_HPP__
+#define WJR_SIMD_DETAIL_HPP__
+
 #ifndef WJR_SIMD_SIMD_CAST_HPP__
 #define WJR_SIMD_SIMD_CAST_HPP__
 
 namespace wjr {
 
 template <typename From, typename To>
 struct simd_cast_fn;
 
 template <typename From, typename To>
 inline constexpr simd_cast_fn<From, To> simd_cast{};
 
 } // namespace wjr
 
 #endif // WJR_SIMD_SIMD_CAST_HPP__
-#ifndef WJR_X86_SIMD_INTRIN_HPP__
-#define WJR_X86_SIMD_INTRIN_HPP__
+#ifndef WJR_SIMD_SIMD_MASK_HPP__
+#define WJR_SIMD_SIMD_MASK_HPP__
+
+#ifndef WJR_ASSERT_HPP__
+#define WJR_ASSERT_HPP__
+
+/**
+ * @file assert.hpp
+ * @author wjr
+ * @brief Assertion utilities
+ *
+ * @details WJR_DEBUG_LEVEL : 0 ~ 3 \n
+ * 0 : Release \n
+ * 1 : Beta \n
+ * 2 : Runtime detection \n
+ * 3 : Maximum runtime detection, for debugging \n
+ * If WJR_DEBUG_LEVEL is not defined: \n
+ * if NDEBUG is defined, WJR_DEBUG_LEVEL defaults to 0; \n
+ * otherwise, WJR_DEBUG_LEVEL defaults to 1. \n
+ * WJR_ASSERT_L(level, expr) : Specifies the level of the assertion; \n
+ * the assertion is executed only if WJR_DEBUG_LEVEL is greater than or \n
+ * equal to that level. \n
+ * WJR_ASSERT(expr) : Equivalent to WJR_ASSERT_L(1, expr) \n
+ * WJR_ASSERT_L0(expr) : Always executes the assertion \n
+ *
+ * @version 0.1
+ * @date 2024-06-01
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+
+#include <iostream> // Already included
+
+#ifndef WJR_DEBUG_LEVEL
+#if defined(NDEBUG)
+#define WJR_DEBUG_LEVEL 0
+#else
+#define WJR_DEBUG_LEVEL 1
+#endif
+#endif
+
+#if WJR_DEBUG_LEVEL < 0 || WJR_DEBUG_LEVEL > 3
+#error "WJR_DEBUG_LEVEL must be 0 ~ 3"
+#endif
+
+namespace wjr {
+
+#define WJR_DEBUG_IF(level, expr0, expr1)                                                \
+    WJR_PP_BOOL_IF(WJR_PP_GT(WJR_DEBUG_LEVEL, level), expr0, expr1)
+
+WJR_NORETURN extern void __assert_failed(const char *expr, const char *file,
+                                         const char *func, int line) noexcept;
+
+// LCOV_EXCL_START
+
+/// @private
+template <typename... Args>
+WJR_NOINLINE void __assert_handler(const char *expr, const char *file, const char *func,
+                                   int line, Args &&...args) noexcept {
+    std::cerr << "Additional information: ";
+    (void)(std::cerr << ... << std::forward<Args>(args));
+    std::cerr << '\n';
+    __assert_failed(expr, file, func, line);
+}
+
+/// @private
+inline void __assert_handler(const char *expr, const char *file, const char *func,
+                             int line) noexcept {
+    __assert_failed(expr, file, func, line);
+}
+
+// LCOV_EXCL_STOP
+
+#define WJR_ASSERT_CHECK_I(expr, ...)                                                    \
+    do {                                                                                 \
+        if (WJR_UNLIKELY(!(expr))) {                                                     \
+            ::wjr::__assert_handler(#expr, WJR_FILE, WJR_CURRENT_FUNCTION, WJR_LINE,     \
+                                    ##__VA_ARGS__);                                      \
+        }                                                                                \
+    } while (0)
+
+// do nothing
+#define WJR_ASSERT_UNCHECK_I(expr, ...)                                                  \
+    do {                                                                                 \
+    } while (0)
+
+// level = [0, 2]
+// The higher the level, the less likely the assertion is to be checked.
+// Runtime detect : 1
+// Maximize detect : 2
+#define WJR_ASSERT_L(level, ...)                                                         \
+    WJR_DEBUG_IF(level, WJR_ASSERT_CHECK_I, WJR_ASSERT_UNCHECK_I)                        \
+    (__VA_ARGS__)
+
+// WJR_ASSERT_L0 is always checked; WJR_ASSERT defaults to level 1.
+#define WJR_ASSERT_L0(...) WJR_ASSERT_CHECK_I(__VA_ARGS__)
+#define WJR_ASSERT_L1(...) WJR_ASSERT_L(1, __VA_ARGS__)
+#define WJR_ASSERT_L2(...) WJR_ASSERT_L(2, __VA_ARGS__)
+#define WJR_ASSERT_L3(...) WJR_ASSERT_L(3, __VA_ARGS__)
+#define WJR_ASSERT(...) WJR_ASSERT_L1(__VA_ARGS__)
+
+#define WJR_ASSERT_ASSUME_L(level, ...)                                                  \
+    WJR_ASSERT_L(level, __VA_ARGS__);                                                    \
+    __WJR_ASSERT_ASSUME_L_ASSUME(__VA_ARGS__)
+#define __WJR_ASSERT_ASSUME_L_ASSUME(expr, ...) WJR_ASSUME(expr)
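+
+// A minimal usage sketch (illustrative only; `n` and `divisor` are
+// hypothetical variables): WJR_ASSERT expands to a level-1 check, while
+// WJR_ASSERT_L2 is a level-2 check that fires only at higher
+// WJR_DEBUG_LEVEL settings. Extra arguments are streamed to stderr by
+// __assert_handler before the process aborts.
+//
+//     WJR_ASSERT(divisor != 0, "divisor = ", divisor);
+//     WJR_ASSERT_L2(n <= 64);
+//     WJR_ASSERT_ASSUME(n != 0); // also informs the optimizer via WJR_ASSUME
+
+#define WJR_ASSERT_ASSUME_L0(...) \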
WJR_ASSERT_ASSUME_L(0, __VA_ARGS__) +#define WJR_ASSERT_ASSUME_L1(...) WJR_ASSERT_ASSUME_L(1, __VA_ARGS__) +#define WJR_ASSERT_ASSUME_L2(...) WJR_ASSERT_ASSUME_L(2, __VA_ARGS__) +#define WJR_ASSERT_ASSUME_L3(...) WJR_ASSERT_ASSUME_L(3, __VA_ARGS__) +#define WJR_ASSERT_ASSUME(...) WJR_ASSERT_ASSUME_L1(__VA_ARGS__) -template <> -struct simd_cast_fn<__m128d_t, __m128_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128 operator()(__m128d v) const { - return _mm_castpd_ps(v); - } -}; +} // namespace wjr -template <> -struct simd_cast_fn<__m128d_t, __m128i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128d v) const { - return _mm_castpd_si128(v); - } -}; +#endif // WJR_ASSERT_HPP__ +#ifndef WJR_MATH_CLZ_HPP__ +#define WJR_MATH_CLZ_HPP__ -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int8_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +// Already included +#ifndef WJR_MATH_POPCOUNT_HPP__ +#define WJR_MATH_POPCOUNT_HPP__ -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +#ifndef WJR_MATH_DETAIL_HPP__ +#define WJR_MATH_DETAIL_HPP__ -template <> -struct simd_cast_fn<__m128i_t, int8_t> { - WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +// Already included -template <> -struct simd_cast_fn<__m128i_t, uint8_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +namespace wjr { -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int16_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { - return _mm_cvtsi32_si128(v); - } -}; +namespace math_detail { -template <> -struct simd_cast_fn<__m128i_t, int16_t> { - WJR_CONST WJR_INTRINSIC_INLINE int16_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +template +class de_bruijn { +public: + constexpr static uint8_t digits = std::numeric_limits::digits; + constexpr static uint8_t mv = digits == 32 ? 27 : 58; + constexpr de_bruijn() noexcept : lookup(), lookupr() { initialize(); } -template <> -struct simd_cast_fn<__m128i_t, uint16_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint16_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; + constexpr int get(T idx) const noexcept { return lookup[(idx * seed) >> mv]; } + constexpr int getr(T idx) const noexcept { return lookupr[(idx * seed) >> mv]; } -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int32_t v) const { - return _mm_cvtsi32_si128(v); +private: + constexpr void initialize() noexcept { + for (uint8_t i = 0; i < digits; ++i) { + const auto idx = (seed << i) >> mv; + lookup[idx] = i; + lookupr[idx] = i == 0 ? 
0 : digits - i; + } } -}; -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { - return _mm_cvtsi32_si128(v); - } + uint8_t lookup[digits]; + uint8_t lookupr[digits]; }; -template <> -struct simd_cast_fn<__m128i_t, int32_t> { - WJR_CONST WJR_INTRINSIC_INLINE int32_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +inline constexpr de_bruijn de_bruijn32 = {}; +inline constexpr de_bruijn de_bruijn64 = {}; -template <> -struct simd_cast_fn<__m128i_t, uint32_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint32_t operator()(__m128i v) const { - return _mm_cvtsi128_si32(v); - } -}; +} // namespace math_detail -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int64_t v) const { - return _mm_cvtsi64_si128(v); - } -}; +#endif -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { - return _mm_cvtsi64_si128(v); - } -}; +/** + * @brief + * + * @note `n & -n` is the lowest bit of n. + */ +template )> +WJR_CONST constexpr T lowbit(T n) noexcept { + return n & -n; +} -template <> -struct simd_cast_fn<__m128i_t, int64_t> { - WJR_CONST WJR_INTRINSIC_INLINE int64_t operator()(__m128i v) const { - return _mm_cvtsi128_si64(v); - } -}; +template )> +WJR_CONST constexpr T clear_lowbit(T n) noexcept { + return n & (n - 1); +} -template <> -struct simd_cast_fn<__m128i_t, uint64_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint64_t operator()(__m128i v) const { - return _mm_cvtsi128_si64(v); - } -}; +// preview : -#endif // SSE2 +template )> +WJR_CONST constexpr bool is_zero_or_single_bit(T n) noexcept { + return (n & (n - 1)) == 0; +} -#if WJR_HAS_SIMD(AVX) +template )> +WJR_CONST constexpr bool __has_high_bit(T n) noexcept { + return n >> (std::numeric_limits::digits - 1); +} -struct __m256_t { - using type = __m256; -}; +template )> +WJR_CONST constexpr T __ceil_div(T n, type_identity_t div) noexcept { + return (n + div - 1) / div; +} -struct __m256i_t { - using type = __m256i; -}; +template )> +WJR_CONST constexpr T __align_down(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return n & (-alignment); +} -struct __m256d_t { - using type = __m256d; -}; +template )> +WJR_CONST constexpr T __align_down_offset(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return n & (alignment - 1); +} -template <> -struct simd_cast_fn<__m256_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256 v) const { - return _mm256_castps_si256(v); - } -}; +template )> +WJR_CONST constexpr T __align_up(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return (n + alignment - 1) & (-alignment); +} -template <> -struct simd_cast_fn<__m256_t, __m256d_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256 v) const { - return _mm256_castps_pd(v); - } -}; +template )> +WJR_CONST constexpr T __align_up_offset(T n, type_identity_t alignment) noexcept { + WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); + return (-n) & (alignment - 1); +} -template <> -struct simd_cast_fn<__m256i_t, __m256_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256i v) const { - return _mm256_castsi256_ps(v); - } -}; +template )> +WJR_CONST constexpr std::make_signed_t __fasts_from_unsigned(T x) noexcept { + const std::make_signed_t ret = x; + WJR_ASSERT_ASSUME_L2(ret >= 0, "overflow"); + return ret; +} -template <> 
-struct simd_cast_fn<__m256i_t, __m256d_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256i v) const { - return _mm256_castsi256_pd(v); - } -}; +template , + WJR_REQUIRES(is_nonbool_signed_integral_v)> +WJR_CONST constexpr U __fasts_abs(T x) noexcept { + return static_cast(x < 0 ? -x : x); +} -template <> -struct simd_cast_fn<__m256d_t, __m256_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256d v) const { - return _mm256_castpd_ps(v); - } -}; +template )> +WJR_CONST constexpr T __fasts_negate(T x) noexcept { + return -x; +} -template <> -struct simd_cast_fn<__m256d_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256d v) const { - return _mm256_castpd_si256(v); - } -}; +template , + WJR_REQUIRES(is_nonbool_signed_integral_v)> +WJR_CONST constexpr T __fasts_conditional_negate(bool condition, T x) noexcept { + return condition ? -x : x; +} -template <> -struct simd_cast_fn<__m128i_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { - return _mm256_castsi128_si256(v); - } -}; +template , + WJR_REQUIRES(is_nonbool_signed_integral_v)> +WJR_CONST constexpr T __fasts_negate_with(T condition, T x) noexcept { + return __fasts_conditional_negate(condition < 0, x); +} -template <> -struct simd_cast_fn<__m256i_t, __m128i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m256i v) const { - return _mm256_castsi256_si128(v); - } -}; +template )> +WJR_CONST constexpr T __fasts_increment(T x) noexcept { + WJR_ASSERT_L2(x != std::numeric_limits::min() && + x != std::numeric_limits::max(), + "overflow"); -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int8_t v) const { - return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); - } -}; + return x < 0 ? x - 1 : x + 1; +} -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { - return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); - } -}; +template )> +WJR_CONST constexpr T __fasts_decrement(T x) noexcept { + WJR_ASSERT_L2(x != 0 && x + 1 != T(0), "overflow"); -template <> -struct simd_cast_fn<__m256i_t, int8_t> { - WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m256i v) const { - return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); - } -}; + return x < 0 ? x + 1 : x - 1; +} -template <> -struct simd_cast_fn<__m256i_t, uint8_t> { - WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m256i v) const { - return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); - } -}; +template )> +WJR_CONST constexpr T __fasts_add(T x, std::make_unsigned_t y) noexcept { + return x < 0 ? x - y : x + y; +} -template <> -struct simd_cast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int16_t v) const { - return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); - } -}; +template )> +WJR_CONST constexpr T __fasts_sub(T x, std::make_unsigned_t y) noexcept { + return x < 0 ? 
x + y : x - y;
}

} // namespace wjr

#endif // WJR_MATH_DETAIL_HPP__

namespace wjr {

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR int fallback_popcount(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;
    if constexpr (nd < 32) {
        return fallback_popcount(static_cast<uint32_t>(x));
    } else {
        if constexpr (nd == 32) {
            x -= (x >> 1) & 0x5555'5555;
            x = (x & 0x3333'3333) + ((x >> 2) & 0x3333'3333);
            x = (x + (x >> 4)) & 0x0f0f'0f0f;
            return (x * 0x0101'0101) >> 24;
        } else {
            x -= (x >> 1) & 0x5555'5555'5555'5555;
            x = (x & 0x3333'3333'3333'3333) + ((x >> 2) & 0x3333'3333'3333'3333);
            x = (x + (x >> 4)) & 0x0f0f'0f0f'0f0f'0f0f;
            return (x * 0x0101'0101'0101'0101) >> 56;
        }
    }
}

#if WJR_HAS_BUILTIN(POPCOUNT)

template <typename T>
WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;
    if constexpr (nd < 32) {
        return builtin_popcount(static_cast<uint32_t>(x));
    } else {
        if constexpr (nd <= std::numeric_limits<unsigned int>::digits) {
            return __builtin_popcount(x);
        } else if constexpr (nd <= std::numeric_limits<unsigned long>::digits) {
            return __builtin_popcountl(x);
        } else if constexpr (nd <= std::numeric_limits<unsigned long long>::digits) {
            return __builtin_popcountll(x);
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
    }
}

#endif // WJR_HAS_BUILTIN(POPCOUNT)

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount_impl(T x) noexcept {
    if (WJR_BUILTIN_CONSTANT_P_TRUE(is_zero_or_single_bit(x))) {
        return x != 0;
    }

#if WJR_HAS_BUILTIN(POPCOUNT)
    if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
        return fallback_popcount(x);
    }

    return builtin_popcount(x);
#else
    return fallback_popcount(x);
#endif
}

template <typename T, WJR_REQUIRES(is_nonbool_unsigned_integral_v<T>)>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount(T x) noexcept {
    const int ret = popcount_impl(x);
    WJR_ASSUME(0 <= ret && ret <= std::numeric_limits<T>::digits);
    return ret;
}

} // namespace wjr

#endif // WJR_MATH_POPCOUNT_HPP__
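
// Worked example of the SWAR fallback above (illustrative, not part of the
// original source): for x = 0xF0F0'F0F0 the first step leaves 2-bit partial
// sums, the next two steps reduce them to a per-byte bit count (0x04040404),
// and the multiply by 0x0101'0101 accumulates all four bytes into the top
// byte, so (x * 0x0101'0101) >> 24 == 16 == popcount(0xF0F0'F0F0).

#if WJR_HAS_BUILTIN(__builtin_clz)
#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF
#elif defined(WJR_MSVC) && defined(WJR_X86)
#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF_VAR(2)
#endif

#if WJR_HAS_BUILTIN(CLZ) == 2
#ifndef WJR_X86_SIMD_INTRIN_HPP__
#define WJR_X86_SIMD_INTRIN_HPP__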

// Already included

#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__)
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#endif

#endif // WJR_X86_SIMD_INTRIN_HPP__
#endif

namespace wjr {

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_clz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;

    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);

    if constexpr (nd >= 16) {
        x |= (x >> 8);
    }

    if constexpr (nd >= 32) {
        x |= (x >> 16);
    }

    if constexpr (nd >= 64) {
        x |= (x >> 32);
    }

    return fallback_popcount(~x);
}

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_clz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;

#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT))
    if constexpr (nd >= 32) {
#endif
        x |= (x >> 1);
        x |= (x >> 2);
        x |= (x >> 4);

        if constexpr (nd >= 16) {
            x |= (x >> 8);
        }

        if constexpr (nd >= 32) {
            x |= (x >> 16);
        }

        if constexpr (nd >= 64) {
            x |= (x >> 32);
        }
#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT))
    }
#endif

#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)
    return popcount(~x);
#else
    if constexpr (nd < 32) {
        return fallback_clz(static_cast<uint32_t>(x)) - (32 - nd);
    } else {
        ++x;

        if constexpr (nd <= 32) {
            return math_detail::de_bruijn32.getr(x);
        } else if constexpr (nd <= 64) {
            return math_detail::de_bruijn64.getr(x);
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
    }
#endif
}

#if WJR_HAS_BUILTIN(CLZ)

template <typename T>
WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;
    if constexpr (nd < 32) {
        return builtin_clz(static_cast<uint32_t>(x)) - (32 - nd);
    } else {
#if WJR_HAS_BUILTIN(CLZ) == 1
        if constexpr (nd <= std::numeric_limits<unsigned int>::digits) {
            constexpr auto delta = std::numeric_limits<unsigned int>::digits - nd;
            return __builtin_clz(static_cast<unsigned int>(x)) - delta;
        } else if constexpr (nd <= std::numeric_limits<unsigned long>::digits) {
            constexpr auto delta = std::numeric_limits<unsigned long>::digits - nd;
            return __builtin_clzl(static_cast<unsigned long>(x)) - delta;
        } else if constexpr (nd <= std::numeric_limits<unsigned long long>::digits) {
            constexpr auto delta = std::numeric_limits<unsigned long long>::digits - nd;
            return __builtin_clzll(static_cast<unsigned long long>(x)) - delta;
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
#else
        if constexpr (nd == 32) {
            unsigned long result;
            (void)_BitScanReverse(&result, x);
            return 31 - result;
        } else {
            unsigned long result;
            (void)_BitScanReverse64(&result, x);
            return 63 - result;
        }
#endif
    }
}

#endif

/**
 * @brief Fast count leading zeros
 *
 * @tparam T Must be an unsigned integral type
 */
template <typename T, WJR_REQUIRES(is_nonbool_unsigned_integral_v<T>)>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int clz(T x) noexcept {
#if WJR_HAS_BUILTIN(CLZ)
    if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
        return fallback_clz(x);
    }

    return builtin_clz(x);
#else
    return fallback_clz(x);
#endif
}

} // namespace wjr

#endif // WJR_MATH_CLZ_HPP__
#ifndef WJR_MATH_CTZ_HPP__
#define WJR_MATH_CTZ_HPP__

// Already included
// Already included

#if WJR_HAS_BUILTIN(__builtin_ctz)
#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF
#elif defined(WJR_MSVC) && defined(WJR_X86)
#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF_VAR(2)
#endif

#if WJR_HAS_BUILTIN(CTZ) == 2
// Already included
#endif

namespace wjr {

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_ctz(T x) noexcept {
    return fallback_popcount(lowbit(x) - 1);
}

template <typename T>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_ctz(T x) noexcept {
#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)
    return popcount(lowbit(x) - 1);
#else
    constexpr auto nd = std::numeric_limits<T>::digits;

    if constexpr (nd < 32) {
        return fallback_ctz(static_cast<uint32_t>(x));
    } else {
        x = lowbit(x);

        if constexpr (nd <= 32) {
            return math_detail::de_bruijn32.get(x);
        } else if constexpr (nd <= 64) {
            return math_detail::de_bruijn64.get(x);
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
    }
#endif //
}

#if WJR_HAS_BUILTIN(CTZ)

template <typename T>
WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept {
    constexpr auto nd = std::numeric_limits<T>::digits;

    if constexpr (nd < 32) {
        return builtin_ctz(static_cast<uint32_t>(x));
    } else {
#if WJR_HAS_BUILTIN(CTZ) == 1
        if constexpr (nd <= std::numeric_limits<unsigned int>::digits) {
            return __builtin_ctz(static_cast<unsigned int>(x));
        } else if constexpr (nd <= std::numeric_limits<unsigned long>::digits) {
            return __builtin_ctzl(static_cast<unsigned long>(x));
        } else if constexpr (nd <= std::numeric_limits<unsigned long long>::digits) {
            return __builtin_ctzll(static_cast<unsigned long long>(x));
        } else {
            static_assert(nd <= 64, "not supported yet");
        }
#else
        if constexpr (nd == 32) {
            unsigned long result;
            (void)_BitScanForward(&result, x);
            return result;
        } else {
            unsigned long result;
            (void)_BitScanForward64(&result, x);
            return result;
        }
#endif
    }
}

#endif

/**
 * @brief Fast count trailing zeros
 *
 * @details Very fast even on non-optimized platforms by using a De Bruijn sequence. \n
 * Tries __builtin_ctz if available, otherwise falls back to a portable
 * implementation. \n
 * In fallback_ctz, popcount and lowbit are used when POPCOUNT and POPCNT are
 * available, since popcount is then fast. \n
 * Otherwise a De Bruijn sequence is used, just a bit slower than popcount + lowbit.
 *
 * @tparam T Must be an unsigned integral type
 */
template <typename T, WJR_REQUIRES(is_nonbool_unsigned_integral_v<T>)>
WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int ctz(T x) noexcept {
#if WJR_HAS_BUILTIN(CTZ)
    if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
        return fallback_ctz(x);
    }

    return builtin_ctz(x);
#else
    return fallback_ctz(x);
#endif
}
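
// Usage sketch (illustrative, not part of the original source): ctz returns
// the index of the lowest set bit, and the argument must be nonzero, matching
// the precondition of the underlying builtins:
//
//     static_assert(wjr::ctz(uint32_t(1)) == 0);
//     static_assert(wjr::ctz(uint64_t(0x8)) == 3);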

} // namespace wjr

#endif // WJR_MATH_CTZ_HPP__
// Already included

namespace wjr::simd_detail {

template <size_t BitWidth, size_t Size>
class basic_simd_mask {
    using mask_type = uint_t<BitWidth>;
    constexpr static size_t __mask_bits = BitWidth / Size;
    constexpr static mask_type __half_mask =
        static_cast<uint_t<BitWidth / 2>>(in_place_max);
    constexpr static mask_type __full_mask = in_place_max;

public:
    WJR_ENABLE_DEFAULT_SPECIAL_MEMBERS(basic_simd_mask);

    constexpr basic_simd_mask(mask_type mask) noexcept : m_mask(mask) {}

    WJR_PURE WJR_CONSTEXPR20 int clz() const noexcept {
        WJR_ASSERT_ASSUME(m_mask != 0);

        if constexpr (Size == 2) {
            constexpr auto high_mask = __half_mask << (BitWidth / 2);

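            // With only two elements, the count of leading zero elements is 0
            // when any bit of the high half is set, and 1 otherwise.
            return (m_mask & high_mask) ? 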
0 : 1; + } else { + return ::wjr::clz(m_mask) / __mask_bits; + } + } - template - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b); + WJR_PURE WJR_CONSTEXPR20 int ctz() const noexcept { + WJR_ASSERT_ASSUME(m_mask != 0); - WJR_INTRINSIC_INLINE static __m128i alignr_epi16(__m128i a, __m128i b, int c); - WJR_INTRINSIC_INLINE static __m128i alignr_epi32(__m128i a, __m128i b, int c); - WJR_INTRINSIC_INLINE static __m128i alignr_epi64(__m128i a, __m128i b, int c); + if constexpr (Size == 2) { + constexpr auto low_mask = __half_mask; - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int16_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int32_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int64_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint16_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint32_t); - WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint64_t); + return (m_mask & low_mask) ? 0 : 1; + } else { + return ::wjr::ctz(m_mask) / __mask_bits; + } + } - WJR_INTRINSIC_INLINE static __m128i And(__m128i a, __m128i b); + WJR_PURE constexpr bool all() const noexcept { return m_mask == __full_mask; } - WJR_INTRINSIC_INLINE static __m128i AndNot(__m128i a, __m128i b); +private: + mask_type m_mask; +}; - WJR_INTRINSIC_INLINE static __m128i avg_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i avg_epu16(__m128i a, __m128i b); +} // namespace wjr::simd_detail - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint16_t); +#endif // WJR_SIMD_SIMD_MASK_HPP__ - // notice that mask must be 0 or 255(every byte) - WJR_INTRINSIC_INLINE static __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask); - WJR_INTRINSIC_INLINE static __m128i blendv_epi16(__m128i a, __m128i b, __m128i mask); - WJR_INTRINSIC_INLINE static __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask); +namespace wjr { - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - int8_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - int16_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - int32_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - uint8_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - uint16_t); - WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, - uint32_t); +namespace simd_abi { - template - WJR_INTRINSIC_INLINE static __m128i bslli(__m128i val); +template +struct fixed_size {}; - template - WJR_INTRINSIC_INLINE static __m128i bsrli(__m128i val); +} // namespace simd_abi - WJR_INTRINSIC_INLINE static __m128i cmpeq_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpeq_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpeq_epi32(__m128i a, __m128i b); +struct element_aligned_t {}; +inline constexpr element_aligned_t element_aligned{}; - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int32_t); - 
WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint32_t); +struct vector_aligned_t {}; +inline constexpr vector_aligned_t vector_aligned{}; - WJR_INTRINSIC_INLINE static __m128i cmpge_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epi32(__m128i a, __m128i b); +template +class simd; - WJR_INTRINSIC_INLINE static __m128i cmpge_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpge_epu32(__m128i a, __m128i b); +template +using fixed_size_simd = simd>; - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint32_t); +} // namespace wjr - WJR_INTRINSIC_INLINE static __m128i cmpgt_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epi32(__m128i a, __m128i b); +#endif // WJR_SIMD_DETAIL_HPP__ +// Already included - WJR_INTRINSIC_INLINE static __m128i cmpgt_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpgt_epu32(__m128i a, __m128i b); +namespace wjr { - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint32_t); +// simd type can't be directly used on template +template +struct simd_wrapper { + using type = T; +}; - WJR_INTRINSIC_INLINE static __m128i cmple_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epi32(__m128i a, __m128i b); +template +using simd_wrapper_t = typename simd_wrapper::type; - WJR_INTRINSIC_INLINE static __m128i cmple_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmple_epu32(__m128i a, __m128i b); +#if WJR_HAS_SIMD(SSE) - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint32_t); +struct __m128_t { + using type = __m128; +}; - WJR_INTRINSIC_INLINE static __m128i cmplt_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmplt_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE 
static __m128i cmplt_epi32(__m128i a, __m128i b); +#endif // SSE - WJR_INTRINSIC_INLINE static __m128i cmplt_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmplt_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmplt_epu32(__m128i a, __m128i b); +#if WJR_HAS_SIMD(SSE2) - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint32_t); +struct __m128i_t { + using type = __m128i; +}; - WJR_INTRINSIC_INLINE static __m128i cmpne_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpne_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i cmpne_epi32(__m128i a, __m128i b); +struct __m128d_t { + using type = __m128d; +}; - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint32_t); +template <> +struct simd_cast_fn<__m128_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128 v) const { + return _mm_castps_si128(v); + } +}; - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::not_equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater_equal<>, - T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less<>, T); - template - WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less_equal<>, T); +template <> +struct simd_cast_fn<__m128_t, __m128d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128d operator()(__m128 v) const { + return _mm_castps_pd(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i concat(uint64_t lo, uint64_t hi); +template <> +struct simd_cast_fn<__m128i_t, __m128_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128 operator()(__m128i v) const { + return _mm_castsi128_ps(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract_epi8(__m128i a); +template <> +struct simd_cast_fn<__m128i_t, __m128d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128d operator()(__m128i v) const { + return _mm_castsi128_pd(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract_epi16(__m128i a); - template - WJR_INTRINSIC_INLINE static int extract_epi32(__m128i a); - template - WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m128i a); +template <> +struct simd_cast_fn<__m128d_t, __m128_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128 operator()(__m128d v) const { + return _mm_castpd_ps(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, int8_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, int16_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, int32_t); - template - WJR_INTRINSIC_INLINE 
static int64_t extract(__m128i a, int64_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, uint8_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, uint16_t); - template - WJR_INTRINSIC_INLINE static int extract(__m128i a, uint32_t); - template - WJR_INTRINSIC_INLINE static int64_t extract(__m128i a, uint64_t); +template <> +struct simd_cast_fn<__m128d_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128d v) const { + return _mm_castpd_si128(v); + } +}; - WJR_INTRINSIC_INLINE static uint64_t getlow(__m128i v); - WJR_INTRINSIC_INLINE static uint64_t gethigh(__m128i v); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int8_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template - WJR_INTRINSIC_INLINE static __m128i insert_epi16(__m128i a, int i); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template - WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, int16_t); - template - WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, uint16_t); +template <> +struct simd_cast_fn<__m128i_t, int8_t> { + WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static void lfence(); +template <> +struct simd_cast_fn<__m128i_t, uint8_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i load(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si16(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si32(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si64(const void *ptr); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int16_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template )> - WJR_INTRINSIC_INLINE static __m128i logical_and(__m128i a, __m128i b, T); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - template )> - WJR_INTRINSIC_INLINE static __m128i logical_not(__m128i v, T); +template <> +struct simd_cast_fn<__m128i_t, int16_t> { + WJR_CONST WJR_INTRINSIC_INLINE int16_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - template )> - WJR_INTRINSIC_INLINE static __m128i logical_or(__m128i a, __m128i b, T); +template <> +struct simd_cast_fn<__m128i_t, uint16_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint16_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i madd_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int32_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - WJR_INTRINSIC_INLINE static void maskmoveu(__m128i a, __m128i mask, char *mem_addr); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { + return _mm_cvtsi32_si128(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i max_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epi32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m128i_t, int32_t> { + WJR_CONST WJR_INTRINSIC_INLINE int32_t operator()(__m128i v) const { + 
return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i max_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i max_epu32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m128i_t, uint32_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint32_t operator()(__m128i v) const { + return _mm_cvtsi128_si32(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint32_t); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(int64_t v) const { + return _mm_cvtsi64_si128(v); + } +}; - WJR_INTRINSIC_INLINE static int8_t max_epi8(__m128i a); - WJR_INTRINSIC_INLINE static int16_t max_epi16(__m128i a); - WJR_INTRINSIC_INLINE static int32_t max_epi32(__m128i a); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { + return _mm_cvtsi64_si128(v); + } +}; - WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m128i a); - WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m128i a); - WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m128i a); +template <> +struct simd_cast_fn<__m128i_t, int64_t> { + WJR_CONST WJR_INTRINSIC_INLINE int64_t operator()(__m128i v) const { + return _mm_cvtsi128_si64(v); + } +}; - WJR_INTRINSIC_INLINE static int8_t max(__m128i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t max(__m128i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t max(__m128i a, int32_t); - WJR_INTRINSIC_INLINE static uint8_t max(__m128i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t max(__m128i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t max(__m128i a, uint32_t); +template <> +struct simd_cast_fn<__m128i_t, uint64_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint64_t operator()(__m128i v) const { + return _mm_cvtsi128_si64(v); + } +}; - WJR_INTRINSIC_INLINE static void mfence(); +#endif // SSE2 - WJR_INTRINSIC_INLINE static __m128i min_epi8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epi32(__m128i a, __m128i b); +#if WJR_HAS_SIMD(AVX) - WJR_INTRINSIC_INLINE static __m128i min_epu8(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epu16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i min_epu32(__m128i a, __m128i b); +struct __m256_t { + using type = __m256; +}; - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int8_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint8_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint32_t); +struct __m256i_t { + using type = __m256i; +}; - WJR_INTRINSIC_INLINE static int8_t min_epi8(__m128i a); - WJR_INTRINSIC_INLINE static int16_t min_epi16(__m128i a); - WJR_INTRINSIC_INLINE static int32_t min_epi32(__m128i a); +struct __m256d_t { + using type = __m256d; +}; - WJR_INTRINSIC_INLINE static uint8_t 
min_epu8(__m128i a); - WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m128i a); - WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m128i a); +template <> +struct simd_cast_fn<__m256_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256 v) const { + return _mm256_castps_si256(v); + } +}; - WJR_INTRINSIC_INLINE static int8_t min(__m128i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t min(__m128i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t min(__m128i a, int32_t); +template <> +struct simd_cast_fn<__m256_t, __m256d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256 v) const { + return _mm256_castps_pd(v); + } +}; - WJR_INTRINSIC_INLINE static uint8_t min(__m128i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t min(__m128i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t min(__m128i a, uint32_t); +template <> +struct simd_cast_fn<__m256i_t, __m256_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256i v) const { + return _mm256_castsi256_ps(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i move_epi64(__m128i a); +template <> +struct simd_cast_fn<__m256i_t, __m256d_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256d operator()(__m256i v) const { + return _mm256_castsi256_pd(v); + } +}; - WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m128i a); - WJR_INTRINSIC_INLINE static mask_type movemask_pd(__m128d v); +template <> +struct simd_cast_fn<__m256d_t, __m256_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256 operator()(__m256d v) const { + return _mm256_castpd_ps(v); + } +}; - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int8_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int32_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int64_t); +template <> +struct simd_cast_fn<__m256d_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256d v) const { + return _mm256_castpd_si256(v); + } +}; - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint8_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint32_t); - WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint64_t); +template <> +struct simd_cast_fn<__m128i_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { + return _mm256_castsi128_si256(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i mul_epu32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m256i v) const { + return _mm256_castsi256_si128(v); + } +}; - WJR_INTRINSIC_INLINE static __m128i mulhi_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int8_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i mulhi_epu16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i mullo_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, int8_t> { + WJR_CONST WJR_INTRINSIC_INLINE int8_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i negate_epi8(__m128i a); - WJR_INTRINSIC_INLINE static __m128i negate_epi16(__m128i a); - WJR_INTRINSIC_INLINE static __m128i negate_epi32(__m128i a); - WJR_INTRINSIC_INLINE static __m128i 
negate_epi64(__m128i a); +template <> +struct simd_cast_fn<__m256i_t, uint8_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint8_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int8_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int16_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int32_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int64_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint8_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint16_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint32_t); - WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint64_t); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int16_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i Not(__m128i v); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint16_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i Or(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, int16_t> { + WJR_CONST WJR_INTRINSIC_INLINE int16_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i packs_epi16(__m128i a, __m128i b); - WJR_INTRINSIC_INLINE static __m128i packs_epi32(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, uint16_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint16_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i packus_epi16(__m128i a, __m128i b); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int32_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i loadu_si48(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si80(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si96(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si112(const void *ptr); - WJR_INTRINSIC_INLINE static __m128i loadu_si128(const void *ptr); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint32_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i loadu_si16x(const void *ptr, int n); +template <> +struct simd_cast_fn<__m256i_t, int32_t> { + WJR_CONST WJR_INTRINSIC_INLINE int32_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i sad_epu8(__m128i a, __m128i b); +template <> +struct simd_cast_fn<__m256i_t, uint32_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint32_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint32_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i zeros(); - WJR_INTRINSIC_INLINE static __m128i ones(); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(int64_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i set_epi8(char e15, char e14, char e13, char e12, - char e11, char e10, char e9, char e8, - char e7, char e6, char e5, char e4, 
- char e3, char e2, char e1, char e0); +template <> +struct simd_cast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint64_t v) const { + return simd_cast<__m128i_t, __m256i_t>(simd_cast(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i set_epi16(short e7, short e6, short e5, short e4, - short e3, short e2, short e1, short e0); - WJR_INTRINSIC_INLINE static __m128i set_epi32(int e3, int e2, int e1, int e0); - WJR_INTRINSIC_INLINE static __m128i set_epi64x(long long e1, long long e0); +template <> +struct simd_cast_fn<__m256i_t, int64_t> { + WJR_CONST WJR_INTRINSIC_INLINE int64_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint64_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i setr_epi8(char e15, char e14, char e13, char e12, - char e11, char e10, char e9, char e8, - char e7, char e6, char e5, char e4, - char e3, char e2, char e1, char e0); +template <> +struct simd_cast_fn<__m256i_t, uint64_t> { + WJR_CONST WJR_INTRINSIC_INLINE uint64_t operator()(__m256i v) const { + return simd_cast<__m128i_t, uint64_t>(simd_cast<__m256i_t, __m128i_t>(v)); + } +}; - WJR_INTRINSIC_INLINE static __m128i setr_epi16(short e7, short e6, short e5, short e4, - short e3, short e2, short e1, - short e0); - WJR_INTRINSIC_INLINE static __m128i setr_epi32(int e3, int e2, int e1, int e0); +#endif // AVX - WJR_INTRINSIC_INLINE static __m128i set1_epi8(int8_t val); - WJR_INTRINSIC_INLINE static __m128i set1_epi16(int16_t val); - WJR_INTRINSIC_INLINE static __m128i set1_epi32(int32_t val); - WJR_INTRINSIC_INLINE static __m128i set1_epi64(int64_t val); +} // namespace wjr - WJR_INTRINSIC_INLINE static __m128i set1(int8_t val, int8_t); - WJR_INTRINSIC_INLINE static __m128i set1(int16_t val, int16_t); - WJR_INTRINSIC_INLINE static __m128i set1(int32_t val, int32_t); - WJR_INTRINSIC_INLINE static __m128i set1(int64_t val, int64_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint8_t val, uint8_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint16_t val, uint16_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint32_t val, uint32_t); - WJR_INTRINSIC_INLINE static __m128i set1(uint64_t val, uint64_t); +#endif // WJR_X86_SIMD_SIMD_CAST_HPP__ - WJR_INTRINSIC_INLINE static __m128i setmin_epi8(); - WJR_INTRINSIC_INLINE static __m128i setmin_epi16(); - WJR_INTRINSIC_INLINE static __m128i setmin_epi32(); +#include - WJR_INTRINSIC_INLINE static __m128i setmin(int8_t); - WJR_INTRINSIC_INLINE static __m128i setmin(int16_t); - WJR_INTRINSIC_INLINE static __m128i setmin(int32_t); - WJR_INTRINSIC_INLINE static __m128i setmin(uint8_t); - WJR_INTRINSIC_INLINE static __m128i setmin(uint16_t); - WJR_INTRINSIC_INLINE static __m128i setmin(uint32_t); +// Already included +// Already included - WJR_INTRINSIC_INLINE static __m128i setmax_epi8(); - WJR_INTRINSIC_INLINE static __m128i setmax_epi16(); - WJR_INTRINSIC_INLINE static __m128i setmax_epi32(); +namespace wjr { - WJR_INTRINSIC_INLINE static __m128i setmax(int8_t); - WJR_INTRINSIC_INLINE static __m128i setmax(int16_t); - WJR_INTRINSIC_INLINE static __m128i setmax(int32_t); - WJR_INTRINSIC_INLINE static __m128i setmax(uint8_t); - WJR_INTRINSIC_INLINE static __m128i setmax(uint16_t); - WJR_INTRINSIC_INLINE static __m128i setmax(uint32_t); +struct sse { + using mask_type = uint16_t; - template - WJR_INTRINSIC_INLINE static __m128i shl(__m128i a); +#if WJR_HAS_SIMD(SSE) - template - WJR_INTRINSIC_INLINE static __m128i shr(__m128i b); + using float_type = __m128; + using float_tag_type = __m128_t; - template - WJR_INTRINSIC_INLINE 
static __m128i shuffle_epi32(__m128i v); +#endif // SSE - template - WJR_INTRINSIC_INLINE static __m128i shufflehi_epi16(__m128i v); +#if WJR_HAS_SIMD(SSE2) + + using int_type = __m128i; + using int_tag_type = __m128i_t; + using double_type = __m128d; + using double_tag_type = __m128d_t; + +#endif // SSE2 + + constexpr static size_t width(); + constexpr static mask_type mask(); + +#if WJR_HAS_SIMD(SSE) + + WJR_INTRINSIC_INLINE static mask_type movemask_ps(__m128 v); + WJR_INTRINSIC_INLINE static void sfence(); + + template + WJR_INTRINSIC_INLINE static __m128 shuffle_ps(__m128 a, __m128 b); + +#endif // SSE + +#if WJR_HAS_SIMD(SSE2) + + WJR_INTRINSIC_INLINE static __m128i add_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i add_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i add_epi32(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i add_epi64(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, int64_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint32_t); + WJR_INTRINSIC_INLINE static __m128i add(__m128i a, __m128i b, uint64_t); + + WJR_INTRINSIC_INLINE static int8_t add_epi8(__m128i a); + WJR_INTRINSIC_INLINE static int16_t add_epi16(__m128i a); + WJR_INTRINSIC_INLINE static int32_t add_epi32(__m128i a); + WJR_INTRINSIC_INLINE static int64_t add_epi64(__m128i a); + + WJR_INTRINSIC_INLINE static uint8_t add_epu8(__m128i a); + WJR_INTRINSIC_INLINE static uint16_t add_epu16(__m128i a); + WJR_INTRINSIC_INLINE static uint32_t add_epu32(__m128i a); + WJR_INTRINSIC_INLINE static uint64_t add_epu64(__m128i a); + + WJR_INTRINSIC_INLINE static int8_t add(__m128i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t add(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t add(__m128i a, int32_t); + WJR_INTRINSIC_INLINE static int64_t add(__m128i a, int64_t); + WJR_INTRINSIC_INLINE static uint8_t add(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t add(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t add(__m128i a, uint32_t); + WJR_INTRINSIC_INLINE static uint64_t add(__m128i a, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i adds_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i adds_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i adds_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i adds_epu16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i adds(__m128i a, __m128i b, uint16_t); + + template + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i alignr_epi16(__m128i a, __m128i b, int c); + WJR_INTRINSIC_INLINE static __m128i alignr_epi32(__m128i a, __m128i b, int c); + WJR_INTRINSIC_INLINE static __m128i alignr_epi64(__m128i a, __m128i b, int c); + + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int16_t); + WJR_INTRINSIC_INLINE static 
__m128i alignr(__m128i a, __m128i b, int c, int32_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, int64_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint16_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint32_t); + WJR_INTRINSIC_INLINE static __m128i alignr(__m128i a, __m128i b, int c, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i And(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i AndNot(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i avg_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i avg_epu16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i avg(__m128i a, __m128i b, uint16_t); + + // notice that mask must be 0 or 255(every byte) + WJR_INTRINSIC_INLINE static __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask); + WJR_INTRINSIC_INLINE static __m128i blendv_epi16(__m128i a, __m128i b, __m128i mask); + WJR_INTRINSIC_INLINE static __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask); + + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + int8_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + int16_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + int32_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + uint8_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + uint16_t); + WJR_INTRINSIC_INLINE static __m128i blendv(__m128i a, __m128i b, __m128i mask, + uint32_t); + + template + WJR_INTRINSIC_INLINE static __m128i bslli(__m128i val); + + template + WJR_INTRINSIC_INLINE static __m128i bsrli(__m128i val); + + WJR_INTRINSIC_INLINE static __m128i cmpeq_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpeq_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpeq_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpeq(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmpge_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpge_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpge_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpge(__m128i a, __m128i 
b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmpgt_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpgt_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpgt_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpgt(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmple_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmple_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmple_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmple(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmplt_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmplt_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmplt_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmplt(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i cmpne_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpne_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i cmpne_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i cmpne(__m128i a, __m128i b, uint32_t); + + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, 
__m128i b, std::not_equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::greater_equal<>, + T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less<>, T); + template + WJR_INTRINSIC_INLINE static __m128i cmp(__m128i a, __m128i b, std::less_equal<>, T); + + WJR_INTRINSIC_INLINE static __m128i concat(uint64_t lo, uint64_t hi); + + template + WJR_INTRINSIC_INLINE static int extract_epi8(__m128i a); + + template + WJR_INTRINSIC_INLINE static int extract_epi16(__m128i a); + template + WJR_INTRINSIC_INLINE static int extract_epi32(__m128i a); + template + WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m128i a); + + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, int8_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, int16_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, int32_t); + template + WJR_INTRINSIC_INLINE static int64_t extract(__m128i a, int64_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, uint8_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, uint16_t); + template + WJR_INTRINSIC_INLINE static int extract(__m128i a, uint32_t); + template + WJR_INTRINSIC_INLINE static int64_t extract(__m128i a, uint64_t); + + WJR_INTRINSIC_INLINE static uint64_t getlow(__m128i v); + WJR_INTRINSIC_INLINE static uint64_t gethigh(__m128i v); + + template + WJR_INTRINSIC_INLINE static __m128i insert_epi16(__m128i a, int i); + + template + WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, int16_t); + template + WJR_INTRINSIC_INLINE static __m128i insert(__m128i a, int i, uint16_t); + + WJR_INTRINSIC_INLINE static void lfence(); + + WJR_INTRINSIC_INLINE static __m128i load(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si16(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si32(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si64(const void *ptr); + + template )> + WJR_INTRINSIC_INLINE static __m128i logical_and(__m128i a, __m128i b, T); + + template )> + WJR_INTRINSIC_INLINE static __m128i logical_not(__m128i v, T); + + template )> + WJR_INTRINSIC_INLINE static __m128i logical_or(__m128i a, __m128i b, T); + + WJR_INTRINSIC_INLINE static __m128i madd_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static void maskmoveu(__m128i a, __m128i mask, char *mem_addr); + + WJR_INTRINSIC_INLINE static __m128i max_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i max_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i max_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i max(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static int8_t max_epi8(__m128i a); + WJR_INTRINSIC_INLINE static int16_t 
max_epi16(__m128i a); + WJR_INTRINSIC_INLINE static int32_t max_epi32(__m128i a); + + WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m128i a); + WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m128i a); + WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m128i a); + + WJR_INTRINSIC_INLINE static int8_t max(__m128i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t max(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t max(__m128i a, int32_t); + WJR_INTRINSIC_INLINE static uint8_t max(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t max(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t max(__m128i a, uint32_t); + + WJR_INTRINSIC_INLINE static void mfence(); + + WJR_INTRINSIC_INLINE static __m128i min_epi8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i min_epu8(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epu16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i min_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int8_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint8_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m128i min(__m128i a, __m128i b, uint32_t); + + WJR_INTRINSIC_INLINE static int8_t min_epi8(__m128i a); + WJR_INTRINSIC_INLINE static int16_t min_epi16(__m128i a); + WJR_INTRINSIC_INLINE static int32_t min_epi32(__m128i a); + + WJR_INTRINSIC_INLINE static uint8_t min_epu8(__m128i a); + WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m128i a); + WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m128i a); + + WJR_INTRINSIC_INLINE static int8_t min(__m128i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t min(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t min(__m128i a, int32_t); + + WJR_INTRINSIC_INLINE static uint8_t min(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t min(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t min(__m128i a, uint32_t); + + WJR_INTRINSIC_INLINE static __m128i move_epi64(__m128i a); + + WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m128i a); + WJR_INTRINSIC_INLINE static mask_type movemask_pd(__m128d v); + + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int8_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int32_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, int64_t); + + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint8_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint32_t); + WJR_INTRINSIC_INLINE static mask_type movemask(__m128i v, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i mul_epu32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i mulhi_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i mulhi_epu16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i mullo_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i negate_epi8(__m128i a); + WJR_INTRINSIC_INLINE static __m128i negate_epi16(__m128i a); + WJR_INTRINSIC_INLINE static __m128i negate_epi32(__m128i a); + WJR_INTRINSIC_INLINE static __m128i negate_epi64(__m128i a); + + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int8_t); 
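+    // negate dispatches on an element-type tag; SSE2 has no single "negate"
+    // intrinsic. A sketch of the usual technique (an illustration with a
+    // hypothetical helper, not necessarily exactly how this library
+    // implements it) subtracts the operand from zero:
+    //   __m128i neg32(__m128i a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }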
+ WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int16_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int32_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, int64_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint8_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint16_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint32_t); + WJR_INTRINSIC_INLINE static __m128i negate(__m128i a, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i Not(__m128i v); + + WJR_INTRINSIC_INLINE static __m128i Or(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i packs_epi16(__m128i a, __m128i b); + WJR_INTRINSIC_INLINE static __m128i packs_epi32(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i packus_epi16(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i loadu_si48(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si80(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si96(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si112(const void *ptr); + WJR_INTRINSIC_INLINE static __m128i loadu_si128(const void *ptr); + + WJR_INTRINSIC_INLINE static __m128i loadu_si16x(const void *ptr, int n); + + WJR_INTRINSIC_INLINE static __m128i sad_epu8(__m128i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m128i zeros(); + WJR_INTRINSIC_INLINE static __m128i ones(); + + WJR_INTRINSIC_INLINE static __m128i set_epi8(char e15, char e14, char e13, char e12, + char e11, char e10, char e9, char e8, + char e7, char e6, char e5, char e4, + char e3, char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m128i set_epi16(short e7, short e6, short e5, short e4, + short e3, short e2, short e1, short e0); + WJR_INTRINSIC_INLINE static __m128i set_epi32(int e3, int e2, int e1, int e0); + WJR_INTRINSIC_INLINE static __m128i set_epi64x(long long e1, long long e0); + + WJR_INTRINSIC_INLINE static __m128i setr_epi8(char e15, char e14, char e13, char e12, + char e11, char e10, char e9, char e8, + char e7, char e6, char e5, char e4, + char e3, char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m128i setr_epi16(short e7, short e6, short e5, short e4, + short e3, short e2, short e1, + short e0); + WJR_INTRINSIC_INLINE static __m128i setr_epi32(int e3, int e2, int e1, int e0); + + WJR_INTRINSIC_INLINE static __m128i set1_epi8(int8_t val); + WJR_INTRINSIC_INLINE static __m128i set1_epi16(int16_t val); + WJR_INTRINSIC_INLINE static __m128i set1_epi32(int32_t val); + WJR_INTRINSIC_INLINE static __m128i set1_epi64(int64_t val); + + WJR_INTRINSIC_INLINE static __m128i set1(int8_t val, int8_t); + WJR_INTRINSIC_INLINE static __m128i set1(int16_t val, int16_t); + WJR_INTRINSIC_INLINE static __m128i set1(int32_t val, int32_t); + WJR_INTRINSIC_INLINE static __m128i set1(int64_t val, int64_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint8_t val, uint8_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint16_t val, uint16_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint32_t val, uint32_t); + WJR_INTRINSIC_INLINE static __m128i set1(uint64_t val, uint64_t); + + WJR_INTRINSIC_INLINE static __m128i setmin_epi8(); + WJR_INTRINSIC_INLINE static __m128i setmin_epi16(); + WJR_INTRINSIC_INLINE static __m128i setmin_epi32(); + + WJR_INTRINSIC_INLINE static __m128i setmin(int8_t); + WJR_INTRINSIC_INLINE static __m128i setmin(int16_t); + WJR_INTRINSIC_INLINE static __m128i setmin(int32_t); + WJR_INTRINSIC_INLINE static __m128i setmin(uint8_t); + WJR_INTRINSIC_INLINE static __m128i setmin(uint16_t); + 
WJR_INTRINSIC_INLINE static __m128i setmin(uint32_t); + + WJR_INTRINSIC_INLINE static __m128i setmax_epi8(); + WJR_INTRINSIC_INLINE static __m128i setmax_epi16(); + WJR_INTRINSIC_INLINE static __m128i setmax_epi32(); + + WJR_INTRINSIC_INLINE static __m128i setmax(int8_t); + WJR_INTRINSIC_INLINE static __m128i setmax(int16_t); + WJR_INTRINSIC_INLINE static __m128i setmax(int32_t); + WJR_INTRINSIC_INLINE static __m128i setmax(uint8_t); + WJR_INTRINSIC_INLINE static __m128i setmax(uint16_t); + WJR_INTRINSIC_INLINE static __m128i setmax(uint32_t); + + template + WJR_INTRINSIC_INLINE static __m128i shl(__m128i a); + + template + WJR_INTRINSIC_INLINE static __m128i shr(__m128i b); + + template + WJR_INTRINSIC_INLINE static __m128i shuffle_epi32(__m128i v); + + template + WJR_INTRINSIC_INLINE static __m128i shufflehi_epi16(__m128i v); template WJR_INTRINSIC_INLINE static __m128i shufflelo_epi16(__m128i v); @@ -4680,2009 +5373,2015 @@ struct sse { #endif // SSE4_1 }; -struct avx { - using mask_type = uint32_t; +namespace sse_detail { +#if WJR_HAS_SIMD(SSE2) -#if WJR_HAS_SIMD(AVX) +const static __m128i srli_epi8_mask[8] = { + sse::set1_epi16(0xFFFF), sse::set1_epi16(0x7F7F), sse::set1_epi16(0x3F3F), + sse::set1_epi16(0x1F1F), sse::set1_epi16(0xF0F), sse::set1_epi16(0x707), + sse::set1_epi16(0x303), sse::set1_epi16(0x101), +}; - using float_type = __m256; - using float_tag_type = __m256_t; - using int_type = __m256i; - using int_tag_type = __m256i_t; - using double_type = __m256d; - using double_tag_type = __m256d_t; +#endif +} // namespace sse_detail -#endif // AVX +#if WJR_HAS_SIMD(SSE2) - constexpr static size_t width(); - constexpr static mask_type mask(); +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { + return _mm_set1_epi8(v); + } +}; -#if WJR_HAS_SIMD(AVX) +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { + return _mm_set1_epi16(v); + } +}; - WJR_INTRINSIC_INLINE static __m256i concat(__m128i a, __m128i b); +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { + return _mm_set1_epi32(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract_epi32(__m256i v); - template - WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m256i v); +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { + return _mm_set1_epi64x(v); + } +}; - template - WJR_INTRINSIC_INLINE static int extract(__m256i v, int32_t); - template - WJR_INTRINSIC_INLINE static int64_t extract(__m256i v, int64_t); +template <> +struct broadcast_fn<__m128i_t, __m128i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128i v) const { return v; } +}; - template - WJR_INTRINSIC_INLINE static __m128i extract_si128(__m256i v); +#endif // SSE2 - WJR_INTRINSIC_INLINE static __m128i getlow(__m256i a); +/*------------------------sse------------------------*/ - WJR_INTRINSIC_INLINE static __m128i gethigh(__m256i a); +constexpr size_t sse::width() { return 128; } - template - WJR_INTRINSIC_INLINE static __m256i insert_epi8(__m256i v, int8_t i); - template - WJR_INTRINSIC_INLINE static __m256i insert_epi16(__m256i v, int16_t i); - template - WJR_INTRINSIC_INLINE static __m256i insert_epi32(__m256i v, int32_t i); - template - WJR_INTRINSIC_INLINE static __m256i insert_epi64(__m256i v, int64_t i); +constexpr sse::mask_type sse::mask() { return 0xFFFF; } - template - WJR_INTRINSIC_INLINE static __m256i 
insert_si128(__m256i a, __m128i b); +#if WJR_HAS_SIMD(SSE) - WJR_INTRINSIC_INLINE static __m256i load(const void *p); - WJR_INTRINSIC_INLINE static __m256i loadu(const void *p); +sse::mask_type sse::movemask_ps(__m128 v) { + return static_cast(_mm_movemask_ps(v)); +} - WJR_INTRINSIC_INLINE static __m256i ones(); +void sse::sfence() { return _mm_sfence(); } - WJR_INTRINSIC_INLINE static __m256i loadu_si16(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si32(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si48(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si64(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si80(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si96(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si112(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si128(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si144(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si160(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si176(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si192(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si208(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si224(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si240(const void *ptr); - WJR_INTRINSIC_INLINE static __m256i loadu_si256(const void *ptr); +template +__m128 sse::shuffle_ps(__m128 a, __m128 b) { + static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in range [0, 255]"); + return _mm_shuffle_ps(a, b, imm8); +} - WJR_INTRINSIC_INLINE static __m256i loadu_si16x(const void *ptr, int n); +#endif // SSE - WJR_INTRINSIC_INLINE static __m256i - set_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, - char e24, char e23, char e22, char e21, char e20, char e19, char e18, - char e17, char e16, char e15, char e14, char e13, char e12, char e11, - char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, - char e2, char e1, char e0); +#if WJR_HAS_SIMD(SSE2) - WJR_INTRINSIC_INLINE static __m256i set_epi16(short e15, short e14, short e13, - short e12, short e11, short e10, - short e9, short e8, short e7, short e6, - short e5, short e4, short e3, short e2, - short e1, short e0); +__m128i sse::add_epi8(__m128i a, __m128i b) { return _mm_add_epi8(a, b); } +__m128i sse::add_epi16(__m128i a, __m128i b) { return _mm_add_epi16(a, b); } +__m128i sse::add_epi32(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } +__m128i sse::add_epi64(__m128i a, __m128i b) { return _mm_add_epi64(a, b); } - WJR_INTRINSIC_INLINE static __m256i set_epi32(int e7, int e6, int e5, int e4, int e3, - int e2, int e1, int e0); +__m128i sse::add(__m128i a, __m128i b, int8_t) { return add_epi8(a, b); } +__m128i sse::add(__m128i a, __m128i b, int16_t) { return add_epi16(a, b); } +__m128i sse::add(__m128i a, __m128i b, int32_t) { return add_epi32(a, b); } +__m128i sse::add(__m128i a, __m128i b, int64_t) { return add_epi64(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint8_t) { return add_epi8(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint16_t) { return add_epi16(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint32_t) { return add_epi32(a, b); } +__m128i sse::add(__m128i a, __m128i b, uint64_t) { return add_epi64(a, b); } - WJR_INTRINSIC_INLINE static __m256i set_epi64x(long long e3, long long e2, - long long e1, long long e0); +int8_t sse::add_epi8(__m128i a) { return static_cast(add_epu8(a)); } +int16_t 
sse::add_epi16(__m128i a) { return static_cast(add_epu16(a)); } +int32_t sse::add_epi32(__m128i a) { return static_cast(add_epu32(a)); } +int64_t sse::add_epi64(__m128i a) { return static_cast(add_epu64(a)); } - WJR_INTRINSIC_INLINE static __m256i - setr_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, - char e24, char e23, char e22, char e21, char e20, char e19, char e18, - char e17, char e16, char e15, char e14, char e13, char e12, char e11, - char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, - char e2, char e1, char e0); +uint8_t sse::add_epu8(__m128i a) { + auto b = shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a); + a = add(a, b, uint8_t()); + b = zeros(); + a = sad_epu8(a, b); + return simd_cast<__m128i_t, uint8_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i setr_epi16(short e15, short e14, short e13, - short e12, short e11, short e10, - short e9, short e8, short e7, short e6, - short e5, short e4, short e3, short e2, - short e1, short e0); +uint16_t sse::add_epu16(__m128i a) { + a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint16_t()); + a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint16_t()); + a = add(a, srli<2>(a), uint16_t()); + return simd_cast<__m128i_t, uint16_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i setr_epi32(int e7, int e6, int e5, int e4, int e3, - int e2, int e1, int e0); +uint32_t sse::add_epu32(__m128i a) { + a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint32_t()); + a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint32_t()); + return simd_cast<__m128i_t, uint32_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i setr_epi64x(long long e3, long long e2, - long long e1, long long e0); +uint64_t sse::add_epu64(__m128i a) { + a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint64_t()); + return simd_cast<__m128i_t, uint64_t>(a); +} - WJR_INTRINSIC_INLINE static __m256i set1_epi8(int8_t a); - WJR_INTRINSIC_INLINE static __m256i set1_epi16(int16_t a); - WJR_INTRINSIC_INLINE static __m256i set1_epi32(int32_t a); - WJR_INTRINSIC_INLINE static __m256i set1_epi64(int64_t a); +int8_t sse::add(__m128i a, int8_t) { return add_epi8(a); } +int16_t sse::add(__m128i a, int16_t) { return add_epi16(a); } +int32_t sse::add(__m128i a, int32_t) { return add_epi32(a); } +int64_t sse::add(__m128i a, int64_t) { return add_epi64(a); } +uint8_t sse::add(__m128i a, uint8_t) { return add_epu8(a); } +uint16_t sse::add(__m128i a, uint16_t) { return add_epu16(a); } +uint32_t sse::add(__m128i a, uint32_t) { return add_epu32(a); } +uint64_t sse::add(__m128i a, uint64_t) { return add_epu64(a); } - WJR_INTRINSIC_INLINE static __m256i set1(int8_t a, int8_t); - WJR_INTRINSIC_INLINE static __m256i set1(int16_t a, int16_t); - WJR_INTRINSIC_INLINE static __m256i set1(int32_t a, int32_t); - WJR_INTRINSIC_INLINE static __m256i set1(int64_t a, int64_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint8_t a, uint8_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint16_t a, uint16_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint32_t a, uint32_t); - WJR_INTRINSIC_INLINE static __m256i set1(uint64_t a, uint64_t); +__m128i sse::adds_epi8(__m128i a, __m128i b) { return _mm_adds_epi8(a, b); } +__m128i sse::adds_epi16(__m128i a, __m128i b) { return _mm_adds_epi16(a, b); } - WJR_INTRINSIC_INLINE static __m256i setmin_epi8(); - WJR_INTRINSIC_INLINE static __m256i setmin_epi16(); - WJR_INTRINSIC_INLINE static __m256i setmin_epi32(); - WJR_INTRINSIC_INLINE static __m256i setmin_epi64(); +__m128i sse::adds_epu8(__m128i a, __m128i b) { 
return _mm_adds_epu8(a, b); }
+__m128i sse::adds_epu16(__m128i a, __m128i b) { return _mm_adds_epu16(a, b); }
-    WJR_INTRINSIC_INLINE static __m256i setmin(int8_t);
-    WJR_INTRINSIC_INLINE static __m256i setmin(int16_t);
-    WJR_INTRINSIC_INLINE static __m256i setmin(int32_t);
-    WJR_INTRINSIC_INLINE static __m256i setmin(int64_t);
+__m128i sse::adds(__m128i a, __m128i b, int8_t) { return adds_epi8(a, b); }
+__m128i sse::adds(__m128i a, __m128i b, int16_t) { return adds_epi16(a, b); }
+__m128i sse::adds(__m128i a, __m128i b, uint8_t) { return adds_epu8(a, b); }
+__m128i sse::adds(__m128i a, __m128i b, uint16_t) { return adds_epu16(a, b); }
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi8();
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi16();
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi32();
-    WJR_INTRINSIC_INLINE static __m256i setmax_epi64();
+template <int imm8>
+__m128i sse::alignr(__m128i a, __m128i b) {
+    constexpr int s = imm8 & 0x1F;
+#if WJR_HAS_SIMD(SSSE3)
+    return _mm_alignr_epi8(a, b, s);
+#else
+    if constexpr (s == 0) {
+        return b;
+    }
+    if constexpr (s == 16) {
+        return a;
+    }
+    if constexpr (s < 16) {
+        return Or(slli<16 - s>(a), srli<s>(b));
+    }
+    return srli<s - 16>(a);
+#endif // SSSE3
+}
-    WJR_INTRINSIC_INLINE static __m256i setmax(int8_t);
-    WJR_INTRINSIC_INLINE static __m256i setmax(int16_t);
-    WJR_INTRINSIC_INLINE static __m256i setmax(int32_t);
-    WJR_INTRINSIC_INLINE static __m256i setmax(int64_t);
+__m128i sse::alignr_epi16(__m128i a, __m128i b, int c) {
+    return Or(slli(a, 16 - c, uint16_t()), srli(b, c, uint16_t()));
+}
-    WJR_INTRINSIC_INLINE static void stream(__m256i *p, __m256i a);
+__m128i sse::alignr_epi32(__m128i a, __m128i b, int c) {
+    return Or(slli(a, 32 - c, uint32_t()), srli(b, c, uint32_t()));
+}
-    WJR_INTRINSIC_INLINE static void store(void *p, __m256i a);
-    WJR_INTRINSIC_INLINE static void storeu(void *p, __m256i a);
+__m128i sse::alignr_epi64(__m128i a, __m128i b, int c) {
+    return Or(slli(a, 64 - c, uint64_t()), srli(b, c, uint64_t()));
+}
-    WJR_INTRINSIC_INLINE static int test_all_zeros(__m256i a);
+__m128i sse::alignr(__m128i a, __m128i b, int c, int16_t) {
+    return alignr_epi16(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, int32_t) {
+    return alignr_epi32(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, int64_t) {
+    return alignr_epi64(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, uint16_t) {
+    return alignr_epi16(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, uint32_t) {
+    return alignr_epi32(a, b, c);
+}
+__m128i sse::alignr(__m128i a, __m128i b, int c, uint64_t) {
+    return alignr_epi64(a, b, c);
+}
-    WJR_INTRINSIC_INLINE static int testc(__m256i a, __m256i b);
+__m128i sse::And(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
-    WJR_INTRINSIC_INLINE static int testnzc(__m256i a, __m256i b);
+__m128i sse::AndNot(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
-    WJR_INTRINSIC_INLINE static int testz(__m256i a, __m256i b);
+__m128i sse::avg_epu8(__m128i a, __m128i b) { return _mm_avg_epu8(a, b); }
+__m128i sse::avg_epu16(__m128i a, __m128i b) { return _mm_avg_epu16(a, b); }
-    WJR_INTRINSIC_INLINE static __m256i zeros();
+__m128i sse::avg(__m128i a, __m128i b, int8_t) { return avg_epu8(a, b); }
+__m128i sse::avg(__m128i a, __m128i b, int16_t) { return avg_epu16(a, b); }
+__m128i sse::avg(__m128i a, __m128i b, uint8_t) { return avg_epu8(a, b); }
+__m128i sse::avg(__m128i a, __m128i b, uint16_t) { return avg_epu16(a, b); }
-#endif // AVX
+// note: every byte of the mask must be 0 or 255
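+// In other words, every mask byte must be saturated (0x00 or 0xFF), exactly as
+// produced by the comparison wrappers above. Illustrative usage sketch:
+//   __m128i m = sse::cmpeq_epi8(x, y);     // 0xFF where x == y, 0x00 elsewhere
+//   __m128i r = sse::blendv_epi8(a, b, m); // per byte: m ? b : a
+// The non-SSE4.1 fallbacks below compute ((~mask) & a) | (mask & b), which
+// matches _mm_blendv_epi8 (a select on each mask byte's sign bit) only when
+// the mask is saturated.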
+__m128i sse::blendv_epi8(__m128i a, __m128i b, __m128i mask) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_blendv_epi8(a, b, mask); +#elif defined(WJR_COMPILER_GCC) + return ((~mask) & a) | (mask & b); +#else + return Or(AndNot(mask, a), And(mask, b)); +#endif +} -#if WJR_HAS_SIMD(AVX2) +__m128i sse::blendv_epi16(__m128i a, __m128i b, __m128i mask) { + return blendv_epi8(b, a, logical_not(mask, uint16_t())); +} - WJR_INTRINSIC_INLINE static __m256i And(__m256i a, __m256i b); +__m128i sse::blendv_epi32(__m128i a, __m128i b, __m128i mask) { + return blendv_epi8(b, a, logical_not(mask, uint32_t())); +} - WJR_INTRINSIC_INLINE static __m256i AndNot(__m256i a, __m256i b); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int8_t) { + return blendv_epi8(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i Or(__m256i a, __m256i b); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int16_t) { + return blendv_epi16(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i Xor(__m256i a, __m256i b); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int32_t) { + return blendv_epi32(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i Not(__m256i v); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint8_t) { + return blendv_epi8(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i abs_epi8(__m256i v); - WJR_INTRINSIC_INLINE static __m256i abs_epi16(__m256i v); - WJR_INTRINSIC_INLINE static __m256i abs_epi32(__m256i v); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint16_t) { + return blendv_epi16(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int8_t); - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int16_t); - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int32_t); - WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int64_t); +__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint32_t) { + return blendv_epi32(a, b, mask); +} - WJR_INTRINSIC_INLINE static __m256i add_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i add_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i add_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i add_epi64(__m256i a, __m256i b); +template +__m128i sse::bslli(__m128i val) { + return _mm_bslli_si128(val, imm8); +} - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint64_t); +template +__m128i sse::bsrli(__m128i val) { + return _mm_bsrli_si128(val, imm8); +} - WJR_INTRINSIC_INLINE static uint8_t add_epu8(__m256i v); - WJR_INTRINSIC_INLINE static uint16_t add_epu16(__m256i v); - WJR_INTRINSIC_INLINE static uint32_t add_epu32(__m256i v); - WJR_INTRINSIC_INLINE static uint64_t add_epu64(__m256i v); +__m128i sse::cmpeq_epi8(__m128i a, __m128i b) { return _mm_cmpeq_epi8(a, b); } +__m128i sse::cmpeq_epi16(__m128i a, __m128i b) { return _mm_cmpeq_epi16(a, b); } +__m128i sse::cmpeq_epi32(__m128i a, __m128i b) { return _mm_cmpeq_epi32(a, b); } - WJR_INTRINSIC_INLINE static int8_t add_epi8(__m256i v); - WJR_INTRINSIC_INLINE static int16_t 
add_epi16(__m256i v); - WJR_INTRINSIC_INLINE static int32_t add_epi32(__m256i v); - WJR_INTRINSIC_INLINE static int64_t add_epi64(__m256i v); +__m128i sse::cmpeq(__m128i a, __m128i b, int8_t) { return cmpeq_epi8(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, int16_t) { return cmpeq_epi16(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, int32_t) { return cmpeq_epi32(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint8_t) { return cmpeq_epi8(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint16_t) { return cmpeq_epi16(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint32_t) { return cmpeq_epi32(a, b); } - WJR_INTRINSIC_INLINE static int8_t add(__m256i v, int8_t); - WJR_INTRINSIC_INLINE static int16_t add(__m256i v, int16_t); - WJR_INTRINSIC_INLINE static int32_t add(__m256i v, int32_t); - WJR_INTRINSIC_INLINE static int64_t add(__m256i v, int64_t); - WJR_INTRINSIC_INLINE static uint8_t add(__m256i v, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t add(__m256i v, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t add(__m256i v, uint32_t); - WJR_INTRINSIC_INLINE static uint64_t add(__m256i v, uint64_t); +__m128i sse::cmpge_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epi8(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, int8_t()), b, uint8_t()); +#else + return Not(cmpgt(b, a, int8_t())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i adds_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i adds_epi16(__m256i a, __m256i b); +__m128i sse::cmpge_epi16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epi16(a, b); +#else + return cmpeq(min(a, b, int16_t()), b, uint16_t()); +#endif +} - WJR_INTRINSIC_INLINE static __m256i adds_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i adds_epu16(__m256i a, __m256i b); +__m128i sse::cmpge_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epi32(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, int32_t()), b, uint32_t()); +#else + return Not(cmpgt(b, a, int32_t())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint16_t); +__m128i sse::cmpge_epu8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epu8(a, b); +#else + return cmpeq(min(a, b, uint8_t()), b, uint8_t()); +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b); +__m128i sse::cmpge_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epu16(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, uint16_t()), b, uint16_t()); +#else + return logical_not(subs(b, a, uint16_t()), uint16_t()); +#endif +} - WJR_INTRINSIC_INLINE static __m256i alignr_epi16(__m256i a, __m256i b, int c); - WJR_INTRINSIC_INLINE static __m256i alignr_epi32(__m256i a, __m256i b, int c); - WJR_INTRINSIC_INLINE static __m256i alignr_epi64(__m256i a, __m256i b, int c); +__m128i sse::cmpge_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comge_epu32(a, b); +#elif WJR_HAS_SIMD(SSE4_1) + return cmpeq(min(a, b, uint32_t()), b, uint32_t()); +#else + return Not(cmpgt(b, a, uint32_t())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int16_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int32_t); - 
WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int64_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint16_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint32_t); - WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint64_t); +__m128i sse::cmpge(__m128i a, __m128i b, int8_t) { return cmpge_epi8(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, int16_t) { return cmpge_epi16(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, int32_t) { return cmpge_epi32(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, uint8_t) { return cmpge_epu8(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, uint16_t) { return cmpge_epu16(a, b); } +__m128i sse::cmpge(__m128i a, __m128i b, uint32_t) { return cmpge_epu32(a, b); } - WJR_INTRINSIC_INLINE static __m256i avg_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i avg_epu16(__m256i a, __m256i b); +__m128i sse::cmpgt_epi8(__m128i a, __m128i b) { return _mm_cmpgt_epi8(a, b); } +__m128i sse::cmpgt_epi16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(a, b); } +__m128i sse::cmpgt_epi32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(a, b); } - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint16_t); +__m128i sse::cmpgt_epu8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comgt_epu8(a, b); +#else + return cmpgt_epi8(Xor(a, setmin_epi8()), Xor(b, setmin_epi8())); +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i blend_epi16(__m256i a, __m256i b); - template - WJR_INTRINSIC_INLINE static __m256i blend_epi32(__m256i a, __m256i b); +__m128i sse::cmpgt_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comgt_epu16(a, b); +#else + return cmpgt_epi16(Xor(a, setmin_epi16()), Xor(b, setmin_epi16())); +#endif +} - WJR_INTRINSIC_INLINE static __m256i blendv_epi8(__m256i a, __m256i b, __m256i mask); +__m128i sse::cmpgt_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comgt_epu32(a, b); +#else + return cmpgt_epi32(Xor(a, setmin_epi32()), Xor(b, setmin_epi32())); +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i bslli_epi128(__m256i a); +__m128i sse::cmpgt(__m128i a, __m128i b, int8_t) { return cmpgt_epi8(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, int16_t) { return cmpgt_epi16(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, int32_t) { return cmpgt_epi32(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, uint8_t) { return cmpgt_epu8(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, uint16_t) { return cmpgt_epu16(a, b); } +__m128i sse::cmpgt(__m128i a, __m128i b, uint32_t) { return cmpgt_epu32(a, b); } - template - WJR_INTRINSIC_INLINE static __m256i bsrli_epi128(__m256i a); +__m128i sse::cmple_epi8(__m128i a, __m128i b) { return cmpge_epi8(b, a); } +__m128i sse::cmple_epi16(__m128i a, __m128i b) { return cmpge_epi16(b, a); } +__m128i sse::cmple_epi32(__m128i a, __m128i b) { return cmpge_epi32(b, a); } - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpeq_epi64(__m256i a, __m256i b); +__m128i sse::cmple_epu8(__m128i a, __m128i b) { return 
cmpge_epu8(b, a); } +__m128i sse::cmple_epu16(__m128i a, __m128i b) { return cmpge_epu16(b, a); } +__m128i sse::cmple_epu32(__m128i a, __m128i b) { return cmpge_epu32(b, a); } - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint64_t); +__m128i sse::cmple(__m128i a, __m128i b, int8_t) { return cmple_epi8(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, int16_t) { return cmple_epi16(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, int32_t) { return cmple_epi32(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, uint8_t) { return cmple_epu8(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, uint16_t) { return cmple_epu16(a, b); } +__m128i sse::cmple(__m128i a, __m128i b, uint32_t) { return cmple_epu32(a, b); } - WJR_INTRINSIC_INLINE static __m256i cmpge_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epi32(__m256i a, __m256i b); +__m128i sse::cmplt_epi8(__m128i a, __m128i b) { return _mm_cmplt_epi8(a, b); } +__m128i sse::cmplt_epi16(__m128i a, __m128i b) { return _mm_cmplt_epi16(a, b); } +__m128i sse::cmplt_epi32(__m128i a, __m128i b) { return _mm_cmplt_epi32(a, b); } - WJR_INTRINSIC_INLINE static __m256i cmpge_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpge_epu32(__m256i a, __m256i b); +__m128i sse::cmplt_epu8(__m128i a, __m128i b) { return cmpgt_epu8(b, a); } +__m128i sse::cmplt_epu16(__m128i a, __m128i b) { return cmpgt_epu16(b, a); } +__m128i sse::cmplt_epu32(__m128i a, __m128i b) { return cmpgt_epu32(b, a); } - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint32_t); +__m128i sse::cmplt(__m128i a, __m128i b, int8_t) { return cmplt_epi8(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, int16_t) { return cmplt_epi16(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, int32_t) { return cmplt_epi32(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, uint8_t) { return cmplt_epu8(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, uint16_t) { return cmplt_epu16(a, b); } +__m128i sse::cmplt(__m128i a, __m128i b, uint32_t) { return cmplt_epu32(a, b); } - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epi64(__m256i a, __m256i b); +__m128i sse::cmpne_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comneq_epi8(a, b); +#else + return Not(cmpeq_epi8(a, 
b)); +#endif +} - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpgt_epu64(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint64_t); - - WJR_INTRINSIC_INLINE static __m256i cmple_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmple_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmple_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static __m256i cmplt_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmplt_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmplt_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static __m256i cmpne_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpne_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i cmpne_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint32_t); - - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i 
a, __m256i b, std::not_equal_to<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater_equal<>, - T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less<>, T); - template - WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less_equal<>, T); - - template - WJR_INTRINSIC_INLINE static int extract_epi8(__m256i v); - template - WJR_INTRINSIC_INLINE static int extract_epi16(__m256i v); - - template - WJR_INTRINSIC_INLINE static int extract(__m256i v, int8_t); - template - WJR_INTRINSIC_INLINE static int extract(__m256i v, int16_t); - - WJR_INTRINSIC_INLINE static __m256i hadd_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i hadd_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int32_t); - - WJR_INTRINSIC_INLINE static __m256i hadds_epi16(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i hsub_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i hsub_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int32_t); - - WJR_INTRINSIC_INLINE static __m256i hsubs_epi16(__m256i a, __m256i b); - - template )> - WJR_INTRINSIC_INLINE static __m256i logical_and(__m256i a, __m256i b, T); - - template )> - WJR_INTRINSIC_INLINE static __m256i logical_not(__m256i v, T); - - template )> - WJR_INTRINSIC_INLINE static __m256i logical_or(__m256i a, __m256i b, T); - - WJR_INTRINSIC_INLINE static __m256i madd_epi16(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i max_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i max_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i max_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static int8_t max_epi8(__m256i a); - WJR_INTRINSIC_INLINE static int16_t max_epi16(__m256i a); - WJR_INTRINSIC_INLINE static int32_t max_epi32(__m256i a); - WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m256i a); - WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m256i a); - WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m256i a); - - WJR_INTRINSIC_INLINE static int8_t max(__m256i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t max(__m256i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t max(__m256i a, int32_t); - - WJR_INTRINSIC_INLINE static uint8_t max(__m256i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t max(__m256i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t max(__m256i a, uint32_t); - - WJR_INTRINSIC_INLINE static __m256i min_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epi16(__m256i a, 
__m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epi32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i min_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epu16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i min_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint32_t); - - WJR_INTRINSIC_INLINE static int8_t min_epi8(__m256i a); - WJR_INTRINSIC_INLINE static int16_t min_epi16(__m256i a); - WJR_INTRINSIC_INLINE static int32_t min_epi32(__m256i a); - - WJR_INTRINSIC_INLINE static uint8_t min_epu8(__m256i a); - WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m256i a); - WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m256i a); - - WJR_INTRINSIC_INLINE static int8_t min(__m256i a, int8_t); - WJR_INTRINSIC_INLINE static int16_t min(__m256i a, int16_t); - WJR_INTRINSIC_INLINE static int32_t min(__m256i a, int32_t); - WJR_INTRINSIC_INLINE static uint8_t min(__m256i a, uint8_t); - WJR_INTRINSIC_INLINE static uint16_t min(__m256i a, uint16_t); - WJR_INTRINSIC_INLINE static uint32_t min(__m256i a, uint32_t); - - WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m256i a); - - WJR_INTRINSIC_INLINE static __m256i mul_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i mul_epu32(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i mulhi_epi16(__m256i a, __m256i b); - - WJR_INTRINSIC_INLINE static __m256i mulhi_epu16(__m256i a, __m256i b); +__m128i sse::cmpne_epi16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comneq_epi16(a, b); +#else + return Not(cmpeq_epi16(a, b)); +#endif +} - WJR_INTRINSIC_INLINE static __m256i mullo_epi16(__m256i a, __m256i b); +__m128i sse::cmpne_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(XOP) + return _mm_comneq_epi32(a, b); +#else + return Not(cmpeq_epi32(a, b)); +#endif +} - WJR_INTRINSIC_INLINE static __m256i packs_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i packs_epi32(__m256i a, __m256i b); +__m128i sse::cmpne(__m128i a, __m128i b, int8_t) { return cmpne_epi8(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, int16_t) { return cmpne_epi16(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, int32_t) { return cmpne_epi32(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, uint8_t) { return cmpne_epi8(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, uint16_t) { return cmpne_epi16(a, b); } +__m128i sse::cmpne(__m128i a, __m128i b, uint32_t) { return cmpne_epi32(a, b); } - WJR_INTRINSIC_INLINE static __m256i packus_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i packus_epi32(__m256i a, __m256i b); +template +__m128i sse::cmp(__m128i a, __m128i b, std::equal_to<>, T) { + return cmpeq(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shl(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::not_equal_to<>, T) { + return cmpne(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shr(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::greater<>, T) { + return cmpgt(a, b, T()); +} - WJR_INTRINSIC_INLINE static __m256i shuffle_epi8(__m256i a, __m256i b); - template - 
WJR_INTRINSIC_INLINE static __m256i shuffle_epi32(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::greater_equal<>, T) { + return cmpge(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shufflehi_epi16(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::less<>, T) { + return cmplt(a, b, T()); +} - template - WJR_INTRINSIC_INLINE static __m256i shufflelo_epi16(__m256i a); +template +__m128i sse::cmp(__m128i a, __m128i b, std::less_equal<>, T) { + return cmple(a, b, T()); +} - WJR_INTRINSIC_INLINE static __m256i sll_epi16(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i sll_epi32(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i sll_epi64(__m256i a, __m128i b); +__m128i sse::concat(uint64_t lo, uint64_t hi) { return set_epi64x(hi, lo); } - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint64_t); +template +int sse::extract_epi8(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 16, "imm8 must be in range [0, 15]"); +#if WJR_HAS_SIMD(SSE4_1) + return _mm_extract_epi8(a, imm8); +#else + if constexpr (imm8 & 1) { + return extract_epi16<(imm8 >> 1)>(a) >> 8; + } else { + return extract_epi16<(imm8 >> 1)>(a) & 0xff; + } +#endif +} - template - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a); - WJR_INTRINSIC_INLINE static __m256i slli_epi16(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i slli_epi32(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i slli_epi64(__m256i a, int imm8); +template +int sse::extract_epi16(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 8, "imm8 must be in range [0, 7]"); + return _mm_extract_epi16(a, imm8); +} - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int16_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int32_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int64_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint16_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint32_t); - WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint64_t); +template +int sse::extract_epi32(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 4, "imm8 must be in range [0, 3]"); +#if WJR_HAS_SIMD(SSE4_1) + return _mm_extract_epi32(a, imm8); +#else + if constexpr (imm8 == 0) { + return simd_cast<__m128i_t, uint32_t>(a); + } else if constexpr (imm8 == 1) { + return static_cast(simd_cast<__m128i_t, uint64_t>(a) >> 32); + } else if constexpr (imm8 == 2) { + return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + } else { + return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 3, 3, 3)>(a)); + } +#endif +} - WJR_INTRINSIC_INLINE static __m256i sra_epi16(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i sra_epi32(__m256i a, __m128i b); +template +int64_t sse::extract_epi64(__m128i a) { + static_assert(imm8 >= 0 && imm8 < 2, "imm8 must be in range [0, 1]"); +#if WJR_HAS_SIMD(SSE4_1) + return _mm_extract_epi64(a, imm8); +#else + if constexpr (imm8 == 0) { + return simd_cast<__m128i_t, uint64_t>(a); + } else { + return simd_cast<__m128i_t, 
uint64_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + } +#endif +} - WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int32_t); +template +int sse::extract(__m128i a, int8_t) { + return extract_epi8(a); +} - WJR_INTRINSIC_INLINE static __m256i srai_epi16(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srai_epi32(__m256i a, int imm8); +template +int sse::extract(__m128i a, int16_t) { + return extract_epi16(a); +} - WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int16_t); - WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int32_t); +template +int sse::extract(__m128i a, int32_t) { + return extract_epi32(a); +} - WJR_INTRINSIC_INLINE static __m256i stream_load(const void *p); +template +int64_t sse::extract(__m128i a, int64_t) { + return extract_epi64(a); +} - WJR_INTRINSIC_INLINE static __m256i srl_epi16(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i srl_epi32(__m256i a, __m128i b); - WJR_INTRINSIC_INLINE static __m256i srl_epi64(__m256i a, __m128i b); +template +int sse::extract(__m128i a, uint8_t) { + return extract_epi8(a); +} - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint64_t); +template +int sse::extract(__m128i a, uint16_t) { + return extract_epi16(a); +} - template - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a); - WJR_INTRINSIC_INLINE static __m256i srli_epi8(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srli_epi16(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srli_epi32(__m256i a, int imm8); - WJR_INTRINSIC_INLINE static __m256i srli_epi64(__m256i a, int imm8); +template +int sse::extract(__m128i a, uint32_t) { + return extract_epi32(a); +} - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int8_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int16_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int32_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int64_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint8_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint16_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint32_t); - WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint64_t); +template +int64_t sse::extract(__m128i a, uint64_t) { + return extract_epi64(a); +} - WJR_INTRINSIC_INLINE static __m256i sub_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i sub_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i sub_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i sub_epi64(__m256i a, __m256i b); +uint64_t sse::getlow(__m128i v) { return simd_cast<__m128i_t, uint64_t>(v); } +uint64_t sse::gethigh(__m128i v) { return extract_epi64<1>(v); } - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int64_t); - 
WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint64_t); +template +__m128i sse::insert_epi16(__m128i a, int i) { + return _mm_insert_epi16(a, i, imm8); +} - WJR_INTRINSIC_INLINE static __m256i subs_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i subs_epi16(__m256i a, __m256i b); +template +__m128i sse::insert(__m128i a, int i, int16_t) { + return insert_epi16(a, i); +} - WJR_INTRINSIC_INLINE static __m256i subs_epu8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i subs_epu16(__m256i a, __m256i b); +template +__m128i sse::insert(__m128i a, int i, uint16_t) { + return insert_epi16(a, i); +} - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint16_t); +void sse::lfence() { _mm_lfence(); } - WJR_INTRINSIC_INLINE static int test_all_ones(__m256i a); +__m128i sse::load(const void *ptr) { + return _mm_load_si128(static_cast(ptr)); +} +__m128i sse::loadu(const void *ptr) { + return _mm_loadu_si128(static_cast(ptr)); +} +__m128i sse::loadu_si16(const void *ptr) { + return simd_cast(read_memory(ptr)); +} - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpackhi_epi64(__m256i a, __m256i b); +__m128i sse::loadu_si32(const void *ptr) { + return simd_cast(read_memory(ptr)); +} - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint16_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint32_t); - WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint64_t); +__m128i sse::loadu_si64(const void *ptr) { + return simd_cast(read_memory(ptr)); +} - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi8(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi16(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi32(__m256i a, __m256i b); - WJR_INTRINSIC_INLINE static __m256i unpacklo_epi64(__m256i a, __m256i b); +template )> +__m128i sse::logical_and(__m128i a, __m128i b, T) { + return Not(Or(logical_not(a, T()), logical_not(b, T()))); +} - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int8_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int16_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int32_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int64_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint8_t); - WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint16_t); - 
WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint32_t); +template )> +__m128i sse::logical_not(__m128i v, T) { + auto Zero = zeros(); + return cmpeq(v, Zero, T()); +} -#endif // AVX2 -}; +template )> +__m128i sse::logical_or(__m128i a, __m128i b, T) { + return Not(logical_not(Or(a, b), T())); +} -namespace sse_detail { -#if WJR_HAS_SIMD(SSE2) +__m128i sse::madd_epi16(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); } -const static __m128i srli_epi8_mask[8] = { - sse::set1_epi16(0xFFFF), sse::set1_epi16(0x7F7F), sse::set1_epi16(0x3F3F), - sse::set1_epi16(0x1F1F), sse::set1_epi16(0xF0F), sse::set1_epi16(0x707), - sse::set1_epi16(0x303), sse::set1_epi16(0x101), -}; +void sse::maskmoveu(__m128i a, __m128i mask, char *mem_addr) { + return _mm_maskmoveu_si128(a, mask, mem_addr); +} +__m128i sse::max_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epi8(a, b); +#else + return blendv_epi8(b, a, cmpgt_epi8(a, b)); #endif -} // namespace sse_detail - -namespace avx_detail { -#if WJR_HAS_SIMD(AVX2) +} -const static __m256i srli_epi8_mask[8] = { - avx::set1_epi16(0xFFFF), avx::set1_epi16(0x7F7F), avx::set1_epi16(0x3F3F), - avx::set1_epi16(0x1F1F), avx::set1_epi16(0xF0F), avx::set1_epi16(0x707), - avx::set1_epi16(0x303), avx::set1_epi16(0x101), -}; +__m128i sse::max_epi16(__m128i a, __m128i b) { return _mm_max_epi16(a, b); } +__m128i sse::max_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epi32(a, b); +#else + return blendv_epi8(b, a, cmpgt_epi32(a, b)); #endif -} // namespace avx_detail - -#if WJR_HAS_SIMD(SSE2) +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint8_t v) const { - return _mm_set1_epi8(v); - } -}; +__m128i sse::max_epu8(__m128i a, __m128i b) { return _mm_max_epu8(a, b); } -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint16_t v) const { - return _mm_set1_epi16(v); - } -}; +__m128i sse::max_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epu16(a, b); +#else + return add(subs_epu16(b, a), a, uint16_t()); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint32_t v) const { - return _mm_set1_epi32(v); - } -}; +__m128i sse::max_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_max_epu32(a, b); +#else + return blendv_epi8(b, a, cmpgt_epu32(a, b)); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(uint64_t v) const { - return _mm_set1_epi64x(v); - } -}; +__m128i sse::max(__m128i a, __m128i b, int8_t) { return max_epi8(a, b); } +__m128i sse::max(__m128i a, __m128i b, int16_t) { return max_epi16(a, b); } +__m128i sse::max(__m128i a, __m128i b, int32_t) { return max_epi32(a, b); } +__m128i sse::max(__m128i a, __m128i b, uint8_t) { return max_epu8(a, b); } +__m128i sse::max(__m128i a, __m128i b, uint16_t) { return max_epu16(a, b); } +__m128i sse::max(__m128i a, __m128i b, uint32_t) { return max_epu32(a, b); } -template <> -struct broadcast_fn<__m128i_t, __m128i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m128i operator()(__m128i v) const { return v; } -}; +int8_t sse::max_epi8(__m128i a) { return 0x7fu ^ min_epu8(Xor(a, set1_epi8(0x7fu))); } -#endif // SSE2 +int16_t sse::max_epi16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0x7fffu ^ min_epu16(Xor(a, set1_epi16(0x7fffu))); +#else + a = max_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 
2)>(a)); + a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + return simd_cast<__m128i_t, int16_t>(a); +#endif +} -#if WJR_HAS_SIMD(AVX) +int32_t sse::max_epi32(__m128i a) { + a = max_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + return simd_cast<__m128i_t, int32_t>(a); +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { - return _mm256_set1_epi8(v); - } -}; +uint8_t sse::max_epu8(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0xffu ^ min_epu8(Xor(a, ones())); +#else + a = max_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + auto X = simd_cast<__m128i_t, uint32_t>(a); + return std::max((uint8_t)X, (uint8_t)(X >> 8)); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint16_t v) const { - return _mm256_set1_epi16(v); - } -}; +uint16_t sse::max_epu16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0xffffu ^ min_epu16(Xor(a, ones())); +#else + a = max_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + return simd_cast<__m128i_t, uint16_t>(a); +#endif +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint32_t v) const { - return _mm256_set1_epi32(v); - } -}; +uint32_t sse::max_epu32(__m128i a) { + a = max_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = max_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + return simd_cast<__m128i_t, uint32_t>(a); +} -template <> -struct broadcast_fn { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint64_t v) const { - return _mm256_set1_epi64x(v); - } -}; +int8_t sse::max(__m128i a, int8_t) { return max_epi8(a); } +int16_t sse::max(__m128i a, int16_t) { return max_epi16(a); } +int32_t sse::max(__m128i a, int32_t) { return max_epi32(a); } +uint8_t sse::max(__m128i a, uint8_t) { return max_epu8(a); } +uint16_t sse::max(__m128i a, uint16_t) { return max_epu16(a); } +uint32_t sse::max(__m128i a, uint32_t) { return max_epu32(a); } -template <> -struct broadcast_fn<__m256i_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256i v) const { return v; } -}; +void sse::mfence() { _mm_mfence(); } -template <> -struct broadcast_fn<__m128i_t, __m256i_t> { - WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { -#if WJR_HAS_SIMD(AVX2) - return _mm256_broadcastsi128_si256(v); +__m128i sse::min_epi8(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epi8(a, b); #else - return _mm256_insertf128_si256(_mm256_castsi128_si256(v), v, 1); + return blendv_epi8(a, b, cmpgt_epi8(a, b)); #endif - } -}; - -#endif // AVX - -/*------------------------sse------------------------*/ +} -constexpr size_t sse::width() { return 128; } +__m128i sse::min_epi16(__m128i a, __m128i b) { return _mm_min_epi16(a, b); } -constexpr sse::mask_type sse::mask() { return 0xFFFF; } +__m128i sse::min_epi32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epi32(a, b); +#else + return blendv_epi8(a, b, cmpgt_epi32(a, b)); +#endif +} -#if WJR_HAS_SIMD(SSE) +__m128i sse::min_epu8(__m128i a, __m128i b) { return _mm_min_epu8(a, b); } -sse::mask_type sse::movemask_ps(__m128 v) { - return static_cast(_mm_movemask_ps(v)); +__m128i 
sse::min_epu16(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epu16(a, b); +#else + return blendv_epi8(a, b, cmpgt_epu16(a, b)); +#endif } -void sse::sfence() { return _mm_sfence(); } - -template -__m128 sse::shuffle_ps(__m128 a, __m128 b) { - static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in range [0, 255]"); - return _mm_shuffle_ps(a, b, imm8); +__m128i sse::min_epu32(__m128i a, __m128i b) { +#if WJR_HAS_SIMD(SSE4_1) + return _mm_min_epu32(a, b); +#else + return blendv_epi8(a, b, cmpgt_epu32(a, b)); +#endif } -#endif // SSE - -#if WJR_HAS_SIMD(SSE2) +__m128i sse::min(__m128i a, __m128i b, int8_t) { return min_epi8(a, b); } +__m128i sse::min(__m128i a, __m128i b, int16_t) { return min_epi16(a, b); } +__m128i sse::min(__m128i a, __m128i b, int32_t) { return min_epi32(a, b); } +__m128i sse::min(__m128i a, __m128i b, uint8_t) { return min_epu8(a, b); } +__m128i sse::min(__m128i a, __m128i b, uint16_t) { return min_epu16(a, b); } +__m128i sse::min(__m128i a, __m128i b, uint32_t) { return min_epu32(a, b); } -__m128i sse::add_epi8(__m128i a, __m128i b) { return _mm_add_epi8(a, b); } -__m128i sse::add_epi16(__m128i a, __m128i b) { return _mm_add_epi16(a, b); } -__m128i sse::add_epi32(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } -__m128i sse::add_epi64(__m128i a, __m128i b) { return _mm_add_epi64(a, b); } +int8_t sse::min_epi8(__m128i a) { return 0x80u ^ min_epu8(Xor(a, setmin_epi8())); } -__m128i sse::add(__m128i a, __m128i b, int8_t) { return add_epi8(a, b); } -__m128i sse::add(__m128i a, __m128i b, int16_t) { return add_epi16(a, b); } -__m128i sse::add(__m128i a, __m128i b, int32_t) { return add_epi32(a, b); } -__m128i sse::add(__m128i a, __m128i b, int64_t) { return add_epi64(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint8_t) { return add_epi8(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint16_t) { return add_epi16(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint32_t) { return add_epi32(a, b); } -__m128i sse::add(__m128i a, __m128i b, uint64_t) { return add_epi64(a, b); } +int16_t sse::min_epi16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return 0x8000u ^ min_epu16(Xor(a, setmin_epi16())); +#else + a = min_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + return simd_cast<__m128i_t, int16_t>(a); +#endif +} -int8_t sse::add_epi8(__m128i a) { return static_cast(add_epu8(a)); } -int16_t sse::add_epi16(__m128i a) { return static_cast(add_epu16(a)); } -int32_t sse::add_epi32(__m128i a) { return static_cast(add_epu32(a)); } -int64_t sse::add_epi64(__m128i a) { return static_cast(add_epu64(a)); } +int32_t sse::min_epi32(__m128i a) { + a = min_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + return simd_cast<__m128i_t, int32_t>(a); +} -uint8_t sse::add_epu8(__m128i a) { - auto b = shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a); - a = add(a, b, uint8_t()); - b = zeros(); - a = sad_epu8(a, b); +uint8_t sse::min_epu8(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + a = min_epu8(a, srli_epi16(a, 8)); + a = _mm_minpos_epu16(a); return simd_cast<__m128i_t, uint8_t>(a); +#else + a = min_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); + auto X = simd_cast<__m128i_t, uint32_t>(a); + return std::min((uint8_t)X, (uint8_t)(X >> 8)); +#endif } 
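The horizontal max/min reductions above lean on two complementary tricks: a shuffle tree that folds the upper half into the lower half until one lane remains, and, where SSE4.1 is available, a single order-reversing XOR so that the horizontal-minimum instruction behind minpos_epu16 can also answer signed-min/max and unsigned-max queries. A minimal scalar sketch of the XOR identity used by sse::max_epi8 (illustrative only, and exhaustively checkable):

    #include <cassert>
    #include <cstdint>

    // x <= y in signed 8-bit order iff (x ^ 0x7f) >= (y ^ 0x7f) in unsigned
    // order, so max_epi8(v) == 0x7f ^ min_epu8(v ^ 0x7f), byte by byte.
    int main() {
        for (int x = -128; x < 128; ++x) {
            for (int y = -128; y < 128; ++y) {
                const uint8_t ux = static_cast<uint8_t>(x) ^ 0x7fu;
                const uint8_t uy = static_cast<uint8_t>(y) ^ 0x7fu;
                assert((x <= y) == (ux >= uy));
            }
        }
    }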
-uint16_t sse::add_epu16(__m128i a) { - a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint16_t()); - a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint16_t()); - a = add(a, srli<2>(a), uint16_t()); +uint16_t sse::min_epu16(__m128i a) { +#if WJR_HAS_SIMD(SSE4_1) + return simd_cast<__m128i_t, uint16_t>(_mm_minpos_epu16(a)); +#else + a = min_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); + a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); return simd_cast<__m128i_t, uint16_t>(a); +#endif } -uint32_t sse::add_epu32(__m128i a) { - a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint32_t()); - a = add(a, shuffle_epi32<_MM_SHUFFLE(1, 1, 1, 1)>(a), uint32_t()); +uint32_t sse::min_epu32(__m128i a) { + a = min_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); + a = min_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); return simd_cast<__m128i_t, uint32_t>(a); } -uint64_t sse::add_epu64(__m128i a) { - a = add(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a), uint64_t()); - return simd_cast<__m128i_t, uint64_t>(a); +int8_t sse::min(__m128i a, int8_t) { return min_epi8(a); } +int16_t sse::min(__m128i a, int16_t) { return min_epi16(a); } +int32_t sse::min(__m128i a, int32_t) { return min_epi32(a); } +uint8_t sse::min(__m128i a, uint8_t) { return min_epu8(a); } +uint16_t sse::min(__m128i a, uint16_t) { return min_epu16(a); } +uint32_t sse::min(__m128i a, uint32_t) { return min_epu32(a); } + +__m128i sse::move_epi64(__m128i a) { return _mm_move_epi64(a); } + +sse::mask_type sse::movemask_epi8(__m128i a) { + return static_cast(_mm_movemask_epi8(a)); +} +sse::mask_type sse::movemask_pd(__m128d v) { + return static_cast(_mm_movemask_pd(v)); } -int8_t sse::add(__m128i a, int8_t) { return add_epi8(a); } -int16_t sse::add(__m128i a, int16_t) { return add_epi16(a); } -int32_t sse::add(__m128i a, int32_t) { return add_epi32(a); } -int64_t sse::add(__m128i a, int64_t) { return add_epi64(a); } -uint8_t sse::add(__m128i a, uint8_t) { return add_epu8(a); } -uint16_t sse::add(__m128i a, uint16_t) { return add_epu16(a); } -uint32_t sse::add(__m128i a, uint32_t) { return add_epu32(a); } -uint64_t sse::add(__m128i a, uint64_t) { return add_epu64(a); } +sse::mask_type sse::movemask(__m128i v, int8_t) { return movemask_epi8(v); } +sse::mask_type sse::movemask(__m128i v, int32_t) { + return movemask_ps(simd_cast<__m128i_t, __m128_t>(v)); +} +sse::mask_type sse::movemask(__m128i v, int64_t) { + return movemask_pd(simd_cast<__m128i_t, __m128d_t>(v)); +} +sse::mask_type sse::movemask(__m128i v, uint8_t) { return movemask(v, int8_t()); } +sse::mask_type sse::movemask(__m128i v, uint32_t) { return movemask(v, int32_t()); } +sse::mask_type sse::movemask(__m128i v, uint64_t) { return movemask(v, int64_t()); } -__m128i sse::adds_epi8(__m128i a, __m128i b) { return _mm_adds_epi8(a, b); } -__m128i sse::adds_epi16(__m128i a, __m128i b) { return _mm_adds_epi16(a, b); } +__m128i sse::mul_epu32(__m128i a, __m128i b) { return _mm_mul_epu32(a, b); } -__m128i sse::adds_epu8(__m128i a, __m128i b) { return _mm_adds_epu8(a, b); } -__m128i sse::adds_epu16(__m128i a, __m128i b) { return _mm_adds_epu16(a, b); } +__m128i sse::mulhi_epi16(__m128i a, __m128i b) { return _mm_mulhi_epi16(a, b); } -__m128i sse::adds(__m128i a, __m128i b, int8_t) { return adds_epi8(a, b); } -__m128i sse::adds(__m128i a, __m128i b, int16_t) { return adds_epi16(a, b); } -__m128i sse::adds(__m128i a, __m128i b, uint8_t) { return adds_epu8(a, b); } -__m128i 
sse::adds(__m128i a, __m128i b, uint16_t) { return adds_epu16(a, b); }
+__m128i sse::mulhi_epu16(__m128i a, __m128i b) { return _mm_mulhi_epu16(a, b); }

-template <int imm8>
-__m128i sse::alignr(__m128i a, __m128i b) {
-    constexpr int s = imm8 & 0x1F;
+__m128i sse::mullo_epi16(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); }
+
+__m128i sse::negate_epi8(__m128i a) {
 #if WJR_HAS_SIMD(SSSE3)
-    return _mm_alignr_epi8(a, b, s);
+    return sign_epi8(a, ones());
 #else
-    if constexpr (s == 0) {
-        return b;
-    }
-    if constexpr (s == 16) {
-        return a;
-    }
-    if constexpr (s < 16) {
-        return Or(slli<16 - s>(a), srli<s>(b));
-    }
-    return srli<s - 16>(a);
-#endif // SSSE3
+    return sub_epi8(zeros(), a);
+#endif
 }

-__m128i sse::alignr_epi16(__m128i a, __m128i b, int c) {
-    return Or(slli(a, 16 - c, uint16_t()), srli(b, c, uint16_t()));
+__m128i sse::negate_epi16(__m128i a) {
+#if WJR_HAS_SIMD(SSSE3)
+    return sign_epi16(a, ones());
+#else
+    return sub_epi16(zeros(), a);
+#endif
 }

-__m128i sse::alignr_epi32(__m128i a, __m128i b, int c) {
-    return Or(slli(a, 32 - c, uint32_t()), srli(b, c, uint32_t()));
+__m128i sse::negate_epi32(__m128i a) {
+#if WJR_HAS_SIMD(SSSE3)
+    return sign_epi32(a, ones());
+#else
+    return sub_epi32(zeros(), a);
+#endif
 }

-__m128i sse::alignr_epi64(__m128i a, __m128i b, int c) {
-    return Or(slli(a, 64 - c, uint64_t()), srli(b, c, uint64_t()));
-}
+__m128i sse::negate_epi64(__m128i a) { return sub_epi64(zeros(), a); }

-__m128i sse::alignr(__m128i a, __m128i b, int c, int16_t) {
-    return alignr_epi16(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, int32_t) {
-    return alignr_epi32(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, int64_t) {
-    return alignr_epi64(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, uint16_t) {
-    return alignr_epi16(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, uint32_t) {
-    return alignr_epi32(a, b, c);
-}
-__m128i sse::alignr(__m128i a, __m128i b, int c, uint64_t) {
-    return alignr_epi64(a, b, c);
-}
+__m128i sse::negate(__m128i a, int8_t) { return negate_epi8(a); }
+__m128i sse::negate(__m128i a, int16_t) { return negate_epi16(a); }
+__m128i sse::negate(__m128i a, int32_t) { return negate_epi32(a); }
+__m128i sse::negate(__m128i a, int64_t) { return negate_epi64(a); }
+__m128i sse::negate(__m128i a, uint8_t) { return negate_epi8(a); }
+__m128i sse::negate(__m128i a, uint16_t) { return negate_epi16(a); }
+__m128i sse::negate(__m128i a, uint32_t) { return negate_epi32(a); }
+__m128i sse::negate(__m128i a, uint64_t) { return negate_epi64(a); }

-__m128i sse::And(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
+__m128i sse::Not(__m128i v) { return Xor(v, ones()); }

-__m128i sse::AndNot(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
+__m128i sse::Or(__m128i a, __m128i b) { return _mm_or_si128(a, b); }

-__m128i sse::avg_epu8(__m128i a, __m128i b) { return _mm_avg_epu8(a, b); }
-__m128i sse::avg_epu16(__m128i a, __m128i b) { return _mm_avg_epu16(a, b); }
+__m128i sse::packs_epi16(__m128i a, __m128i b) { return _mm_packs_epi16(a, b); }
+__m128i sse::packs_epi32(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }

-__m128i sse::avg(__m128i a, __m128i b, int8_t) { return avg_epu8(a, b); }
-__m128i sse::avg(__m128i a, __m128i b, int16_t) { return avg_epu16(a, b); }
-__m128i sse::avg(__m128i a, __m128i b, uint8_t) { return avg_epu8(a, b); }
-__m128i sse::avg(__m128i a, __m128i b, uint16_t) { return avg_epu16(a, b); }
+__m128i sse::packus_epi16(__m128i a, __m128i b) { return _mm_packus_epi16(a, b); }

-// note that every byte of mask must be 0 or 255
-__m128i sse::blendv_epi8(__m128i a, __m128i b, __m128i mask) {
+__m128i sse::loadu_si48(const void *ptr) {
+    return insert_epi16<2>(loadu_si32(ptr), reinterpret_cast<const uint16_t *>(ptr)[2]);
+}
+
+__m128i sse::loadu_si80(const void *ptr) {
+    return insert_epi16<4>(loadu_si64(ptr), reinterpret_cast<const uint16_t *>(ptr)[4]);
+}
+
+__m128i sse::loadu_si96(const void *ptr) {
 #if WJR_HAS_SIMD(SSE4_1)
-    return _mm_blendv_epi8(a, b, mask);
-#elif defined(WJR_COMPILER_GCC)
-    return ((~mask) & a) | (mask & b);
+    return insert_epi32<2>(loadu_si64(ptr), reinterpret_cast<const uint32_t *>(ptr)[2]);
 #else
-    return Or(AndNot(mask, a), And(mask, b));
+    return insert_epi16<5>(loadu_si80(ptr), reinterpret_cast<const uint16_t *>(ptr)[5]);
 #endif
 }

-__m128i sse::blendv_epi16(__m128i a, __m128i b, __m128i mask) {
-    return blendv_epi8(b, a, logical_not(mask, uint16_t()));
+__m128i sse::loadu_si112(const void *ptr) {
+    return insert_epi16<6>(loadu_si96(ptr), reinterpret_cast<const uint16_t *>(ptr)[6]);
 }

-__m128i sse::blendv_epi32(__m128i a, __m128i b, __m128i mask) {
-    return blendv_epi8(b, a, logical_not(mask, uint32_t()));
+__m128i sse::loadu_si128(const void *ptr) { return loadu(ptr); }
+
+__m128i sse::loadu_si16x(const void *ptr, int n) {
+    switch (n) {
+    case 0:
+        return zeros();
+    case 1:
+        return loadu_si16(ptr);
+    case 2:
+        return loadu_si32(ptr);
+    case 3:
+        return loadu_si48(ptr);
+    case 4:
+        return loadu_si64(ptr);
+    case 5:
+        return loadu_si80(ptr);
+    case 6:
+        return loadu_si96(ptr);
+    case 7:
+        return loadu_si112(ptr);
+    default:
+        return loadu_si128(ptr);
+    }
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int8_t) {
-    return blendv_epi8(a, b, mask);
+__m128i sse::sad_epu8(__m128i a, __m128i b) { return _mm_sad_epu8(a, b); }
+
+__m128i sse::zeros() { return _mm_setzero_si128(); }
+__m128i sse::ones() { return _mm_set1_epi32(-1); }
+
+__m128i sse::set_epi8(char e15, char e14, char e13, char e12, char e11, char e10, char e9,
+                      char e8, char e7, char e6, char e5, char e4, char e3, char e2,
+                      char e1, char e0) {
+    return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1,
+                        e0);
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int16_t) {
-    return blendv_epi16(a, b, mask);
+__m128i sse::set_epi16(short e7, short e6, short e5, short e4, short e3, short e2,
+                       short e1, short e0) {
+    return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
+}
+__m128i sse::set_epi32(int e3, int e2, int e1, int e0) {
+    return _mm_set_epi32(e3, e2, e1, e0);
 }
+__m128i sse::set_epi64x(long long e1, long long e0) { return _mm_set_epi64x(e1, e0); }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, int32_t) {
-    return blendv_epi32(a, b, mask);
+__m128i sse::setr_epi8(char e15, char e14, char e13, char e12, char e11, char e10,
+                       char e9, char e8, char e7, char e6, char e5, char e4, char e3,
+                       char e2, char e1, char e0) {
+    return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1,
+                         e0);
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint8_t) {
-    return blendv_epi8(a, b, mask);
+__m128i sse::setr_epi16(short e7, short e6, short e5, short e4, short e3, short e2,
+                        short e1, short e0) {
+    return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
+}
+__m128i sse::setr_epi32(int e3, int e2, int e1, int e0) {
+    return _mm_setr_epi32(e3, e2, e1, e0);
 }

-__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint16_t) {
-    return blendv_epi16(a, b, mask);
+__m128i sse::set1_epi8(int8_t val) { return _mm_set1_epi8(val); }
+__m128i sse::set1_epi16(int16_t val) { return _mm_set1_epi16(val); }
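blendv_epi8 picks each byte from b where the corresponding mask byte is 0xff and from a where it is 0x00. The restriction noted above exists because the SSE2 fallback Or(AndNot(mask, a), And(mask, b)) consumes every bit of the mask, whereas _mm_blendv_epi8 inspects only bit 7 of each byte; the two agree exactly when every mask byte is all-zeros or all-ones. A per-byte scalar model of the fallback (a sketch for illustration, not part of the library):

    #include <cstdint>

    // Mirrors the SSE2 path of sse::blendv_epi8 for a single byte:
    // with mask == 0x00 it returns a, with mask == 0xff it returns b.
    uint8_t blendv_byte(uint8_t a, uint8_t b, uint8_t mask) {
        return static_cast<uint8_t>((~mask & a) | (mask & b));
    }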
+__m128i sse::set1_epi32(int32_t val) { return _mm_set1_epi32(val); } +__m128i sse::set1_epi64(int64_t val) { return _mm_set1_epi64x(val); } + +__m128i sse::set1(int8_t val, int8_t) { return set1_epi8(val); } +__m128i sse::set1(int16_t val, int16_t) { return set1_epi16(val); } +__m128i sse::set1(int32_t val, int32_t) { return set1_epi32(val); } +__m128i sse::set1(int64_t val, int64_t) { return set1_epi64(val); } +__m128i sse::set1(uint8_t val, uint8_t) { return set1_epi8(val); } +__m128i sse::set1(uint16_t val, uint16_t) { return set1_epi16(val); } +__m128i sse::set1(uint32_t val, uint32_t) { return set1_epi32(val); } +__m128i sse::set1(uint64_t val, uint64_t) { return set1_epi64(val); } + +__m128i sse::setmin_epi8() { return set1_epi8(0x80u); } +__m128i sse::setmin_epi16() { return set1_epi16(0x8000u); } +__m128i sse::setmin_epi32() { return set1_epi32(0x80000000u); } + +__m128i sse::setmin(int8_t) { return setmin_epi8(); } +__m128i sse::setmin(int16_t) { return setmin_epi16(); } +__m128i sse::setmin(int32_t) { return setmin_epi32(); } +__m128i sse::setmin(uint8_t) { return set1_epi32(0); } +__m128i sse::setmin(uint16_t) { return set1_epi32(0); } +__m128i sse::setmin(uint32_t) { return set1_epi32(0); } + +__m128i sse::setmax_epi8() { return set1_epi8(0x7F); } +__m128i sse::setmax_epi16() { return set1_epi16(0x7FFF); } +__m128i sse::setmax_epi32() { return set1_epi32(0x7FFFFFFF); } + +__m128i sse::setmax(int8_t) { return setmax_epi8(); } +__m128i sse::setmax(int16_t) { return setmax_epi16(); } +__m128i sse::setmax(int32_t) { return setmax_epi32(); } +__m128i sse::setmax(uint8_t) { return set1_epi32(0xFFFFFFFF); } +__m128i sse::setmax(uint16_t) { return set1_epi32(0xFFFFFFFF); } +__m128i sse::setmax(uint32_t) { return set1_epi32(0xFFFFFFFF); } + +template +__m128i sse::shl(__m128i a) { + if constexpr (imm >= 64) { + a = slli<8>(a); + a = slli_epi64(a, imm - 64); + return a; + } else { + auto b = slli_epi64(a, imm); + auto c = slli<8>(a); + c = srli_epi64(c, 64 - imm); + return Or(b, c); + } } -__m128i sse::blendv(__m128i a, __m128i b, __m128i mask, uint32_t) { - return blendv_epi32(a, b, mask); +template +__m128i sse::shr(__m128i a) { + if constexpr (imm >= 64) { + a = srli<8>(a); + a = srli_epi64(a, imm - 64); + return a; + } else { + auto b = srli_epi64(a, imm); + auto c = srli<8>(a); + c = slli_epi64(c, 64 - imm); + return Or(b, c); + } } template -__m128i sse::bslli(__m128i val) { - return _mm_bslli_si128(val, imm8); +__m128i sse::shuffle_epi32(__m128i v) { + static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in range [0, 255]"); + return _mm_shuffle_epi32(v, imm8); } template -__m128i sse::bsrli(__m128i val) { - return _mm_bsrli_si128(val, imm8); +__m128i sse::shufflehi_epi16(__m128i v) { + return _mm_shufflehi_epi16(v, imm8); } -__m128i sse::cmpeq_epi8(__m128i a, __m128i b) { return _mm_cmpeq_epi8(a, b); } -__m128i sse::cmpeq_epi16(__m128i a, __m128i b) { return _mm_cmpeq_epi16(a, b); } -__m128i sse::cmpeq_epi32(__m128i a, __m128i b) { return _mm_cmpeq_epi32(a, b); } +template +__m128i sse::shufflelo_epi16(__m128i v) { + return _mm_shufflelo_epi16(v, imm8); +} -__m128i sse::cmpeq(__m128i a, __m128i b, int8_t) { return cmpeq_epi8(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, int16_t) { return cmpeq_epi16(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, int32_t) { return cmpeq_epi32(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, uint8_t) { return cmpeq_epi8(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, uint16_t) { return cmpeq_epi16(a, b); } -__m128i 
sse::cmpeq(__m128i a, __m128i b, uint32_t) { return cmpeq_epi32(a, b); } +__m128i sse::sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } +__m128i sse::sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } +__m128i sse::sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } -__m128i sse::cmpge_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epi8(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, int8_t()), b, uint8_t()); -#else - return Not(cmpgt(b, a, int8_t())); -#endif -} +__m128i sse::sll(__m128i a, __m128i b, int16_t) { return sll_epi16(a, b); } +__m128i sse::sll(__m128i a, __m128i b, int32_t) { return sll_epi32(a, b); } +__m128i sse::sll(__m128i a, __m128i b, int64_t) { return sll_epi64(a, b); } +__m128i sse::sll(__m128i a, __m128i b, uint16_t) { return sll_epi16(a, b); } +__m128i sse::sll(__m128i a, __m128i b, uint32_t) { return sll_epi32(a, b); } +__m128i sse::sll(__m128i a, __m128i b, uint64_t) { return sll_epi64(a, b); } -__m128i sse::cmpge_epi16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epi16(a, b); -#else - return cmpeq(min(a, b, int16_t()), b, uint16_t()); -#endif +template +__m128i sse::slli(__m128i v) { + return _mm_slli_si128(v, imm8); } +__m128i sse::slli_epi16(__m128i a, int imm8) { + if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { + return sse::add_epi16(a, a); + } -__m128i sse::cmpge_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epi32(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, int32_t()), b, uint32_t()); -#else - return Not(cmpgt(b, a, int32_t())); -#endif + return _mm_slli_epi16(a, imm8); } +__m128i sse::slli_epi32(__m128i a, int imm8) { + if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { + return sse::add_epi32(a, a); + } -__m128i sse::cmpge_epu8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epu8(a, b); -#else - return cmpeq(min(a, b, uint8_t()), b, uint8_t()); -#endif + return _mm_slli_epi32(a, imm8); } +__m128i sse::slli_epi64(__m128i a, int imm8) { + if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { + return sse::add_epi64(a, a); + } -__m128i sse::cmpge_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epu16(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, uint16_t()), b, uint16_t()); -#else - return logical_not(subs(b, a, uint16_t()), uint16_t()); -#endif + return _mm_slli_epi64(a, imm8); } -__m128i sse::cmpge_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comge_epu32(a, b); -#elif WJR_HAS_SIMD(SSE4_1) - return cmpeq(min(a, b, uint32_t()), b, uint32_t()); -#else - return Not(cmpgt(b, a, uint32_t())); -#endif +__m128i sse::slli(__m128i a, int imm8, int16_t) { return slli_epi16(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, int32_t) { return slli_epi32(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, int64_t) { return slli_epi64(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, uint16_t) { return slli_epi16(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, uint32_t) { return slli_epi32(a, imm8); } +__m128i sse::slli(__m128i a, int imm8, uint64_t) { return slli_epi64(a, imm8); } + +__m128i sse::sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } +__m128i sse::sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } + +__m128i sse::sra(__m128i a, __m128i b, int16_t) { return sra_epi16(a, b); } +__m128i sse::sra(__m128i a, __m128i b, int32_t) { return sra_epi32(a, b); } + +__m128i sse::srai_epi16(__m128i a, int imm8) { return _mm_srai_epi16(a, 
imm8); }
+__m128i sse::srai_epi32(__m128i a, int imm8) { return _mm_srai_epi32(a, imm8); }
+
+__m128i sse::srai(__m128i a, int imm8, int16_t) { return srai_epi16(a, imm8); }
+__m128i sse::srai(__m128i a, int imm8, int32_t) { return srai_epi32(a, imm8); }
+
+__m128i sse::srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
+__m128i sse::srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
+__m128i sse::srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
+
+__m128i sse::srl(__m128i a, __m128i b, int16_t) { return srl_epi16(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, int32_t) { return srl_epi32(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, int64_t) { return srl_epi64(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, uint16_t) { return srl_epi16(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, uint32_t) { return srl_epi32(a, b); }
+__m128i sse::srl(__m128i a, __m128i b, uint64_t) { return srl_epi64(a, b); }
+
+template <int imm8>
+__m128i sse::srli(__m128i v) {
+    return _mm_srli_si128(v, imm8);
+}
+__m128i sse::srli_epi8(__m128i a, int imm8) {
+    return And(srli_epi16(a, imm8), sse_detail::srli_epi8_mask[imm8]);
+}
+__m128i sse::srli_epi16(__m128i a, int imm8) { return _mm_srli_epi16(a, imm8); }
+__m128i sse::srli_epi32(__m128i a, int imm8) { return _mm_srli_epi32(a, imm8); }
+__m128i sse::srli_epi64(__m128i a, int imm8) { return _mm_srli_epi64(a, imm8); }
+
+__m128i sse::srli(__m128i a, int imm8, int8_t) { return srli_epi8(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, int16_t) { return srli_epi16(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, int32_t) { return srli_epi32(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, int64_t) { return srli_epi64(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint8_t) { return srli_epi8(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint16_t) { return srli_epi16(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint32_t) { return srli_epi32(a, imm8); }
+__m128i sse::srli(__m128i a, int imm8, uint64_t) { return srli_epi64(a, imm8); }
+
+void sse::stream(__m128i *ptr, __m128i v) { _mm_stream_si128(ptr, v); }
+
+void sse::store(void *ptr, __m128i val) {
+    _mm_store_si128(static_cast<__m128i *>(ptr), val);
+}
+void sse::storeu(void *ptr, __m128i val) {
+    _mm_storeu_si128(static_cast<__m128i *>(ptr), val);
+}
+
+__m128i sse::sub_epi8(__m128i a, __m128i b) { return _mm_sub_epi8(a, b); }
+__m128i sse::sub_epi16(__m128i a, __m128i b) {
return _mm_sub_epi16(a, b); } +__m128i sse::sub_epi32(__m128i a, __m128i b) { return _mm_sub_epi32(a, b); } +__m128i sse::sub_epi64(__m128i a, __m128i b) { return _mm_sub_epi64(a, b); } -__m128i sse::cmpgt_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comgt_epu32(a, b); -#else - return cmpgt_epi32(Xor(a, setmin_epi32()), Xor(b, setmin_epi32())); -#endif -} +__m128i sse::sub(__m128i a, __m128i b, int8_t) { return sub_epi8(a, b); } +__m128i sse::sub(__m128i a, __m128i b, int16_t) { return sub_epi16(a, b); } +__m128i sse::sub(__m128i a, __m128i b, int32_t) { return sub_epi32(a, b); } +__m128i sse::sub(__m128i a, __m128i b, int64_t) { return sub_epi64(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint8_t) { return sub_epi8(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint16_t) { return sub_epi16(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint32_t) { return sub_epi32(a, b); } +__m128i sse::sub(__m128i a, __m128i b, uint64_t) { return sub_epi64(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, int8_t) { return cmpgt_epi8(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, int16_t) { return cmpgt_epi16(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, int32_t) { return cmpgt_epi32(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, uint8_t) { return cmpgt_epu8(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, uint16_t) { return cmpgt_epu16(a, b); } -__m128i sse::cmpgt(__m128i a, __m128i b, uint32_t) { return cmpgt_epu32(a, b); } +__m128i sse::subs_epi8(__m128i a, __m128i b) { return _mm_subs_epi8(a, b); } +__m128i sse::subs_epi16(__m128i a, __m128i b) { return _mm_subs_epi16(a, b); } -__m128i sse::cmple_epi8(__m128i a, __m128i b) { return cmpge_epi8(b, a); } -__m128i sse::cmple_epi16(__m128i a, __m128i b) { return cmpge_epi16(b, a); } -__m128i sse::cmple_epi32(__m128i a, __m128i b) { return cmpge_epi32(b, a); } +__m128i sse::subs_epu8(__m128i a, __m128i b) { return _mm_subs_epu8(a, b); } +__m128i sse::subs_epu16(__m128i a, __m128i b) { return _mm_subs_epu16(a, b); } -__m128i sse::cmple_epu8(__m128i a, __m128i b) { return cmpge_epu8(b, a); } -__m128i sse::cmple_epu16(__m128i a, __m128i b) { return cmpge_epu16(b, a); } -__m128i sse::cmple_epu32(__m128i a, __m128i b) { return cmpge_epu32(b, a); } +__m128i sse::subs(__m128i a, __m128i b, int8_t) { return subs_epi8(a, b); } +__m128i sse::subs(__m128i a, __m128i b, int16_t) { return subs_epi16(a, b); } +__m128i sse::subs(__m128i a, __m128i b, uint8_t) { return subs_epu8(a, b); } +__m128i sse::subs(__m128i a, __m128i b, uint16_t) { return subs_epu16(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, int8_t) { return cmple_epi8(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, int16_t) { return cmple_epi16(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, int32_t) { return cmple_epi32(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, uint8_t) { return cmple_epu8(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, uint16_t) { return cmple_epu16(a, b); } -__m128i sse::cmple(__m128i a, __m128i b, uint32_t) { return cmple_epu32(a, b); } +__m128i sse::unpackhi_epi8(__m128i a, __m128i b) { return _mm_unpackhi_epi8(a, b); } +__m128i sse::unpackhi_epi16(__m128i a, __m128i b) { return _mm_unpackhi_epi16(a, b); } +__m128i sse::unpackhi_epi32(__m128i a, __m128i b) { return _mm_unpackhi_epi32(a, b); } +__m128i sse::unpackhi_epi64(__m128i a, __m128i b) { return _mm_unpackhi_epi64(a, b); } -__m128i sse::cmplt_epi8(__m128i a, __m128i b) { return _mm_cmplt_epi8(a, b); } -__m128i sse::cmplt_epi16(__m128i a, __m128i b) { return 
_mm_cmplt_epi16(a, b); } -__m128i sse::cmplt_epi32(__m128i a, __m128i b) { return _mm_cmplt_epi32(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int8_t) { return unpackhi_epi8(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int16_t) { return unpackhi_epi16(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int32_t) { return unpackhi_epi32(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, int64_t) { return unpackhi_epi64(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint8_t) { return unpackhi_epi8(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint16_t) { return unpackhi_epi16(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint32_t) { return unpackhi_epi32(a, b); } +__m128i sse::unpackhi(__m128i a, __m128i b, uint64_t) { return unpackhi_epi64(a, b); } -__m128i sse::cmplt_epu8(__m128i a, __m128i b) { return cmpgt_epu8(b, a); } -__m128i sse::cmplt_epu16(__m128i a, __m128i b) { return cmpgt_epu16(b, a); } -__m128i sse::cmplt_epu32(__m128i a, __m128i b) { return cmpgt_epu32(b, a); } +__m128i sse::unpacklo_epi8(__m128i a, __m128i b) { return _mm_unpacklo_epi8(a, b); } +__m128i sse::unpacklo_epi16(__m128i a, __m128i b) { return _mm_unpacklo_epi16(a, b); } +__m128i sse::unpacklo_epi32(__m128i a, __m128i b) { return _mm_unpacklo_epi32(a, b); } +__m128i sse::unpacklo_epi64(__m128i a, __m128i b) { return _mm_unpacklo_epi64(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, int8_t) { return cmplt_epi8(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, int16_t) { return cmplt_epi16(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, int32_t) { return cmplt_epi32(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, uint8_t) { return cmplt_epu8(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, uint16_t) { return cmplt_epu16(a, b); } -__m128i sse::cmplt(__m128i a, __m128i b, uint32_t) { return cmplt_epu32(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int8_t) { return unpacklo_epi8(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int16_t) { return unpacklo_epi16(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int32_t) { return unpacklo_epi32(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, int64_t) { return unpacklo_epi64(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, uint8_t) { return unpacklo_epi8(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, uint16_t) { return unpacklo_epi16(a, b); } +__m128i sse::unpacklo(__m128i a, __m128i b, uint32_t) { return unpacklo_epi32(a, b); } -__m128i sse::cmpne_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comneq_epi8(a, b); -#else - return Not(cmpeq_epi8(a, b)); -#endif -} +__m128i sse::Xor(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } -__m128i sse::cmpne_epi16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comneq_epi16(a, b); -#else - return Not(cmpeq_epi16(a, b)); #endif -} -__m128i sse::cmpne_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) - return _mm_comneq_epi32(a, b); -#else - return Not(cmpeq_epi32(a, b)); -#endif -} +#if WJR_HAS_SIMD(SSE3) -__m128i sse::cmpne(__m128i a, __m128i b, int8_t) { return cmpne_epi8(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, int16_t) { return cmpne_epi16(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, int32_t) { return cmpne_epi32(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, uint8_t) { return cmpne_epi8(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, uint16_t) { return cmpne_epi16(a, b); } -__m128i sse::cmpne(__m128i a, __m128i b, uint32_t) { return cmpne_epi32(a, b); } +__m128i sse::lddqu(const 
__m128i *ptr) { return _mm_lddqu_si128(ptr); } -template -__m128i sse::cmp(__m128i a, __m128i b, std::equal_to<>, T) { - return cmpeq(a, b, T()); -} +#endif -template -__m128i sse::cmp(__m128i a, __m128i b, std::not_equal_to<>, T) { - return cmpne(a, b, T()); -} +#if WJR_HAS_SIMD(SSSE3) -template -__m128i sse::cmp(__m128i a, __m128i b, std::greater<>, T) { - return cmpgt(a, b, T()); -} +__m128i sse::abs_epi8(__m128i val) { return _mm_abs_epi8(val); } +__m128i sse::abs_epi16(__m128i val) { return _mm_abs_epi16(val); } +__m128i sse::abs_epi32(__m128i val) { return _mm_abs_epi32(val); } -template -__m128i sse::cmp(__m128i a, __m128i b, std::greater_equal<>, T) { - return cmpge(a, b, T()); -} +__m128i sse::abs(__m128i val, int8_t) { return abs_epi8(val); } +__m128i sse::abs(__m128i val, int16_t) { return abs_epi16(val); } +__m128i sse::abs(__m128i val, int32_t) { return abs_epi32(val); } +__m128i sse::abs(__m128i val, uint8_t) { return val; } +__m128i sse::abs(__m128i val, uint16_t) { return val; } +__m128i sse::abs(__m128i val, uint32_t) { return val; } -template -__m128i sse::cmp(__m128i a, __m128i b, std::less<>, T) { - return cmplt(a, b, T()); -} +__m128i sse::shuffle_epi8(__m128i v, __m128i imm8) { return _mm_shuffle_epi8(v, imm8); } -template -__m128i sse::cmp(__m128i a, __m128i b, std::less_equal<>, T) { - return cmple(a, b, T()); -} +__m128i sse::sign_epi8(__m128i a, __m128i b) { return _mm_sign_epi8(a, b); } +__m128i sse::sign_epi16(__m128i a, __m128i b) { return _mm_sign_epi16(a, b); } +__m128i sse::sign_epi32(__m128i a, __m128i b) { return _mm_sign_epi32(a, b); } -__m128i sse::concat(uint64_t lo, uint64_t hi) { return set_epi64x(hi, lo); } +__m128i sse::sign(__m128i a, __m128i b, int8_t) { return sign_epi8(a, b); } +__m128i sse::sign(__m128i a, __m128i b, int16_t) { return sign_epi16(a, b); } +__m128i sse::sign(__m128i a, __m128i b, int32_t) { return sign_epi32(a, b); } +__m128i sse::sign(__m128i a, __m128i b, uint8_t) { return sign_epi8(a, b); } +__m128i sse::sign(__m128i a, __m128i b, uint16_t) { return sign_epi16(a, b); } +__m128i sse::sign(__m128i a, __m128i b, uint32_t) { return sign_epi32(a, b); } -template -int sse::extract_epi8(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 16, "imm8 must be in range [0, 15]"); -#if WJR_HAS_SIMD(SSE4_1) - return _mm_extract_epi8(a, imm8); -#else - if constexpr (imm8 & 1) { - return extract_epi16<(imm8 >> 1)>(a) >> 8; - } else { - return extract_epi16<(imm8 >> 1)>(a) & 0xff; - } #endif -} - -template -int sse::extract_epi16(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 8, "imm8 must be in range [0, 7]"); - return _mm_extract_epi16(a, imm8); -} -template -int sse::extract_epi32(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 4, "imm8 must be in range [0, 3]"); #if WJR_HAS_SIMD(SSE4_1) - return _mm_extract_epi32(a, imm8); -#else - if constexpr (imm8 == 0) { - return simd_cast<__m128i_t, uint32_t>(a); - } else if constexpr (imm8 == 1) { - return static_cast(simd_cast<__m128i_t, uint64_t>(a) >> 32); - } else if constexpr (imm8 == 2) { - return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - } else { - return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 3, 3, 3)>(a)); - } -#endif -} template -int64_t sse::extract_epi64(__m128i a) { - static_assert(imm8 >= 0 && imm8 < 2, "imm8 must be in range [0, 1]"); -#if WJR_HAS_SIMD(SSE4_1) - return _mm_extract_epi64(a, imm8); -#else - if constexpr (imm8 == 0) { - return simd_cast<__m128i_t, uint64_t>(a); - } else { - return simd_cast<__m128i_t, 
uint64_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - } -#endif +__m128i sse::blend_epi16(__m128i a, __m128i b) { + return _mm_blend_epi16(a, b, imm8); } -template -int sse::extract(__m128i a, int8_t) { - return extract_epi8(a); -} +__m128i sse::cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); } -template -int sse::extract(__m128i a, int16_t) { - return extract_epi16(a); -} +__m128i sse::cmpeq(__m128i a, __m128i b, int64_t) { return cmpeq_epi64(a, b); } +__m128i sse::cmpeq(__m128i a, __m128i b, uint64_t) { return cmpeq_epi64(a, b); } + +__m128i sse::cmpgt_epi64(__m128i a, __m128i b) { return _mm_cmpgt_epi64(a, b); } + +__m128i sse::cmpgt(__m128i a, __m128i b, int64_t) { return cmpgt_epi64(a, b); } template -int sse::extract(__m128i a, int32_t) { - return extract_epi32(a); +__m128i sse::insert_epi8(__m128i a, int i) { + return _mm_insert_epi8(a, i, imm8); } template -int64_t sse::extract(__m128i a, int64_t) { - return extract_epi64(a); +__m128i sse::insert_epi32(__m128i a, int i) { + return _mm_insert_epi32(a, i, imm8); } template -int sse::extract(__m128i a, uint8_t) { - return extract_epi8(a); +__m128i sse::insert_epi64(__m128i a, int64_t i) { + return _mm_insert_epi64(a, i, imm8); } template -int sse::extract(__m128i a, uint16_t) { - return extract_epi16(a); +__m128i sse::insert(__m128i a, int i, int8_t) { + return insert_epi8(a, i); } template -int sse::extract(__m128i a, uint32_t) { - return extract_epi32(a); +__m128i sse::insert(__m128i a, int i, int32_t) { + return insert_epi32(a, i); } template -int64_t sse::extract(__m128i a, uint64_t) { - return extract_epi64(a); +__m128i sse::insert(__m128i a, int64_t i, int64_t) { + return insert_epi64(a, i); } -uint64_t sse::getlow(__m128i v) { return simd_cast<__m128i_t, uint64_t>(v); } -uint64_t sse::gethigh(__m128i v) { return extract_epi64<1>(v); } - template -__m128i sse::insert_epi16(__m128i a, int i) { - return _mm_insert_epi16(a, i, imm8); +__m128i sse::insert(__m128i a, int i, uint8_t) { + return insert_epi8(a, i); } template -__m128i sse::insert(__m128i a, int i, int16_t) { - return insert_epi16(a, i); +__m128i sse::insert(__m128i a, int i, uint32_t) { + return insert_epi32(a, i); } template -__m128i sse::insert(__m128i a, int i, uint16_t) { - return insert_epi16(a, i); +__m128i sse::insert(__m128i a, int64_t i, uint64_t) { + return insert_epi64(a, i); } -void sse::lfence() { _mm_lfence(); } +__m128i sse::minpos_epu16(__m128i a) { return _mm_minpos_epu16(a); } -__m128i sse::load(const void *ptr) { - return _mm_load_si128(static_cast(ptr)); -} -__m128i sse::loadu(const void *ptr) { - return _mm_loadu_si128(static_cast(ptr)); -} -__m128i sse::loadu_si16(const void *ptr) { - return simd_cast(read_memory(ptr)); -} +__m128i sse::mul_epi32(__m128i a, __m128i b) { return _mm_mul_epi32(a, b); } -__m128i sse::loadu_si32(const void *ptr) { - return simd_cast(read_memory(ptr)); -} +__m128i sse::mullo_epi32(__m128i a, __m128i b) { return _mm_mullo_epi32(a, b); } -__m128i sse::loadu_si64(const void *ptr) { - return simd_cast(read_memory(ptr)); -} +__m128i sse::packus_epi32(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); } -template )> -__m128i sse::logical_and(__m128i a, __m128i b, T) { - return Not(Or(logical_not(a, T()), logical_not(b, T()))); +__m128i sse::stream_load(void *p) { + return _mm_stream_load_si128(static_cast<__m128i *>(p)); } -template )> -__m128i sse::logical_not(__m128i v, T) { - auto Zero = zeros(); - return cmpeq(v, Zero, T()); -} +int sse::test_all_ones(__m128i a) { return _mm_test_all_ones(a); } 
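The insert/extract wrappers above turn the immediate-operand intrinsics into ordinary function templates, and the trailing unused tag argument selects the element width so generic code never has to spell out an _epiN suffix. A hypothetical caller (the function name is invented for illustration, and SSE4.1 is assumed to be enabled with this header already included):

    #include <cstdint>

    // Overwrites 32-bit lane 2 of v with x. The uint32_t() argument is a
    // tag whose value is never read; it only selects the overload.
    __m128i set_lane2(__m128i v, uint32_t x) {
        return wjr::sse::insert<2>(v, static_cast<int>(x), uint32_t());
    }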
-template )> -__m128i sse::logical_or(__m128i a, __m128i b, T) { - return Not(logical_not(Or(a, b), T())); -} +int sse::test_all_zeros(__m128i a, __m128i b) { return _mm_test_all_zeros(a, b); } -__m128i sse::madd_epi16(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); } +int sse::test_all_zeros(__m128i a) { return _mm_test_all_zeros(a, a); } -void sse::maskmoveu(__m128i a, __m128i mask, char *mem_addr) { - return _mm_maskmoveu_si128(a, mask, mem_addr); +int sse::test_mix_ones_zeros(__m128i a, __m128i b) { + return _mm_test_mix_ones_zeros(a, b); } -__m128i sse::max_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epi8(a, b); -#else - return blendv_epi8(b, a, cmpgt_epi8(a, b)); -#endif -} +int sse::testc(__m128i a, __m128i b) { return _mm_testc_si128(a, b); } -__m128i sse::max_epi16(__m128i a, __m128i b) { return _mm_max_epi16(a, b); } +int sse::testnzc(__m128i a, __m128i b) { return _mm_testnzc_si128(a, b); } + +int sse::testz(__m128i a, __m128i b) { return _mm_testz_si128(a, b); } -__m128i sse::max_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epi32(a, b); -#else - return blendv_epi8(b, a, cmpgt_epi32(a, b)); #endif -} -__m128i sse::max_epu8(__m128i a, __m128i b) { return _mm_max_epu8(a, b); } +} // namespace wjr -__m128i sse::max_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epu16(a, b); -#else - return add(subs_epu16(b, a), a, uint16_t()); -#endif -} +#endif // WJR_X86_SIMD_SSE_HPP__ -__m128i sse::max_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_max_epu32(a, b); -#else - return blendv_epi8(b, a, cmpgt_epu32(a, b)); -#endif -} +namespace wjr { -__m128i sse::max(__m128i a, __m128i b, int8_t) { return max_epi8(a, b); } -__m128i sse::max(__m128i a, __m128i b, int16_t) { return max_epi16(a, b); } -__m128i sse::max(__m128i a, __m128i b, int32_t) { return max_epi32(a, b); } -__m128i sse::max(__m128i a, __m128i b, uint8_t) { return max_epu8(a, b); } -__m128i sse::max(__m128i a, __m128i b, uint16_t) { return max_epu16(a, b); } -__m128i sse::max(__m128i a, __m128i b, uint32_t) { return max_epu32(a, b); } +struct avx { + using mask_type = uint32_t; -int8_t sse::max_epi8(__m128i a) { return 0x7fu ^ min_epu8(Xor(a, set1_epi8(0x7fu))); } +#if WJR_HAS_SIMD(AVX) -int16_t sse::max_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0x7fffu ^ min_epu16(Xor(a, set1_epi16(0x7fffu))); -#else - a = max_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, int16_t>(a); -#endif -} + using float_type = __m256; + using float_tag_type = __m256_t; + using int_type = __m256i; + using int_tag_type = __m256i_t; + using double_type = __m256d; + using double_tag_type = __m256d_t; -int32_t sse::max_epi32(__m128i a) { - a = max_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, int32_t>(a); -} +#endif // AVX -uint8_t sse::max_epu8(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0xffu ^ min_epu8(Xor(a, ones())); -#else - a = max_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - auto X = simd_cast<__m128i_t, uint32_t>(a); - return std::max((uint8_t)X, (uint8_t)(X >> 8)); -#endif -} + constexpr static size_t width(); + constexpr static 
mask_type mask(); -uint16_t sse::max_epu16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0xffffu ^ min_epu16(Xor(a, ones())); -#else - a = max_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, uint16_t>(a); -#endif -} +#if WJR_HAS_SIMD(AVX) -uint32_t sse::max_epu32(__m128i a) { - a = max_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = max_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, uint32_t>(a); -} + WJR_INTRINSIC_INLINE static __m256i concat(__m128i a, __m128i b); + + template + WJR_INTRINSIC_INLINE static int extract_epi32(__m256i v); + template + WJR_INTRINSIC_INLINE static int64_t extract_epi64(__m256i v); + + template + WJR_INTRINSIC_INLINE static int extract(__m256i v, int32_t); + template + WJR_INTRINSIC_INLINE static int64_t extract(__m256i v, int64_t); + + template + WJR_INTRINSIC_INLINE static __m128i extract_si128(__m256i v); + + WJR_INTRINSIC_INLINE static __m128i getlow(__m256i a); + + WJR_INTRINSIC_INLINE static __m128i gethigh(__m256i a); + + template + WJR_INTRINSIC_INLINE static __m256i insert_epi8(__m256i v, int8_t i); + template + WJR_INTRINSIC_INLINE static __m256i insert_epi16(__m256i v, int16_t i); + template + WJR_INTRINSIC_INLINE static __m256i insert_epi32(__m256i v, int32_t i); + template + WJR_INTRINSIC_INLINE static __m256i insert_epi64(__m256i v, int64_t i); + + template + WJR_INTRINSIC_INLINE static __m256i insert_si128(__m256i a, __m128i b); + + WJR_INTRINSIC_INLINE static __m256i load(const void *p); + WJR_INTRINSIC_INLINE static __m256i loadu(const void *p); + + WJR_INTRINSIC_INLINE static __m256i ones(); + + WJR_INTRINSIC_INLINE static __m256i loadu_si16(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si32(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si48(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si64(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si80(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si96(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si112(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si128(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si144(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si160(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si176(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si192(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si208(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si224(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si240(const void *ptr); + WJR_INTRINSIC_INLINE static __m256i loadu_si256(const void *ptr); + + WJR_INTRINSIC_INLINE static __m256i loadu_si16x(const void *ptr, int n); + + WJR_INTRINSIC_INLINE static __m256i + set_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, + char e24, char e23, char e22, char e21, char e20, char e19, char e18, + char e17, char e16, char e15, char e14, char e13, char e12, char e11, + char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, + char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m256i set_epi16(short e15, short e14, short e13, + short e12, short e11, short e10, + short e9, short e8, short e7, short e6, + short e5, short e4, short e3, short e2, + short e1, short e0); + + 
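// --- Usage sketch (editorial illustration; not part of the patch) ---
// loadu_si16x above generalizes the fixed-width loadu_siN loads: it reads
// n 16-bit units, so a buffer tail can be brought into a __m256i without
// touching bytes past the end. The helper `load_u16_tail` is hypothetical
// and assumes the AVX2 path is compiled in; the caller clamps n because
// 16 units of 16 bits already fill the whole register.
inline __m256i load_u16_tail(const uint16_t *p, size_t n) {
    return wjr::avx::loadu_si16x(p, static_cast<int>(n < 16 ? n : 16));
}
// --- end sketch ---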
WJR_INTRINSIC_INLINE static __m256i set_epi32(int e7, int e6, int e5, int e4, int e3, + int e2, int e1, int e0); + + WJR_INTRINSIC_INLINE static __m256i set_epi64x(long long e3, long long e2, + long long e1, long long e0); + + WJR_INTRINSIC_INLINE static __m256i + setr_epi8(char e31, char e30, char e29, char e28, char e27, char e26, char e25, + char e24, char e23, char e22, char e21, char e20, char e19, char e18, + char e17, char e16, char e15, char e14, char e13, char e12, char e11, + char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, + char e2, char e1, char e0); + + WJR_INTRINSIC_INLINE static __m256i setr_epi16(short e15, short e14, short e13, + short e12, short e11, short e10, + short e9, short e8, short e7, short e6, + short e5, short e4, short e3, short e2, + short e1, short e0); + + WJR_INTRINSIC_INLINE static __m256i setr_epi32(int e7, int e6, int e5, int e4, int e3, + int e2, int e1, int e0); + + WJR_INTRINSIC_INLINE static __m256i setr_epi64x(long long e3, long long e2, + long long e1, long long e0); + + WJR_INTRINSIC_INLINE static __m256i set1_epi8(int8_t a); + WJR_INTRINSIC_INLINE static __m256i set1_epi16(int16_t a); + WJR_INTRINSIC_INLINE static __m256i set1_epi32(int32_t a); + WJR_INTRINSIC_INLINE static __m256i set1_epi64(int64_t a); + + WJR_INTRINSIC_INLINE static __m256i set1(int8_t a, int8_t); + WJR_INTRINSIC_INLINE static __m256i set1(int16_t a, int16_t); + WJR_INTRINSIC_INLINE static __m256i set1(int32_t a, int32_t); + WJR_INTRINSIC_INLINE static __m256i set1(int64_t a, int64_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint8_t a, uint8_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint16_t a, uint16_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint32_t a, uint32_t); + WJR_INTRINSIC_INLINE static __m256i set1(uint64_t a, uint64_t); + + WJR_INTRINSIC_INLINE static __m256i setmin_epi8(); + WJR_INTRINSIC_INLINE static __m256i setmin_epi16(); + WJR_INTRINSIC_INLINE static __m256i setmin_epi32(); + WJR_INTRINSIC_INLINE static __m256i setmin_epi64(); -int8_t sse::max(__m128i a, int8_t) { return max_epi8(a); } -int16_t sse::max(__m128i a, int16_t) { return max_epi16(a); } -int32_t sse::max(__m128i a, int32_t) { return max_epi32(a); } -uint8_t sse::max(__m128i a, uint8_t) { return max_epu8(a); } -uint16_t sse::max(__m128i a, uint16_t) { return max_epu16(a); } -uint32_t sse::max(__m128i a, uint32_t) { return max_epu32(a); } + WJR_INTRINSIC_INLINE static __m256i setmin(int8_t); + WJR_INTRINSIC_INLINE static __m256i setmin(int16_t); + WJR_INTRINSIC_INLINE static __m256i setmin(int32_t); + WJR_INTRINSIC_INLINE static __m256i setmin(int64_t); -void sse::mfence() { _mm_mfence(); } + WJR_INTRINSIC_INLINE static __m256i setmax_epi8(); + WJR_INTRINSIC_INLINE static __m256i setmax_epi16(); + WJR_INTRINSIC_INLINE static __m256i setmax_epi32(); + WJR_INTRINSIC_INLINE static __m256i setmax_epi64(); -__m128i sse::min_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epi8(a, b); -#else - return blendv_epi8(a, b, cmpgt_epi8(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static __m256i setmax(int8_t); + WJR_INTRINSIC_INLINE static __m256i setmax(int16_t); + WJR_INTRINSIC_INLINE static __m256i setmax(int32_t); + WJR_INTRINSIC_INLINE static __m256i setmax(int64_t); -__m128i sse::min_epi16(__m128i a, __m128i b) { return _mm_min_epi16(a, b); } + WJR_INTRINSIC_INLINE static void stream(__m256i *p, __m256i a); -__m128i sse::min_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epi32(a, b); -#else - return blendv_epi8(a, b, 
cmpgt_epi32(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static void store(void *p, __m256i a); + WJR_INTRINSIC_INLINE static void storeu(void *p, __m256i a); -__m128i sse::min_epu8(__m128i a, __m128i b) { return _mm_min_epu8(a, b); } + WJR_INTRINSIC_INLINE static int test_all_zeros(__m256i a); -__m128i sse::min_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epu16(a, b); -#else - return blendv_epi8(a, b, cmpgt_epu16(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static int testc(__m256i a, __m256i b); -__m128i sse::min_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) - return _mm_min_epu32(a, b); -#else - return blendv_epi8(a, b, cmpgt_epu32(a, b)); -#endif -} + WJR_INTRINSIC_INLINE static int testnzc(__m256i a, __m256i b); -__m128i sse::min(__m128i a, __m128i b, int8_t) { return min_epi8(a, b); } -__m128i sse::min(__m128i a, __m128i b, int16_t) { return min_epi16(a, b); } -__m128i sse::min(__m128i a, __m128i b, int32_t) { return min_epi32(a, b); } -__m128i sse::min(__m128i a, __m128i b, uint8_t) { return min_epu8(a, b); } -__m128i sse::min(__m128i a, __m128i b, uint16_t) { return min_epu16(a, b); } -__m128i sse::min(__m128i a, __m128i b, uint32_t) { return min_epu32(a, b); } + WJR_INTRINSIC_INLINE static int testz(__m256i a, __m256i b); -int8_t sse::min_epi8(__m128i a) { return 0x80u ^ min_epu8(Xor(a, setmin_epi8())); } + WJR_INTRINSIC_INLINE static __m256i zeros(); -int16_t sse::min_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return 0x8000u ^ min_epu16(Xor(a, setmin_epi16())); -#else - a = min_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, int16_t>(a); -#endif -} +#endif // AVX -int32_t sse::min_epi32(__m128i a) { - a = min_epi32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epi32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, int32_t>(a); -} +#if WJR_HAS_SIMD(AVX2) -uint8_t sse::min_epu8(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - a = min_epu8(a, srli_epi16(a, 8)); - a = _mm_minpos_epu16(a); - return simd_cast<__m128i_t, uint8_t>(a); -#else - a = min_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - auto X = simd_cast<__m128i_t, uint32_t>(a); - return std::min((uint8_t)X, (uint8_t)(X >> 8)); -#endif -} + WJR_INTRINSIC_INLINE static __m256i And(__m256i a, __m256i b); -uint16_t sse::min_epu16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) - return simd_cast<__m128i_t, uint16_t>(_mm_minpos_epu16(a)); -#else - a = min_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); - return simd_cast<__m128i_t, uint16_t>(a); -#endif -} + WJR_INTRINSIC_INLINE static __m256i AndNot(__m256i a, __m256i b); -uint32_t sse::min_epu32(__m128i a) { - a = min_epu32(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); - a = min_epu32(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); - return simd_cast<__m128i_t, uint32_t>(a); -} + WJR_INTRINSIC_INLINE static __m256i Or(__m256i a, __m256i b); -int8_t sse::min(__m128i a, int8_t) { return min_epi8(a); } -int16_t sse::min(__m128i a, int16_t) { return min_epi16(a); } -int32_t sse::min(__m128i a, int32_t) { return min_epi32(a); } -uint8_t sse::min(__m128i a, uint8_t) { return min_epu8(a); } 
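// --- Editorial note (illustration; not part of the patch) ---
// The SSE4.1 branch of the min_epu8(__m128i) reduction above relies on a
// trick worth spelling out: srli_epi16(a, 8) zeroes the high byte of every
// 16-bit lane, so after min_epu8 each lane holds min(low byte, high byte)
// as a 16-bit value, and the _mm_minpos_epu16 reduction then returns the
// byte minimum. A standalone sketch of the same idea, using the sse wrappers:
inline uint8_t min_byte_sketch(__m128i a) {
    a = wjr::sse::min_epu8(a, wjr::sse::srli_epi16(a, 8));
    return static_cast<uint8_t>(wjr::sse::min_epu16(a)); // minpos-based reduce
}
// --- end sketch ---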
-uint16_t sse::min(__m128i a, uint16_t) { return min_epu16(a); } -uint32_t sse::min(__m128i a, uint32_t) { return min_epu32(a); } + WJR_INTRINSIC_INLINE static __m256i Xor(__m256i a, __m256i b); -__m128i sse::move_epi64(__m128i a) { return _mm_move_epi64(a); } + WJR_INTRINSIC_INLINE static __m256i Not(__m256i v); -sse::mask_type sse::movemask_epi8(__m128i a) { - return static_cast(_mm_movemask_epi8(a)); -} -sse::mask_type sse::movemask_pd(__m128d v) { - return static_cast(_mm_movemask_pd(v)); -} + WJR_INTRINSIC_INLINE static __m256i abs_epi8(__m256i v); + WJR_INTRINSIC_INLINE static __m256i abs_epi16(__m256i v); + WJR_INTRINSIC_INLINE static __m256i abs_epi32(__m256i v); -sse::mask_type sse::movemask(__m128i v, int8_t) { return movemask_epi8(v); } -sse::mask_type sse::movemask(__m128i v, int32_t) { - return movemask_ps(simd_cast<__m128i_t, __m128_t>(v)); -} -sse::mask_type sse::movemask(__m128i v, int64_t) { - return movemask_pd(simd_cast<__m128i_t, __m128d_t>(v)); -} -sse::mask_type sse::movemask(__m128i v, uint8_t) { return movemask(v, int8_t()); } -sse::mask_type sse::movemask(__m128i v, uint32_t) { return movemask(v, int32_t()); } -sse::mask_type sse::movemask(__m128i v, uint64_t) { return movemask(v, int64_t()); } + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int8_t); + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int16_t); + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int32_t); + WJR_INTRINSIC_INLINE static __m256i abs(__m256i v, int64_t); -__m128i sse::mul_epu32(__m128i a, __m128i b) { return _mm_mul_epu32(a, b); } + WJR_INTRINSIC_INLINE static __m256i add_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i add_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i add_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i add_epi64(__m256i a, __m256i b); -__m128i sse::mulhi_epi16(__m128i a, __m128i b) { return _mm_mulhi_epi16(a, b); } + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i add(__m256i a, __m256i b, uint64_t); -__m128i sse::mulhi_epu16(__m128i a, __m128i b) { return _mm_mulhi_epu16(a, b); } + WJR_INTRINSIC_INLINE static uint8_t add_epu8(__m256i v); + WJR_INTRINSIC_INLINE static uint16_t add_epu16(__m256i v); + WJR_INTRINSIC_INLINE static uint32_t add_epu32(__m256i v); + WJR_INTRINSIC_INLINE static uint64_t add_epu64(__m256i v); -__m128i sse::mullo_epi16(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); } + WJR_INTRINSIC_INLINE static int8_t add_epi8(__m256i v); + WJR_INTRINSIC_INLINE static int16_t add_epi16(__m256i v); + WJR_INTRINSIC_INLINE static int32_t add_epi32(__m256i v); + WJR_INTRINSIC_INLINE static int64_t add_epi64(__m256i v); -__m128i sse::negate_epi8(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) - return sign_epi8(a, ones()); -#else - return sub_epi8(zeros(), a); -#endif -} + WJR_INTRINSIC_INLINE static int8_t add(__m256i v, int8_t); + WJR_INTRINSIC_INLINE static int16_t add(__m256i v, int16_t); + WJR_INTRINSIC_INLINE static int32_t add(__m256i v, int32_t); + WJR_INTRINSIC_INLINE static int64_t add(__m256i v, 
int64_t); + WJR_INTRINSIC_INLINE static uint8_t add(__m256i v, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t add(__m256i v, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t add(__m256i v, uint32_t); + WJR_INTRINSIC_INLINE static uint64_t add(__m256i v, uint64_t); -__m128i sse::negate_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) - return sign_epi16(a, ones()); -#else - return sub_epi16(zeros(), a); -#endif -} + WJR_INTRINSIC_INLINE static __m256i adds_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i adds_epi16(__m256i a, __m256i b); -__m128i sse::negate_epi32(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) - return sign_epi32(a, ones()); -#else - return sub_epi32(zeros(), a); -#endif -} + WJR_INTRINSIC_INLINE static __m256i adds_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i adds_epu16(__m256i a, __m256i b); -__m128i sse::negate_epi64(__m128i a) { return sub_epi64(zeros(), a); } + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i adds(__m256i a, __m256i b, uint16_t); -__m128i sse::negate(__m128i a, int8_t) { return negate_epi8(a); } -__m128i sse::negate(__m128i a, int16_t) { return negate_epi16(a); } -__m128i sse::negate(__m128i a, int32_t) { return negate_epi32(a); } -__m128i sse::negate(__m128i a, int64_t) { return negate_epi64(a); } -__m128i sse::negate(__m128i a, uint8_t) { return negate_epi8(a); } -__m128i sse::negate(__m128i a, uint16_t) { return negate_epi16(a); } -__m128i sse::negate(__m128i a, uint32_t) { return negate_epi32(a); } -__m128i sse::negate(__m128i a, uint64_t) { return negate_epi64(a); } + template + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b); -__m128i sse::Not(__m128i v) { return Xor(v, ones()); } + WJR_INTRINSIC_INLINE static __m256i alignr_epi16(__m256i a, __m256i b, int c); + WJR_INTRINSIC_INLINE static __m256i alignr_epi32(__m256i a, __m256i b, int c); + WJR_INTRINSIC_INLINE static __m256i alignr_epi64(__m256i a, __m256i b, int c); -__m128i sse::Or(__m128i a, __m128i b) { return _mm_or_si128(a, b); } + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int16_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int32_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, int64_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint16_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint32_t); + WJR_INTRINSIC_INLINE static __m256i alignr(__m256i a, __m256i b, int c, uint64_t); -__m128i sse::packs_epi16(__m128i a, __m128i b) { return _mm_packs_epi16(a, b); } -__m128i sse::packs_epi32(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i avg_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i avg_epu16(__m256i a, __m256i b); -__m128i sse::packus_epi16(__m128i a, __m128i b) { return _mm_packus_epi16(a, b); } + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i avg(__m256i a, __m256i b, uint16_t); -__m128i sse::loadu_si48(const void *ptr) { - return insert_epi16<2>(loadu_si32(ptr), reinterpret_cast(ptr)[2]); -} + template + 
WJR_INTRINSIC_INLINE static __m256i blend_epi16(__m256i a, __m256i b); + template + WJR_INTRINSIC_INLINE static __m256i blend_epi32(__m256i a, __m256i b); -__m128i sse::loadu_si80(const void *ptr) { - return insert_epi16<4>(loadu_si64(ptr), reinterpret_cast(ptr)[4]); -} + WJR_INTRINSIC_INLINE static __m256i blendv_epi8(__m256i a, __m256i b, __m256i mask); -__m128i sse::loadu_si96(const void *ptr) { -#if WJR_HAS_SIMD(SSE4_1) - return insert_epi32<2>(loadu_si64(ptr), reinterpret_cast(ptr)[2]); -#else - return insert_epi16<5>(loadu_si80(ptr), reinterpret_cast(ptr)[5]); -#endif -} + template + WJR_INTRINSIC_INLINE static __m256i bslli_epi128(__m256i a); -__m128i sse::loadu_si112(const void *ptr) { - return insert_epi16<6>(loadu_si96(ptr), reinterpret_cast(ptr)[6]); -} + template + WJR_INTRINSIC_INLINE static __m256i bsrli_epi128(__m256i a); -__m128i sse::loadu_si128(const void *ptr) { return loadu(ptr); } + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpeq_epi64(__m256i a, __m256i b); -__m128i sse::loadu_si16x(const void *ptr, int n) { - switch (n) { - case 0: - return zeros(); - case 1: - return loadu_si16(ptr); - case 2: - return loadu_si32(ptr); - case 3: - return loadu_si48(ptr); - case 4: - return loadu_si64(ptr); - case 5: - return loadu_si80(ptr); - case 6: - return loadu_si96(ptr); - case 7: - return loadu_si112(ptr); - default: - return loadu_si128(ptr); - } -} + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i cmpeq(__m256i a, __m256i b, uint64_t); -__m128i sse::sad_epu8(__m128i a, __m128i b) { return _mm_sad_epu8(a, b); } + WJR_INTRINSIC_INLINE static __m256i cmpge_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epi32(__m256i a, __m256i b); -__m128i sse::zeros() { return _mm_setzero_si128(); } -__m128i sse::ones() { return _mm_set1_epi32(-1); } + WJR_INTRINSIC_INLINE static __m256i cmpge_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpge_epu32(__m256i a, __m256i b); -__m128i sse::set_epi8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, - char e8, char e7, char e6, char e5, char e4, char e3, char e2, - char e1, char e0) { - return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, - e0); -} + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpge(__m256i a, __m256i b, 
uint32_t); -__m128i sse::set_epi16(short e7, short e6, short e5, short e4, short e3, short e2, - short e1, short e0) { - return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); -} -__m128i sse::set_epi32(int e3, int e2, int e1, int e0) { - return _mm_set_epi32(e3, e2, e1, e0); -} -__m128i sse::set_epi64x(long long e1, long long e0) { return _mm_set_epi64x(e1, e0); } + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epi64(__m256i a, __m256i b); -__m128i sse::setr_epi8(char e15, char e14, char e13, char e12, char e11, char e10, - char e9, char e8, char e7, char e6, char e5, char e4, char e3, - char e2, char e1, char e0) { - return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, - e0); -} + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpgt_epu64(__m256i a, __m256i b); -__m128i sse::setr_epi16(short e7, short e6, short e5, short e4, short e3, short e2, - short e1, short e0) { - return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); -} -__m128i sse::setr_epi32(int e3, int e2, int e1, int e0) { - return _mm_setr_epi32(e3, e2, e1, e0); -} + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i cmpgt(__m256i a, __m256i b, uint64_t); -__m128i sse::set1_epi8(int8_t val) { return _mm_set1_epi8(val); } -__m128i sse::set1_epi16(int16_t val) { return _mm_set1_epi16(val); } -__m128i sse::set1_epi32(int32_t val) { return _mm_set1_epi32(val); } -__m128i sse::set1_epi64(int64_t val) { return _mm_set1_epi64x(val); } + WJR_INTRINSIC_INLINE static __m256i cmple_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epi32(__m256i a, __m256i b); -__m128i sse::set1(int8_t val, int8_t) { return set1_epi8(val); } -__m128i sse::set1(int16_t val, int16_t) { return set1_epi16(val); } -__m128i sse::set1(int32_t val, int32_t) { return set1_epi32(val); } -__m128i sse::set1(int64_t val, int64_t) { return set1_epi64(val); } -__m128i sse::set1(uint8_t val, uint8_t) { return set1_epi8(val); } -__m128i sse::set1(uint16_t val, uint16_t) { return set1_epi16(val); } -__m128i sse::set1(uint32_t val, uint32_t) { return set1_epi32(val); } -__m128i sse::set1(uint64_t val, uint64_t) { return set1_epi64(val); } + WJR_INTRINSIC_INLINE static __m256i cmple_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmple_epu32(__m256i a, __m256i b); -__m128i sse::setmin_epi8() { return set1_epi8(0x80u); } -__m128i sse::setmin_epi16() { return set1_epi16(0x8000u); } -__m128i sse::setmin_epi32() { return 
set1_epi32(0x80000000u); } + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmple(__m256i a, __m256i b, uint32_t); -__m128i sse::setmin(int8_t) { return setmin_epi8(); } -__m128i sse::setmin(int16_t) { return setmin_epi16(); } -__m128i sse::setmin(int32_t) { return setmin_epi32(); } -__m128i sse::setmin(uint8_t) { return set1_epi32(0); } -__m128i sse::setmin(uint16_t) { return set1_epi32(0); } -__m128i sse::setmin(uint32_t) { return set1_epi32(0); } + WJR_INTRINSIC_INLINE static __m256i cmplt_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epi32(__m256i a, __m256i b); -__m128i sse::setmax_epi8() { return set1_epi8(0x7F); } -__m128i sse::setmax_epi16() { return set1_epi16(0x7FFF); } -__m128i sse::setmax_epi32() { return set1_epi32(0x7FFFFFFF); } + WJR_INTRINSIC_INLINE static __m256i cmplt_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmplt_epu32(__m256i a, __m256i b); -__m128i sse::setmax(int8_t) { return setmax_epi8(); } -__m128i sse::setmax(int16_t) { return setmax_epi16(); } -__m128i sse::setmax(int32_t) { return setmax_epi32(); } -__m128i sse::setmax(uint8_t) { return set1_epi32(0xFFFFFFFF); } -__m128i sse::setmax(uint16_t) { return set1_epi32(0xFFFFFFFF); } -__m128i sse::setmax(uint32_t) { return set1_epi32(0xFFFFFFFF); } + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmplt(__m256i a, __m256i b, uint32_t); -template -__m128i sse::shl(__m128i a) { - if constexpr (imm >= 64) { - a = slli<8>(a); - a = slli_epi64(a, imm - 64); - return a; - } else { - auto b = slli_epi64(a, imm); - auto c = slli<8>(a); - c = srli_epi64(c, 64 - imm); - return Or(b, c); - } -} + WJR_INTRINSIC_INLINE static __m256i cmpne_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpne_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i cmpne_epi32(__m256i a, __m256i b); -template -__m128i sse::shr(__m128i a) { - if constexpr (imm >= 64) { - a = srli<8>(a); - a = srli_epi64(a, imm - 64); - return a; - } else { - auto b = srli_epi64(a, imm); - auto c = srli<8>(a); - c = slli_epi64(c, 64 - imm); - return Or(b, c); - } -} + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i cmpne(__m256i a, __m256i b, uint32_t); -template -__m128i sse::shuffle_epi32(__m128i v) { - static_assert(imm8 >= 0 && imm8 <= 255, "imm8 must be in 
range [0, 255]"); - return _mm_shuffle_epi32(v, imm8); -} + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::not_equal_to<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::greater_equal<>, + T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less<>, T); + template + WJR_INTRINSIC_INLINE static __m256i cmp(__m256i a, __m256i b, std::less_equal<>, T); -template -__m128i sse::shufflehi_epi16(__m128i v) { - return _mm_shufflehi_epi16(v, imm8); -} + template + WJR_INTRINSIC_INLINE static int extract_epi8(__m256i v); + template + WJR_INTRINSIC_INLINE static int extract_epi16(__m256i v); -template -__m128i sse::shufflelo_epi16(__m128i v) { - return _mm_shufflelo_epi16(v, imm8); -} + template + WJR_INTRINSIC_INLINE static int extract(__m256i v, int8_t); + template + WJR_INTRINSIC_INLINE static int extract(__m256i v, int16_t); -__m128i sse::sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } -__m128i sse::sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } -__m128i sse::sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i hadd_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i hadd_epi32(__m256i a, __m256i b); -__m128i sse::sll(__m128i a, __m128i b, int16_t) { return sll_epi16(a, b); } -__m128i sse::sll(__m128i a, __m128i b, int32_t) { return sll_epi32(a, b); } -__m128i sse::sll(__m128i a, __m128i b, int64_t) { return sll_epi64(a, b); } -__m128i sse::sll(__m128i a, __m128i b, uint16_t) { return sll_epi16(a, b); } -__m128i sse::sll(__m128i a, __m128i b, uint32_t) { return sll_epi32(a, b); } -__m128i sse::sll(__m128i a, __m128i b, uint64_t) { return sll_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i hadd(__m256i a, __m256i b, int32_t); -template -__m128i sse::slli(__m128i v) { - return _mm_slli_si128(v, imm8); -} -__m128i sse::slli_epi16(__m128i a, int imm8) { - if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { - return sse::add_epi16(a, a); - } + WJR_INTRINSIC_INLINE static __m256i hadds_epi16(__m256i a, __m256i b); - return _mm_slli_epi16(a, imm8); -} -__m128i sse::slli_epi32(__m128i a, int imm8) { - if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { - return sse::add_epi32(a, a); - } + WJR_INTRINSIC_INLINE static __m256i hsub_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i hsub_epi32(__m256i a, __m256i b); + + WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i hsub(__m256i a, __m256i b, int32_t); - return _mm_slli_epi32(a, imm8); -} -__m128i sse::slli_epi64(__m128i a, int imm8) { - if (WJR_BUILTIN_CONSTANT_P_TRUE(imm8 == 1)) { - return sse::add_epi64(a, a); - } + WJR_INTRINSIC_INLINE static __m256i hsubs_epi16(__m256i a, __m256i b); - return _mm_slli_epi64(a, imm8); -} + template )> + WJR_INTRINSIC_INLINE static __m256i logical_and(__m256i a, __m256i b, T); -__m128i sse::slli(__m128i a, int imm8, int16_t) { return slli_epi16(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, int32_t) { return slli_epi32(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, int64_t) { return slli_epi64(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, uint16_t) { return slli_epi16(a, imm8); } 
-__m128i sse::slli(__m128i a, int imm8, uint32_t) { return slli_epi32(a, imm8); } -__m128i sse::slli(__m128i a, int imm8, uint64_t) { return slli_epi64(a, imm8); } + template )> + WJR_INTRINSIC_INLINE static __m256i logical_not(__m256i v, T); -__m128i sse::sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } -__m128i sse::sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } + template )> + WJR_INTRINSIC_INLINE static __m256i logical_or(__m256i a, __m256i b, T); -__m128i sse::sra(__m128i a, __m128i b, int16_t) { return sra_epi16(a, b); } -__m128i sse::sra(__m128i a, __m128i b, int32_t) { return sra_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i madd_epi16(__m256i a, __m256i b); -__m128i sse::srai_epi16(__m128i a, int imm8) { return _mm_srai_epi16(a, imm8); } -__m128i sse::srai_epi32(__m128i a, int imm8) { return _mm_srai_epi32(a, imm8); } + WJR_INTRINSIC_INLINE static __m256i max_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epi32(__m256i a, __m256i b); -__m128i sse::srai(__m128i a, int imm8, int16_t) { return srai_epi16(a, imm8); } -__m128i sse::srai(__m128i a, int imm8, int32_t) { return srai_epi32(a, imm8); } + WJR_INTRINSIC_INLINE static __m256i max_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i max_epu32(__m256i a, __m256i b); -__m128i sse::srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); } -__m128i sse::srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); } -__m128i sse::srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i max(__m256i a, __m256i b, uint32_t); -__m128i sse::srl(__m128i a, __m128i b, int16_t) { return srl_epi16(a, b); } -__m128i sse::srl(__m128i a, __m128i b, int32_t) { return srl_epi32(a, b); } -__m128i sse::srl(__m128i a, __m128i b, int64_t) { return srl_epi64(a, b); } -__m128i sse::srl(__m128i a, __m128i b, uint16_t) { return srl_epi16(a, b); } -__m128i sse::srl(__m128i a, __m128i b, uint32_t) { return srl_epi32(a, b); } -__m128i sse::srl(__m128i a, __m128i b, uint64_t) { return srl_epi64(a, b); } + WJR_INTRINSIC_INLINE static int8_t max_epi8(__m256i a); + WJR_INTRINSIC_INLINE static int16_t max_epi16(__m256i a); + WJR_INTRINSIC_INLINE static int32_t max_epi32(__m256i a); + WJR_INTRINSIC_INLINE static uint8_t max_epu8(__m256i a); + WJR_INTRINSIC_INLINE static uint16_t max_epu16(__m256i a); + WJR_INTRINSIC_INLINE static uint32_t max_epu32(__m256i a); -template -__m128i sse::srli(__m128i v) { - return _mm_srli_si128(v, imm8); -} -__m128i sse::srli_epi8(__m128i a, int imm8) { - return And(srli_epi16(a, imm8), sse_detail::srli_epi8_mask[imm8]); -} -__m128i sse::srli_epi16(__m128i a, int imm8) { return _mm_srli_epi16(a, imm8); } -__m128i sse::srli_epi32(__m128i a, int imm8) { return _mm_srli_epi32(a, imm8); } -__m128i sse::srli_epi64(__m128i a, int imm8) { return _mm_srli_epi64(a, imm8); } + WJR_INTRINSIC_INLINE static int8_t max(__m256i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t max(__m256i a, int16_t); + 
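// --- Editorial note (illustration; not part of the patch) ---
// x86 has no 8-bit vector shift, so the srli_epi8 above emulates one: shift
// whole 16-bit lanes, then AND with set1_epi8(0xFF >> imm8) to clear the
// bits that crossed in from each lane's high byte; the srli_epi8_mask
// tables cache exactly those masks. Scalar model of one 16-bit lane:
inline uint16_t srli_epi8_model(uint16_t lane, int s) {
    const uint16_t m = static_cast<uint16_t>(((0xFFu >> s) << 8) | (0xFFu >> s));
    return static_cast<uint16_t>((lane >> s) & m); // == (hi >> s, lo >> s)
}
// --- end sketch ---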
WJR_INTRINSIC_INLINE static int32_t max(__m256i a, int32_t); -__m128i sse::srli(__m128i a, int imm8, int8_t) { return srli_epi8(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, int16_t) { return srli_epi16(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, int32_t) { return srli_epi32(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, int64_t) { return srli_epi64(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint8_t) { return srli_epi8(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint16_t) { return srli_epi16(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint32_t) { return srli_epi32(a, imm8); } -__m128i sse::srli(__m128i a, int imm8, uint64_t) { return srli_epi64(a, imm8); } + WJR_INTRINSIC_INLINE static uint8_t max(__m256i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t max(__m256i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t max(__m256i a, uint32_t); -void sse::stream(__m128i *ptr, __m128i v) { _mm_stream_si128(ptr, v); } + WJR_INTRINSIC_INLINE static __m256i min_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epi32(__m256i a, __m256i b); -void sse::store(void *ptr, __m128i val) { - _mm_store_si128(static_cast<__m128i *>(ptr), val); -} -void sse::storeu(void *ptr, __m128i val) { - _mm_storeu_si128(static_cast<__m128i *>(ptr), val); -} + WJR_INTRINSIC_INLINE static __m256i min_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epu16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i min_epu32(__m256i a, __m256i b); -__m128i sse::sub_epi8(__m128i a, __m128i b) { return _mm_sub_epi8(a, b); } -__m128i sse::sub_epi16(__m128i a, __m128i b) { return _mm_sub_epi16(a, b); } -__m128i sse::sub_epi32(__m128i a, __m128i b) { return _mm_sub_epi32(a, b); } -__m128i sse::sub_epi64(__m128i a, __m128i b) { return _mm_sub_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i min(__m256i a, __m256i b, uint32_t); -__m128i sse::sub(__m128i a, __m128i b, int8_t) { return sub_epi8(a, b); } -__m128i sse::sub(__m128i a, __m128i b, int16_t) { return sub_epi16(a, b); } -__m128i sse::sub(__m128i a, __m128i b, int32_t) { return sub_epi32(a, b); } -__m128i sse::sub(__m128i a, __m128i b, int64_t) { return sub_epi64(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint8_t) { return sub_epi8(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint16_t) { return sub_epi16(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint32_t) { return sub_epi32(a, b); } -__m128i sse::sub(__m128i a, __m128i b, uint64_t) { return sub_epi64(a, b); } + WJR_INTRINSIC_INLINE static int8_t min_epi8(__m256i a); + WJR_INTRINSIC_INLINE static int16_t min_epi16(__m256i a); + WJR_INTRINSIC_INLINE static int32_t min_epi32(__m256i a); -__m128i sse::subs_epi8(__m128i a, __m128i b) { return _mm_subs_epi8(a, b); } -__m128i sse::subs_epi16(__m128i a, __m128i b) { return _mm_subs_epi16(a, b); } + WJR_INTRINSIC_INLINE static uint8_t min_epu8(__m256i a); + WJR_INTRINSIC_INLINE static uint16_t min_epu16(__m256i a); + WJR_INTRINSIC_INLINE static uint32_t min_epu32(__m256i a); -__m128i sse::subs_epu8(__m128i a, __m128i b) { return 
_mm_subs_epu8(a, b); } -__m128i sse::subs_epu16(__m128i a, __m128i b) { return _mm_subs_epu16(a, b); } + WJR_INTRINSIC_INLINE static int8_t min(__m256i a, int8_t); + WJR_INTRINSIC_INLINE static int16_t min(__m256i a, int16_t); + WJR_INTRINSIC_INLINE static int32_t min(__m256i a, int32_t); + WJR_INTRINSIC_INLINE static uint8_t min(__m256i a, uint8_t); + WJR_INTRINSIC_INLINE static uint16_t min(__m256i a, uint16_t); + WJR_INTRINSIC_INLINE static uint32_t min(__m256i a, uint32_t); -__m128i sse::subs(__m128i a, __m128i b, int8_t) { return subs_epi8(a, b); } -__m128i sse::subs(__m128i a, __m128i b, int16_t) { return subs_epi16(a, b); } -__m128i sse::subs(__m128i a, __m128i b, uint8_t) { return subs_epu8(a, b); } -__m128i sse::subs(__m128i a, __m128i b, uint16_t) { return subs_epu16(a, b); } + WJR_INTRINSIC_INLINE static mask_type movemask_epi8(__m256i a); -__m128i sse::unpackhi_epi8(__m128i a, __m128i b) { return _mm_unpackhi_epi8(a, b); } -__m128i sse::unpackhi_epi16(__m128i a, __m128i b) { return _mm_unpackhi_epi16(a, b); } -__m128i sse::unpackhi_epi32(__m128i a, __m128i b) { return _mm_unpackhi_epi32(a, b); } -__m128i sse::unpackhi_epi64(__m128i a, __m128i b) { return _mm_unpackhi_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i mul_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i mul_epu32(__m256i a, __m256i b); -__m128i sse::unpackhi(__m128i a, __m128i b, int8_t) { return unpackhi_epi8(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, int16_t) { return unpackhi_epi16(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, int32_t) { return unpackhi_epi32(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, int64_t) { return unpackhi_epi64(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint8_t) { return unpackhi_epi8(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint16_t) { return unpackhi_epi16(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint32_t) { return unpackhi_epi32(a, b); } -__m128i sse::unpackhi(__m128i a, __m128i b, uint64_t) { return unpackhi_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i mulhi_epi16(__m256i a, __m256i b); -__m128i sse::unpacklo_epi8(__m128i a, __m128i b) { return _mm_unpacklo_epi8(a, b); } -__m128i sse::unpacklo_epi16(__m128i a, __m128i b) { return _mm_unpacklo_epi16(a, b); } -__m128i sse::unpacklo_epi32(__m128i a, __m128i b) { return _mm_unpacklo_epi32(a, b); } -__m128i sse::unpacklo_epi64(__m128i a, __m128i b) { return _mm_unpacklo_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i mulhi_epu16(__m256i a, __m256i b); -__m128i sse::unpacklo(__m128i a, __m128i b, int8_t) { return unpacklo_epi8(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, int16_t) { return unpacklo_epi16(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, int32_t) { return unpacklo_epi32(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, int64_t) { return unpacklo_epi64(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, uint8_t) { return unpacklo_epi8(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, uint16_t) { return unpacklo_epi16(a, b); } -__m128i sse::unpacklo(__m128i a, __m128i b, uint32_t) { return unpacklo_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i mullo_epi16(__m256i a, __m256i b); -__m128i sse::Xor(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + WJR_INTRINSIC_INLINE static __m256i packs_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i packs_epi32(__m256i a, __m256i b); -#endif + WJR_INTRINSIC_INLINE static __m256i packus_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE 
static __m256i packus_epi32(__m256i a, __m256i b); -#if WJR_HAS_SIMD(SSE3) + template + WJR_INTRINSIC_INLINE static __m256i shl(__m256i a); -__m128i sse::lddqu(const __m128i *ptr) { return _mm_lddqu_si128(ptr); } + template + WJR_INTRINSIC_INLINE static __m256i shr(__m256i a); -#endif + WJR_INTRINSIC_INLINE static __m256i shuffle_epi8(__m256i a, __m256i b); + template + WJR_INTRINSIC_INLINE static __m256i shuffle_epi32(__m256i a); -#if WJR_HAS_SIMD(SSSE3) + template + WJR_INTRINSIC_INLINE static __m256i shufflehi_epi16(__m256i a); -__m128i sse::abs_epi8(__m128i val) { return _mm_abs_epi8(val); } -__m128i sse::abs_epi16(__m128i val) { return _mm_abs_epi16(val); } -__m128i sse::abs_epi32(__m128i val) { return _mm_abs_epi32(val); } + template + WJR_INTRINSIC_INLINE static __m256i shufflelo_epi16(__m256i a); -__m128i sse::abs(__m128i val, int8_t) { return abs_epi8(val); } -__m128i sse::abs(__m128i val, int16_t) { return abs_epi16(val); } -__m128i sse::abs(__m128i val, int32_t) { return abs_epi32(val); } -__m128i sse::abs(__m128i val, uint8_t) { return val; } -__m128i sse::abs(__m128i val, uint16_t) { return val; } -__m128i sse::abs(__m128i val, uint32_t) { return val; } + WJR_INTRINSIC_INLINE static __m256i sll_epi16(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i sll_epi32(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i sll_epi64(__m256i a, __m128i b); -__m128i sse::shuffle_epi8(__m128i v, __m128i imm8) { return _mm_shuffle_epi8(v, imm8); } + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i sll(__m256i a, __m128i b, uint64_t); -__m128i sse::sign_epi8(__m128i a, __m128i b) { return _mm_sign_epi8(a, b); } -__m128i sse::sign_epi16(__m128i a, __m128i b) { return _mm_sign_epi16(a, b); } -__m128i sse::sign_epi32(__m128i a, __m128i b) { return _mm_sign_epi32(a, b); } + template + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a); + WJR_INTRINSIC_INLINE static __m256i slli_epi16(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i slli_epi32(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i slli_epi64(__m256i a, int imm8); -__m128i sse::sign(__m128i a, __m128i b, int8_t) { return sign_epi8(a, b); } -__m128i sse::sign(__m128i a, __m128i b, int16_t) { return sign_epi16(a, b); } -__m128i sse::sign(__m128i a, __m128i b, int32_t) { return sign_epi32(a, b); } -__m128i sse::sign(__m128i a, __m128i b, uint8_t) { return sign_epi8(a, b); } -__m128i sse::sign(__m128i a, __m128i b, uint16_t) { return sign_epi16(a, b); } -__m128i sse::sign(__m128i a, __m128i b, uint32_t) { return sign_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int16_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int32_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, int64_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint16_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint32_t); + WJR_INTRINSIC_INLINE static __m256i slli(__m256i a, int imm8, uint64_t); -#endif + WJR_INTRINSIC_INLINE static __m256i sra_epi16(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i sra_epi32(__m256i a, __m128i b); -#if WJR_HAS_SIMD(SSE4_1) + 
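// --- Editorial note (illustration; not part of the patch) ---
// The shl/shr templates above are whole-register shifts. The removed
// 128-bit versions show the recipe: for a shift below 64, shift each 64-bit
// half, then OR in the bits carried across the half boundary via a
// byte-wise register shift; for shifts of 64 or more, move the low half
// into the high half and shift by the remainder. Scalar model over two
// 64-bit halves (hi:lo), hypothetical helper, imm restricted to 1..63:
inline void shl128_model(uint64_t &hi, uint64_t &lo, unsigned imm) {
    hi = (hi << imm) | (lo >> (64 - imm)); // carry crosses the boundary
    lo = lo << imm;
}
// --- end sketch ---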
WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i sra(__m256i a, __m128i b, int32_t); -template -__m128i sse::blend_epi16(__m128i a, __m128i b) { - return _mm_blend_epi16(a, b, imm8); -} + WJR_INTRINSIC_INLINE static __m256i srai_epi16(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srai_epi32(__m256i a, int imm8); -__m128i sse::cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int16_t); + WJR_INTRINSIC_INLINE static __m256i srai(__m256i a, int imm8, int32_t); -__m128i sse::cmpeq(__m128i a, __m128i b, int64_t) { return cmpeq_epi64(a, b); } -__m128i sse::cmpeq(__m128i a, __m128i b, uint64_t) { return cmpeq_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i stream_load(const void *p); -__m128i sse::cmpgt_epi64(__m128i a, __m128i b) { return _mm_cmpgt_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i srl_epi16(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i srl_epi32(__m256i a, __m128i b); + WJR_INTRINSIC_INLINE static __m256i srl_epi64(__m256i a, __m128i b); -__m128i sse::cmpgt(__m128i a, __m128i b, int64_t) { return cmpgt_epi64(a, b); } + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i srl(__m256i a, __m128i b, uint64_t); -template -__m128i sse::insert_epi8(__m128i a, int i) { - return _mm_insert_epi8(a, i, imm8); -} + template + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a); + WJR_INTRINSIC_INLINE static __m256i srli_epi8(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srli_epi16(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srli_epi32(__m256i a, int imm8); + WJR_INTRINSIC_INLINE static __m256i srli_epi64(__m256i a, int imm8); -template -__m128i sse::insert_epi32(__m128i a, int i) { - return _mm_insert_epi32(a, i, imm8); -} + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int8_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int16_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int32_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, int64_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint8_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint16_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint32_t); + WJR_INTRINSIC_INLINE static __m256i srli(__m256i a, int imm8, uint64_t); -template -__m128i sse::insert_epi64(__m128i a, int64_t i) { - return _mm_insert_epi64(a, i, imm8); -} + WJR_INTRINSIC_INLINE static __m256i sub_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i sub_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i sub_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i sub_epi64(__m256i a, __m256i b); -template -__m128i sse::insert(__m128i a, int i, int8_t) { - return insert_epi8(a, i); -} + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i 
sub(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i sub(__m256i a, __m256i b, uint64_t); -template -__m128i sse::insert(__m128i a, int i, int32_t) { - return insert_epi32(a, i); -} + WJR_INTRINSIC_INLINE static __m256i subs_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i subs_epi16(__m256i a, __m256i b); -template -__m128i sse::insert(__m128i a, int64_t i, int64_t) { - return insert_epi64(a, i); -} + WJR_INTRINSIC_INLINE static __m256i subs_epu8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i subs_epu16(__m256i a, __m256i b); -template -__m128i sse::insert(__m128i a, int i, uint8_t) { - return insert_epi8(a, i); -} + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i subs(__m256i a, __m256i b, uint16_t); -template -__m128i sse::insert(__m128i a, int i, uint32_t) { - return insert_epi32(a, i); -} + WJR_INTRINSIC_INLINE static int test_all_ones(__m256i a); -template -__m128i sse::insert(__m128i a, int64_t i, uint64_t) { - return insert_epi64(a, i); -} + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpackhi_epi64(__m256i a, __m256i b); -__m128i sse::minpos_epu16(__m128i a) { return _mm_minpos_epu16(a); } + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint32_t); + WJR_INTRINSIC_INLINE static __m256i unpackhi(__m256i a, __m256i b, uint64_t); -__m128i sse::mul_epi32(__m128i a, __m128i b) { return _mm_mul_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi8(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi16(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi32(__m256i a, __m256i b); + WJR_INTRINSIC_INLINE static __m256i unpacklo_epi64(__m256i a, __m256i b); -__m128i sse::mullo_epi32(__m128i a, __m128i b) { return _mm_mullo_epi32(a, b); } + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int8_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int16_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int32_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, int64_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint8_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint16_t); + WJR_INTRINSIC_INLINE static __m256i unpacklo(__m256i a, __m256i b, uint32_t); -__m128i sse::packus_epi32(__m128i a, __m128i b) { return _mm_packus_epi32(a, 
b); } +#endif // AVX2 +}; -__m128i sse::stream_load(void *p) { - return _mm_stream_load_si128(static_cast<__m128i *>(p)); -} +namespace avx_detail { +#if WJR_HAS_SIMD(AVX2) -int sse::test_all_ones(__m128i a) { return _mm_test_all_ones(a); } +const static __m256i srli_epi8_mask[8] = { + avx::set1_epi16(0xFFFF), avx::set1_epi16(0x7F7F), avx::set1_epi16(0x3F3F), + avx::set1_epi16(0x1F1F), avx::set1_epi16(0xF0F), avx::set1_epi16(0x707), + avx::set1_epi16(0x303), avx::set1_epi16(0x101), +}; -int sse::test_all_zeros(__m128i a, __m128i b) { return _mm_test_all_zeros(a, b); } +#endif +} // namespace avx_detail -int sse::test_all_zeros(__m128i a) { return _mm_test_all_zeros(a, a); } +#if WJR_HAS_SIMD(AVX) -int sse::test_mix_ones_zeros(__m128i a, __m128i b) { - return _mm_test_mix_ones_zeros(a, b); -} +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint8_t v) const { + return _mm256_set1_epi8(v); + } +}; -int sse::testc(__m128i a, __m128i b) { return _mm_testc_si128(a, b); } +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint16_t v) const { + return _mm256_set1_epi16(v); + } +}; -int sse::testnzc(__m128i a, __m128i b) { return _mm_testnzc_si128(a, b); } +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint32_t v) const { + return _mm256_set1_epi32(v); + } +}; -int sse::testz(__m128i a, __m128i b) { return _mm_testz_si128(a, b); } +template <> +struct broadcast_fn { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(uint64_t v) const { + return _mm256_set1_epi64x(v); + } +}; + +template <> +struct broadcast_fn<__m256i_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m256i v) const { return v; } +}; +template <> +struct broadcast_fn<__m128i_t, __m256i_t> { + WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { +#if WJR_HAS_SIMD(AVX2) + return _mm256_broadcastsi128_si256(v); +#else + return _mm256_insertf128_si256(_mm256_castsi128_si256(v), v, 1); #endif + } +}; + +#endif // AVX /*------------------------avx------------------------*/ @@ -7647,8 +8346,14 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, #endif -#define WJR_REGISTER_NORMAL_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, ADVANCE, \ - INIT, RET) \ +} // namespace wjr + +#endif // WJR_X86_SIMD_AVX_HPP__ + +namespace wjr { + +#define WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, ADVANCE, \ + INIT, RET) \ if (WJR_UNLIKELY(N <= 16)) { \ if (WJR_UNLIKELY(N <= 4)) { \ UNROLL2(N - 2); \ @@ -7698,8 +8403,8 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, WJR_PP_BOOL_IF(IS_UNROLL_8, \ }, ) -#define WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, \ - ADVANCE, INIT, RET) \ +#define WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION(N, UNROLL2, UNROLL4, IS_UNROLL_8, \ + ADVANCE, INIT, RET) \ if (WJR_UNLIKELY(N <= 16)) { \ if (WJR_UNLIKELY(N <= 4)) { \ UNROLL2(0); \ @@ -7749,6 +8454,151 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, WJR_PP_BOOL_IF(IS_UNROLL_8, \ }, ) +template +class __x86_simd_base { + static constexpr size_t BitWidth = Simd::width(); + using int_type = typename Simd::int_type; + using Mybase = fixed_size_simd; + +public: + using mask_type = simd_detail::basic_simd_mask; + + WJR_ENABLE_DEFAULT_SPECIAL_MEMBERS(__x86_simd_base); + + template )> + __x86_simd_base(U value) noexcept : m_data(Simd::set1(value, U())) {} + + template + 
__x86_simd_base(const T *mem, Flags flags = {}) noexcept { + copy_from(mem, flags); + } + + void copy_from(const T *mem, element_aligned_t = {}) noexcept { + m_data = Simd::loadu(mem); + } + + void copy_from(const T *mem, vector_aligned_t) noexcept { m_data = Simd::load(mem); } + + void copy_to(T *mem, element_aligned_t = {}) noexcept { Simd::storeu(mem, m_data); } + + void copy_to(T *mem, vector_aligned_t) noexcept { Simd::store(mem, m_data); } + + Mybase &operator&=(const Mybase &other) noexcept { + m_data = Simd::And(m_data, other.m_data); + return static_cast(*this); + } + + friend Mybase operator&(const Mybase &lhs, const Mybase &rhs) noexcept { + Mybase ret(lhs); + ret &= rhs; + return ret; + } + + Mybase &operator|=(const Mybase &other) noexcept { + m_data = Simd::Or(m_data, other.m_data); + return static_cast(*this); + } + + friend Mybase operator|(const Mybase &lhs, const Mybase &rhs) noexcept { + Mybase ret(lhs); + ret |= rhs; + return ret; + } + + Mybase &operator^=(const Mybase &other) noexcept { + m_data = Simd::Xor(m_data, other.m_data); + return static_cast(*this); + } + + friend Mybase operator^(const Mybase &lhs, const Mybase &rhs) noexcept { + Mybase ret(lhs); + ret ^= rhs; + return ret; + } + + friend constexpr mask_type operator==(const Mybase &lhs, const Mybase &rhs) noexcept { + return Simd::movemask_epi8(Simd::cmpeq(lhs.m_data, rhs.m_data, T())); + } + +private: + int_type m_data; +}; + +#if WJR_HAS_SIMD(SSE2) +#define WJR_HAS_SIMD_NATIVE_128BIT WJR_HAS_DEF + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +#endif + +#if WJR_HAS_SIMD(AVX2) +#define WJR_HAS_SIMD_NATIVE_256BIT WJR_HAS_DEF + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> + : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +template <> +class simd> : public __x86_simd_base { + using Mybase = __x86_simd_base; + +public: + using Mybase::Mybase; +}; + +#endif + } // namespace wjr #endif // WJR_X86_SIMD_SIMD_HPP__ @@ -7921,121 +8771,7 @@ std::basic_ostream &__ostream_insert(std::basic_ostream -#ifndef WJR_ASSERT_HPP__ -#define WJR_ASSERT_HPP__ - -/** - * @file assert.hpp - * @author wjr - * @brief Assertion utilities - * - * @details WJR_DEBUG_LEVEL : 0 ~ 3 \n - * 0 : Release \n - * 1 : Beta \n - * 2 : Runtime detect \n - * 3 : Maximize runtime detect, for debug \n - * If WJR_DEBUG_LEVEL is not defined, \n - * If NDEBUG is defined, WJR_DEBUG_LEVEL is set to 0 by default. \n - * Otherwise, WJR_DEBUG_LEVEL is set to 1 by default. \n - * WJR_ASSERT_L(level, expr) : Specify the level of assertion, \n - * if the WJR_DEBUG_LEVEL is greater than or equal to the level, \n - * the assertion is executed. 
\n - * WJR_ASSERT(expr) : Equivalent to WJR_ASSERT_L(1, expr) \n - * WJR_ASSERT_0(expr) : Always execute the assertion \n - * - * @version 0.1 - * @date 2024-06-01 - * - * @copyright Copyright (c) 2024 - * - */ - -#include - // Already included - -#ifndef WJR_DEBUG_LEVEL -#if defined(NDEBUG) -#define WJR_DEBUG_LEVEL 0 -#else -#define WJR_DEBUG_LEVEL 1 -#endif -#endif - -#if WJR_DEBUG_LEVEL < 0 || WJR_DEBUG_LEVEL > 3 -#error "WJR_DEBUG_LEVEL must be 0 ~ 3" -#endif - -namespace wjr { - -#define WJR_DEBUG_IF(level, expr0, expr1) \ - WJR_PP_BOOL_IF(WJR_PP_GT(WJR_DEBUG_LEVEL, level), expr0, expr1) - -WJR_NORETURN extern void __assert_failed(const char *expr, const char *file, - const char *func, int line) noexcept; - -// LCOV_EXCL_START - -/// @private -template -WJR_NOINLINE void __assert_handler(const char *expr, const char *file, const char *func, - int line, Args &&...args) noexcept { - std::cerr << "Additional information: "; - (void)(std::cerr << ... << std::forward(args)); - std::cerr << '\n'; - __assert_failed(expr, file, func, line); -} - -/// @private -inline void __assert_handler(const char *expr, const char *file, const char *func, - int line) noexcept { - __assert_failed(expr, file, func, line); -} - -// LCOV_EXCL_STOP - -#define WJR_ASSERT_CHECK_I(expr, ...) \ - do { \ - if (WJR_UNLIKELY(!(expr))) { \ - ::wjr::__assert_handler(#expr, WJR_FILE, WJR_CURRENT_FUNCTION, WJR_LINE, \ - ##__VA_ARGS__); \ - } \ - } while (0) - -// do nothing -#define WJR_ASSERT_UNCHECK_I(expr, ...) \ - do { \ - } while (0) - -// level = [0, 2] -// The higher the level, the less likely it is to be detected -// Runtime detect : 1 -// Maximize detect : 2 -#define WJR_ASSERT_L(level, ...) \ - WJR_DEBUG_IF(level, WJR_ASSERT_CHECK_I, WJR_ASSERT_UNCHECK_I) \ - (__VA_ARGS__) - -// level of assert is zero at default. -#define WJR_ASSERT_L0(...) WJR_ASSERT_CHECK_I(__VA_ARGS__) -#define WJR_ASSERT_L1(...) WJR_ASSERT_L(1, __VA_ARGS__) -#define WJR_ASSERT_L2(...) WJR_ASSERT_L(2, __VA_ARGS__) -#define WJR_ASSERT_L3(...) WJR_ASSERT_L(3, __VA_ARGS__) -#define WJR_ASSERT(...) WJR_ASSERT_L1(__VA_ARGS__) - -#define WJR_ASSERT_ASSUME_L(level, ...) \ - WJR_ASSERT_L(level, __VA_ARGS__); \ - __WJR_ASSERT_ASSUME_L_ASSUME(__VA_ARGS__) -#define __WJR_ASSERT_ASSUME_L_ASSUME(expr, ...) WJR_ASSUME(expr) - -#define WJR_ASSERT_ASSUME_L0(...) WJR_ASSERT_ASSUME_L(0, __VA_ARGS__) -#define WJR_ASSERT_ASSUME_L1(...) WJR_ASSERT_ASSUME_L(1, __VA_ARGS__) -#define WJR_ASSERT_ASSUME_L2(...) WJR_ASSERT_ASSUME_L(2, __VA_ARGS__) -#define WJR_ASSERT_ASSUME_L3(...) WJR_ASSERT_ASSUME_L(3, __VA_ARGS__) -#define WJR_ASSERT_ASSUME(...) WJR_ASSERT_ASSUME_L1(__VA_ARGS__) - -} // namespace wjr - -#endif // WJR_ASSERT_HPP__ #ifndef WJR_CONTAINER_GENERIC_TYPE_TRAITS_HPP__ #define WJR_CONTAINER_GENERIC_TYPE_TRAITS_HPP__ @@ -11344,162 +12080,7 @@ struct pointer_traits> { } // namespace std #endif // WJR_ITERATOR_CONTIGUOUS_ITERATOR_ADAPTER_HPP__ -#ifndef WJR_MATH_DETAIL_HPP__ -#define WJR_MATH_DETAIL_HPP__ - // Already included - -namespace wjr { - -#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) - -namespace math_detail { - -template -class de_bruijn { -public: - constexpr static uint8_t digits = std::numeric_limits::digits; - constexpr static uint8_t mv = digits == 32 ? 
27 : 58; - constexpr de_bruijn() noexcept : lookup(), lookupr() { initialize(); } - - constexpr int get(T idx) const noexcept { return lookup[(idx * seed) >> mv]; } - constexpr int getr(T idx) const noexcept { return lookupr[(idx * seed) >> mv]; } - -private: - constexpr void initialize() noexcept { - for (uint8_t i = 0; i < digits; ++i) { - const auto idx = (seed << i) >> mv; - lookup[idx] = i; - lookupr[idx] = i == 0 ? 0 : digits - i; - } - } - - uint8_t lookup[digits]; - uint8_t lookupr[digits]; -}; - -inline constexpr de_bruijn de_bruijn32 = {}; -inline constexpr de_bruijn de_bruijn64 = {}; - -} // namespace math_detail - -#endif - -/** - * @brief - * - * @note `n & -n` is the lowest bit of n. - */ -template )> -WJR_CONST constexpr T lowbit(T n) noexcept { - return n & -n; -} - -template )> -WJR_CONST constexpr T clear_lowbit(T n) noexcept { - return n & (n - 1); -} - -// preview : - -template )> -WJR_CONST constexpr bool is_zero_or_single_bit(T n) noexcept { - return (n & (n - 1)) == 0; -} - -template )> -WJR_CONST constexpr bool __has_high_bit(T n) noexcept { - return n >> (std::numeric_limits::digits - 1); -} - -template )> -WJR_CONST constexpr T __ceil_div(T n, type_identity_t div) noexcept { - return (n + div - 1) / div; -} - -template )> -WJR_CONST constexpr T __align_down(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return n & (-alignment); -} - -template )> -WJR_CONST constexpr T __align_down_offset(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return n & (alignment - 1); -} - -template )> -WJR_CONST constexpr T __align_up(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return (n + alignment - 1) & (-alignment); -} - -template )> -WJR_CONST constexpr T __align_up_offset(T n, type_identity_t alignment) noexcept { - WJR_ASSERT_ASSUME_L2(is_zero_or_single_bit(alignment)); - return (-n) & (alignment - 1); -} - -template )> -WJR_CONST constexpr std::make_signed_t __fasts_from_unsigned(T x) noexcept { - const std::make_signed_t ret = x; - WJR_ASSERT_ASSUME_L2(ret >= 0, "overflow"); - return ret; -} - -template , - WJR_REQUIRES(is_nonbool_signed_integral_v)> -WJR_CONST constexpr U __fasts_abs(T x) noexcept { - return static_cast(x < 0 ? -x : x); -} - -template )> -WJR_CONST constexpr T __fasts_negate(T x) noexcept { - return -x; -} - -template , - WJR_REQUIRES(is_nonbool_signed_integral_v)> -WJR_CONST constexpr T __fasts_conditional_negate(bool condition, T x) noexcept { - return condition ? -x : x; -} - -template , - WJR_REQUIRES(is_nonbool_signed_integral_v)> -WJR_CONST constexpr T __fasts_negate_with(T condition, T x) noexcept { - return __fasts_conditional_negate(condition < 0, x); -} - -template )> -WJR_CONST constexpr T __fasts_increment(T x) noexcept { - WJR_ASSERT_L2(x != std::numeric_limits::min() && - x != std::numeric_limits::max(), - "overflow"); - - return x < 0 ? x - 1 : x + 1; -} - -template )> -WJR_CONST constexpr T __fasts_decrement(T x) noexcept { - WJR_ASSERT_L2(x != 0 && x + 1 != T(0), "overflow"); - - return x < 0 ? x + 1 : x - 1; -} - -template )> -WJR_CONST constexpr T __fasts_add(T x, std::make_unsigned_t y) noexcept { - return x < 0 ? x - y : x + y; -} - -template )> -WJR_CONST constexpr T __fasts_sub(T x, std::make_unsigned_t y) noexcept { - return x < 0 ? 
x + y : x - y; -} - -} // namespace wjr - -#endif // WJR_MATH_DETAIL_HPP__ #ifndef WJR_MEMORY_COPY_HPP__ #define WJR_MEMORY_COPY_HPP__ @@ -12375,334 +12956,8 @@ constexpr void replace_uninit(list_node *from, list_node *to) noexcept { #ifndef WJR_MATH_BIT_HPP__ #define WJR_MATH_BIT_HPP__ -#ifndef WJR_MATH_CLZ_HPP__ -#define WJR_MATH_CLZ_HPP__ - // Already included -#ifndef WJR_MATH_POPCOUNT_HPP__ -#define WJR_MATH_POPCOUNT_HPP__ - // Already included - -namespace wjr { - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR int fallback_popcount(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - if constexpr (nd < 32) { - return fallback_popcount(static_cast(x)); - } else { - if constexpr (nd == 32) { - x -= (x >> 1) & 0x5555'5555; - x = (x & 0x3333'3333) + ((x >> 2) & 0x3333'3333); - x = (x + (x >> 4)) & 0x0f0f'0f0f; - return (x * 0x0101'0101) >> 24; - } else { - x -= (x >> 1) & 0x5555'5555'5555'5555; - x = (x & 0x3333'3333'3333'3333) + ((x >> 2) & 0x3333'3333'3333'3333); - x = (x + (x >> 4)) & 0x0f0f'0f0f'0f0f'0f0f; - return (x * 0x0101'0101'0101'0101) >> 56; - } - } -} - -#if WJR_HAS_BUILTIN(POPCOUNT) - -template -WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - if constexpr (nd < 32) { - return builtin_popcount(static_cast(x)); - } else { - if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_popcount(x); - } else if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_popcountl(x); - } - if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_popcountll(x); - } else { - static_assert(nd <= 64, "not support yet"); - } - } -} - -#endif // WJR_HAS_BUILTIN(POPCOUNT) - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount_impl(T x) noexcept { - if (WJR_BUILTIN_CONSTANT_P_TRUE(is_zero_or_single_bit(x))) { - return x != 0; - } - -#if WJR_HAS_BUILTIN(POPCOUNT) - if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) { - return fallback_popcount(x); - } - - return builtin_popcount(x); -#else - return fallback_popcount(x); -#endif -} - -template )> -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int popcount(T x) noexcept { - const int ret = popcount_impl(x); - WJR_ASSUME(0 <= ret && ret <= std::numeric_limits::digits); - return ret; -} - -} // namespace wjr - -#endif // WJR_MATH_POPCOUNT_HPP__ - -#if WJR_HAS_BUILTIN(__builtin_clz) -#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF -#elif defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF_VAR(2) -#endif - -#if WJR_HAS_BUILTIN(CLZ) == 2 -// Already included -#endif - -namespace wjr { - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_clz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - - if constexpr (nd >= 16) { - x |= (x >> 8); - } - - if constexpr (nd >= 32) { - x |= (x >> 16); - } - - if constexpr (nd >= 64) { - x |= (x >> 32); - } - - return fallback_popcount(~x); -} - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_clz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - -#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) - if constexpr (nd >= 32) { -#endif - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - - if constexpr (nd >= 16) { - x |= (x >> 8); - } - - if constexpr (nd >= 32) { - x |= (x >> 16); - } - - if constexpr (nd >= 64) { - x |= (x >> 32); - } -#if !(WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT)) - } -#endif - -#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT) - 
return popcount(~x); -#else - if constexpr (nd < 32) { - return fallback_clz(static_cast(x)) - (32 - nd); - } else { - ++x; - - if constexpr (nd <= 32) { - return math_detail::de_bruijn32.getr(x); - } else if constexpr (nd <= 64) { - return math_detail::de_bruijn64.getr(x); - } else { - static_assert(nd <= 64, "not support yet"); - } - } -#endif -} - -#if WJR_HAS_BUILTIN(CLZ) - -template -WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - if constexpr (nd < 32) { - return builtin_clz(static_cast(x)) - (32 - nd); - } else { -#if WJR_HAS_BUILTIN(CLZ) == 1 - if constexpr (nd <= std::numeric_limits::digits) { - constexpr auto delta = std::numeric_limits::digits - nd; - return __builtin_clz(static_cast(x)) - delta; - } else if constexpr (nd <= std::numeric_limits::digits) { - constexpr auto delta = std::numeric_limits::digits - nd; - return __builtin_clzl(static_cast(x)) - delta; - } else if constexpr (nd <= std::numeric_limits::digits) { - constexpr auto delta = std::numeric_limits::digits - nd; - return __builtin_clzll(static_cast(x)) - delta; - } else { - static_assert(nd <= 64, "not supported yet"); - } -#else - if constexpr (nd == 32) { - unsigned long result; - (void)_BitScanReverse(&result, x); - return 31 - result; - } else { - unsigned long result; - (void)_BitScanReverse64(&result, x); - return 63 - result; - } -#endif - } -} - -#endif - -/** - * @brief Fast count leading zeros - * - * @tparam T Must be an unsigned integral type - */ -template )> -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int clz(T x) noexcept { -#if WJR_HAS_BUILTIN(CLZ) - if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) { - return fallback_clz(x); - } - - return builtin_clz(x); -#else - return fallback_clz(x); -#endif -} - -} // namespace wjr - -#endif // WJR_MATH_CLZ_HPP__ -#ifndef WJR_MATH_CTZ_HPP__ -#define WJR_MATH_CTZ_HPP__ - -// Already included -// Already included - -#if WJR_HAS_BUILTIN(__builtin_ctz) -#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF -#elif defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF_VAR(2) -#endif - -#if WJR_HAS_BUILTIN(CTZ) == 2 -// Already included -#endif - -namespace wjr { - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR int constexpr_ctz(T x) noexcept { - return fallback_popcount(lowbit(x) - 1); -} - -template -WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int fallback_ctz(T x) noexcept { -#if WJR_HAS_BUILTIN(POPCOUNT) && WJR_HAS_SIMD(POPCNT) - return popcount(lowbit(x) - 1); -#else - constexpr auto nd = std::numeric_limits::digits; - - if constexpr (nd < 32) { - return fallback_ctz(static_cast(x)); - } else { - x = lowbit(x); - - if constexpr (nd <= 32) { - return math_detail::de_bruijn32.get(x); - } else if constexpr (nd <= 64) { - return math_detail::de_bruijn64.get(x); - } else { - static_assert(nd <= 64, "not support yet"); - } - } -#endif // -} - -#if WJR_HAS_BUILTIN(CTZ) - -template -WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept { - constexpr auto nd = std::numeric_limits::digits; - - if constexpr (nd < 32) { - return builtin_ctz(static_cast(x)); - } else { -#if WJR_HAS_BUILTIN(CTZ) == 1 - if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_ctz(static_cast(x)); - } else if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_ctzl(static_cast(x)); - } else if constexpr (nd <= std::numeric_limits::digits) { - return __builtin_ctzll(static_cast(x)); - } else { - static_assert(nd <= 64, "not supported yet"); - } -#else - if constexpr (nd == 32) { - unsigned 
long result;
- (void)_BitScanForward(&result, x);
- return result;
- } else {
- unsigned long result;
- (void)_BitScanForward64(&result, x);
- return result;
- }
-#endif
- }
-}
-
-#endif
-
-/**
- * @brief Fast count trailing zeros
- *
- * @details Very fast even on non-optimized platforms by using a De Bruijn sequence. \n
- * Try __builtin_clz if available, otherwise fallback to a portable implementation. \n
- * In fallback_clz, use popcount and lowbit if POPCOUNT and POPCNT are available, make
- * sure popcount is fast. \n
- * Then use De Bruijn sequence, just a bit slower than popcount + lowbit.
- *
- * @tparam T Must be an unsigned integral type
- */
-template )>
-WJR_CONST WJR_INTRINSIC_CONSTEXPR20 int ctz(T x) noexcept {
-#if WJR_HAS_BUILTIN(CTZ)
- if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(x)) {
- return fallback_ctz(x);
- }
-
- return builtin_ctz(x);
-#else
- return fallback_ctz(x);
-#endif
-}
-
-} // namespace wjr
-
-#endif // WJR_MATH_CTZ_HPP__
// Already included

namespace wjr {
@@ -17895,7 +18150,10 @@ enum class chars_format : uint8_t {
 scientific = 0x01,
 fixed = 0x02,
 hex = 0x04,
- general = fixed | scientific
+ general = fixed | scientific,
+ // only used in integral_constant
+ __json_format = 0x08,
+ json = general | __json_format,
};

template
@@ -18080,7 +18338,7 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src0, const T *src1,

#define WJR_REGISTER_FIND_NOT_N_RET(index) index

- WJR_REGISTER_NORMAL_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION(
 n, WJR_REGISTER_FIND_NOT_N_2, WJR_REGISTER_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2),
 WJR_REGISTER_FIND_NOT_N_ADVNCE, const auto __src0 = src0,
 WJR_REGISTER_FIND_NOT_N_RET);
@@ -18202,7 +18460,7 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src, T val, size_t n) noexcept
 const auto y4 = broadcast<__m128i_t, __m256i_t>(y2);
#endif

- WJR_REGISTER_NORMAL_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION(
 n, WJR_REGISTER_FIND_NOT_N_2, WJR_REGISTER_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2),
 WJR_REGISTER_FIND_NOT_N_ADVANCE, const auto __src = src,
 WJR_REGISTER_FIND_NOT_N_RET);
@@ -18328,7 +18586,7 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src0, const T *src1,

#define WJR_REGISTER_REVERSE_FIND_NOT_N_RET(index) 0

- WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION(
 n, WJR_REGISTER_REVERSE_FIND_NOT_N_2, WJR_REGISTER_REVERSE_FIND_NOT_N_4,
 WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE, ,
 WJR_REGISTER_REVERSE_FIND_NOT_N_RET);
@@ -18451,7 +18709,7 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src, T val, size_t n)
 const auto y4 = broadcast<__m128i_t, __m256i_t>(y2);
#endif

- WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION(
+ WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION(
 n, WJR_REGISTER_REVERSE_FIND_NOT_N_2, WJR_REGISTER_REVERSE_FIND_NOT_N_4,
 WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE, ,
 WJR_REGISTER_REVERSE_FIND_NOT_N_RET);
@@ -25188,45 +25446,55 @@ constexpr int fallback_count_digits10(UnsignedValue n) noexcept {
 return count + 3;
}

-inline int builtin_count_digits10_u32(uint32_t n) noexcept {
+namespace charconv_detail {
+
#define WJR_INC(T) (((sizeof(#T) - 1ull) << 32) - T)
- static constexpr uint64_t table[] = {
- WJR_INC(0), WJR_INC(0), WJR_INC(0), // 8
- WJR_INC(10), WJR_INC(10), WJR_INC(10), // 64
- WJR_INC(100), WJR_INC(100), WJR_INC(100), // 512
- WJR_INC(1000), WJR_INC(1000), WJR_INC(1000), // 4096
- WJR_INC(10000), WJR_INC(10000), WJR_INC(10000), // 32k
- WJR_INC(100000), WJR_INC(100000), WJR_INC(100000), // 256k
- 
WJR_INC(1000000), WJR_INC(1000000), WJR_INC(1000000), // 2048k - WJR_INC(10000000), WJR_INC(10000000), WJR_INC(10000000), // 16M - WJR_INC(100000000), WJR_INC(100000000), WJR_INC(100000000), // 128M - WJR_INC(1000000000), WJR_INC(1000000000), WJR_INC(1000000000), // 1024M - WJR_INC(1000000000), WJR_INC(1000000000) // 4B - }; - const auto inc = table[clz(n | 1) ^ 31]; - return static_cast((n + inc) >> 32); + +static constexpr uint64_t __count_digits10_u32_table[] = { + WJR_INC(0), WJR_INC(0), WJR_INC(0), // 8 + WJR_INC(10), WJR_INC(10), WJR_INC(10), // 64 + WJR_INC(100), WJR_INC(100), WJR_INC(100), // 512 + WJR_INC(1000), WJR_INC(1000), WJR_INC(1000), // 4096 + WJR_INC(10000), WJR_INC(10000), WJR_INC(10000), // 32k + WJR_INC(100000), WJR_INC(100000), WJR_INC(100000), // 256k + WJR_INC(1000000), WJR_INC(1000000), WJR_INC(1000000), // 2048k + WJR_INC(10000000), WJR_INC(10000000), WJR_INC(10000000), // 16M + WJR_INC(100000000), WJR_INC(100000000), WJR_INC(100000000), // 128M + WJR_INC(1000000000), WJR_INC(1000000000), WJR_INC(1000000000), // 1024M + WJR_INC(1000000000), WJR_INC(1000000000) // 4B +}; + #undef WJR_INC -} -inline int builtin_count_digits10_u64(uint64_t n) noexcept { #define WJR_POWERS_OF_10(factor) \ factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \ (factor)*1000000, (factor)*10000000, (factor)*100000000, (factor)*1000000000 - static constexpr uint8_t bsr2log10[] = { - 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, - 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, - 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, - 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; - const auto t = bsr2log10[clz(n | 1) ^ 63]; - static constexpr const uint64_t zero_or_powers_of_10[] = { - 0, 0, WJR_POWERS_OF_10(1U), WJR_POWERS_OF_10(1000000000ull), - 10000000000000000000ull}; - return t - (n < zero_or_powers_of_10[t]); + +static constexpr uint8_t __count_digits10_u64_bsr2log10[] = { + 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, + 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, + 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, + 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; + +static constexpr const uint64_t __count_digits10_u64_zero_or_powers_of_10[] = { + 0, 0, WJR_POWERS_OF_10(1U), WJR_POWERS_OF_10(1000000000ull), 10000000000000000000ull}; + #undef WJR_POWERS_OF_10 + +} // namespace charconv_detail + +WJR_INTRINSIC_CONSTEXPR20 int builtin_count_digits10_u32(uint32_t n) noexcept { + const auto inc = charconv_detail::__count_digits10_u32_table[clz(n | 1) ^ 31]; + return static_cast((n + inc) >> 32); +} + +WJR_INTRINSIC_CONSTEXPR20 int builtin_count_digits10_u64(uint64_t n) noexcept { + const auto t = charconv_detail::__count_digits10_u64_bsr2log10[clz(n | 1) ^ 63]; + return t - (n < charconv_detail::__count_digits10_u64_zero_or_powers_of_10[t]); } template -WJR_CONSTEXPR20 int count_digits10_impl(T n) noexcept { +WJR_INTRINSIC_CONSTEXPR20 int count_digits10_impl(T n) noexcept { if (is_constant_evaluated() || WJR_BUILTIN_CONSTANT_P(n)) { return fallback_count_digits10(n); } @@ -27163,7 +27431,7 @@ WJR_PURE int large_builtin_compare_n(const T *src0, const T *src1, size_t n) noe WJR_ASSUME(n > 2); - WJR_REGISTER_NORMAL_SIMD_FUNCTION( + WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION( n, WJR_REGISTER_COMPARE_NOT_N_2, WJR_REGISTER_COMPARE_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_COMPARE_NOT_N_ADVANCE, , WJR_REGISTER_COMPARE_NOT_N_RET); @@ -27318,7 +27586,7 @@ WJR_PURE int large_builtin_reverse_compare_n(const 
T *src0, const T *src1, WJR_ASSUME(n > 2); - WJR_REGISTER_NORMAL_REVERSE_SIMD_FUNCTION( + WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION( n, WJR_REGISTER_REVERSE_COMPARE_NOT_N_2, WJR_REGISTER_REVERSE_COMPARE_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE, , WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET); @@ -36599,6 +36867,44 @@ class reader { namespace wjr::fastfloat { +template +struct default_writer { + using float_type = T; + using support_integral = std::false_type; + + WJR_INTRINSIC_CONSTEXPR T &get_float() noexcept { return value; } + + T &value; +}; + +template +WJR_NOINLINE from_chars_result<> __from_chars_impl(const char *first, const char *last, + Writer wr, Op options) noexcept; + +extern template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +extern template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +extern template from_chars_result<> +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; + +extern template from_chars_result<> +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; + /** * This function parses the character sequence [first,last) for a number. It parses * floating-point numbers expecting a locale-indepent format equivalent to what is used by @@ -36622,16 +36928,30 @@ namespace wjr::fastfloat { * point and scientific notation respectively. The default is * `fast_float::chars_format::general` which allows both `fixed` and `scientific`. */ -template +template +from_chars_result<> from_chars(const char *first, const char *last, float &value, + integral_constant fmt = {}) noexcept { + return __from_chars_impl(first, last, default_writer{value}, fmt); +} + +template +from_chars_result<> from_chars(const char *first, const char *last, double &value, + integral_constant fmt = {}) noexcept { + return __from_chars_impl(first, last, default_writer{value}, fmt); +} + +template )> from_chars_result<> from_chars(const char *first, const char *last, T &value, - chars_format fmt = chars_format::general) noexcept; + chars_format fmt) noexcept { + if (WJR_BUILTIN_CONSTANT_P(fmt)) { + if (fmt == chars_format::general) { + return from_chars(first, last, value); + } + } -/** - * Like from_chars, but accepts an `options` argument to govern number parsing. - */ -template -from_chars_result<> from_chars_advanced(const char *first, const char *last, T &value, - chars_format options) noexcept; + WJR_ASSERT(!(to_underlying(fmt) & to_underlying(chars_format::__json_format))); + return __from_chars_impl(first, last, default_writer{value}, fmt); +} // Compares two ASCII strings in a case insensitive manner. WJR_PURE WJR_INTRINSIC_CONSTEXPR bool @@ -37813,6 +38133,89 @@ WJR_CONST WJR_INTRINSIC_INLINE adjusted_mantissa compute_float(int64_t q, return answer; } +/// @brief special case of compute_float when q = 0. +template +WJR_CONST WJR_INTRINSIC_INLINE adjusted_mantissa compute_integer(uint64_t w) noexcept { + adjusted_mantissa answer; + // We want the most significant bit of i to be 1. Shift if needed. + const int lz = clz(w); + w <<= lz; + + // The required precision is binary::mantissa_explicit_bits() + 3 because + // 1. We need the implicit bit + // 2. We need an extra bit for rounding purposes + // 3. 
We might lose a bit due to the "upperbit" routine (result too small, requiring a
+ // shift)
+
+ const uint128_t product =
+ compute_product_approximation(0, w);
+ // The "compute_product_approximation" function can be slightly slower than a
+ // branchless approach: uint128_t product = compute_product(q, w); but in practice, we
+ // can win big with the compute_product_approximation if its additional branch is
+ // easily predicted. Which is best is data specific.
+ const int upperbit = int(product.high >> 63);
+
+ answer.mantissa =
+ product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3);
+
+ answer.power2 = int32_t(63 + upperbit - lz - binary::minimum_exponent());
+ if (answer.power2 <= 0) { // we have a subnormal?
+ // Here we have that answer.power2 <= 0 so -answer.power2 >= 0
+ if (-answer.power2 + 1 >= 64) { // if we have more than 64 bits below the minimum
+ // exponent, you have a zero for sure.
+ answer.power2 = 0;
+ answer.mantissa = 0;
+ // result should be zero
+ return answer;
+ }
+ // next line is safe because -answer.power2 + 1 < 64
+ answer.mantissa >>= -answer.power2 + 1;
+ // Thankfully, we can't have both "round-to-even" and subnormals because
+ // "round-to-even" only occurs for powers close to 0.
+ answer.mantissa += (answer.mantissa & 1); // round up
+ answer.mantissa >>= 1;
+ // There is a weird scenario where we don't have a subnormal but just barely.
+ // Suppose we start with 2.2250738585072013e-308, we end up
+ // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
+ // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
+ // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
+ // subnormal, but we can only know this after rounding.
+ // So we only declare a subnormal if we are smaller than the threshold.
+ answer.power2 =
+ (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) ? 0 : 1;
+ return answer;
+ }
+
+ // usually, we round *up*, but if we fall right in between and we have an
+ // even basis, we need to round down
+ // We are only concerned with the cases where 5**q fits in a single 64-bit word.
+ if (product.low <= 1 &&
+ (answer.mantissa & 3) == 1) { // we may fall between two floats!
+ // To be in-between two floats we need that in doing
+ // answer.mantissa = product.high >> (upperbit + 64 -
+ // binary::mantissa_explicit_bits() - 3);
+ // ... we dropped out only zeroes. But if this happened, then we can go back!!!
+ if ((answer.mantissa << (upperbit + 64 - binary::mantissa_explicit_bits() - 3)) ==
+ product.high) {
+ answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up
+ }
+ }
+
+ answer.mantissa += (answer.mantissa & 1); // round up
+ answer.mantissa >>= 1;
+ if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) {
+ answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits());
+ answer.power2++; // undo previous addition
+ }
+
+ answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits());
+ if (answer.power2 >= binary::infinite_power()) { // infinity
+ answer.power2 = binary::infinite_power();
+ answer.mantissa = 0;
+ }
+ return answer;
+}
+
// 1e0 to 1e19
constexpr static uint64_t powers_of_ten_uint64[] = {1UL,
 10UL,
@@ -38251,8 +38654,8 @@ inline adjusted_mantissa negative_digit_comp(biginteger &bigmant, adjusted_manti

 // get the value of `b`, rounded down, and get a biginteger representation of b+h
 adjusted_mantissa am_b = am;
- // gcc7 buf: use a lambda to remove the noexcept qualifier bug with -Wnoexcept-type. 
+ // gcc7 bug: use a lambda to remove the noexcept qualifier bug with
+ // -Wnoexcept-type.
 round(am_b, [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); });
 T b;
 to_float(false, am_b, b);
@@ -38327,8 +38731,8 @@ from_chars_result<> parse_infnan(const char *first, const char *last, T &value)
 answer.ptr = (first += 3);
 value = minusSign ? -std::numeric_limits::quiet_NaN()
 : std::numeric_limits::quiet_NaN();
- // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, C11 7.20.1.3.3. At
- // least MSVC produces nan(ind) and nan(snan).
+ // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7,
+ // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan).
 if (first != last && *first == '(') {
 for (const char *ptr = first + 1; ptr != last; ++ptr) {
 if (*ptr == ')') {
@@ -38426,21 +38830,23 @@ WJR_INTRINSIC_INLINE bool rounds_to_nearest() noexcept {

struct parsed_number_string {
 int64_t exponent{0};
- uint64_t mantissa{0};
 bool negative{false};
 // contains the range of the significant digits
 span integer{}; // non-nullable
 span fraction{}; // nullable
};

-template
-WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const char *last,
- T &value,
- chars_format options) noexcept {
- static_assert(std::is_same::value || std::is_same::value,
- "only float and double are supported");
+template
+from_chars_result<> __from_chars_impl(const char *first, const char *last, Writer wr,
+ Op options) noexcept {
+ static_assert(!std::is_reference_v, "");
+
+ using T = typename Writer::float_type;
+ constexpr bool is_support_integral = Writer::support_integral::value;
+ constexpr bool is_constant_options = !std::is_same_v;

 from_chars_result<> answer;
+
 if (WJR_UNLIKELY(first == last)) {
 answer.ec = std::errc::invalid_argument;
 answer.ptr = first;
@@ -38448,7 +38854,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 }

 const char *p = first;
- const auto fmt = to_underlying(options);
+ const auto fmt = to_underlying(static_cast(options));

 parsed_number_string pns;
 pns.negative = (*p == '-');
@@ -38462,7 +38868,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 }

 const char *const start_digits = p;
- uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
+ uint64_t uval = 0; // an unsigned int avoids signed overflows (which are bad)

 const char *end_of_integer_part;
 int64_t digit_count;
@@ -38477,13 +38883,21 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 do {
 uint8_t ch = *p;
 if (!__try_match(ch)) { // This situation rarely occurs
+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ answer.ec = std::errc{};
+ answer.ptr = first;
+ return answer;
+ }
+ }
+
 break;
 }

 do {
 // a multiplication by 10 is cheaper than an arbitrary integer
 // multiplication
- i = 10 * i + ch; // might overflow, we will handle the overflow later
+ uval = 10 * uval + ch; // might overflow, we will handle the overflow later

 if (++p == last) {
 goto INTEGER_AT_END;
@@ -38498,6 +38912,15 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 digit_count = static_cast(p - start_digits);
 pns.integer = span(start_digits, static_cast(digit_count));

+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ // at least 1 digit in integer part, without leading zeros
+ if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) {
+ return answer;
+ }
+ }
+ }
+ 
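+ // In json mode the check above enforces the RFC 8259 integer grammar:
+ // "0", "7", and "123" are accepted, while an empty integer part (".5")
+ // and a redundant leading zero ("0123") are rejected.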
if (*p != '.') {
 exponent = 0;
 if (*p == 'e' || *p == 'E') {
@@ -38512,24 +38935,37 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 // can occur at most twice without overflowing, but let it occur more, since
 // for integers with many digits, digit parsing is the primary bottleneck.
 while ((std::distance(p, last) >= 8) && is_made_of_eight_digits_fast(p)) {
- i = i * 100000000 +
- parse_eight_digits_unrolled(
- p); // in rare cases, this will overflow, but that's ok
+ uval = uval * 100000000 +
+ parse_eight_digits_unrolled(
+ p); // in rare cases, this will overflow, but that's ok
 p += 8;
 }
 while ((p != last) && is_integer(*p)) {
 const auto digit = uint32_t(*p - '0');
 ++p;
- i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+ uval = uval * 10 + digit; // in rare cases, this will overflow, but that's ok
 }
 exponent = before - p;
 pns.fraction = span(before, size_t(p - before));
 digit_count -= exponent;
- if (WJR_UNLIKELY(digit_count == 0)) {
- return detail::parse_infnan(first, last, value);
+ auto &float_v = wr.get_float();
+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ if (WJR_UNLIKELY(exponent == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
 }
 } while (0);
@@ -38550,7 +38986,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 if ((p == last) || !is_integer(*p)) {
 if (!bool(fmt & to_underlying(chars_format::fixed))) {
 // We are in error.
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 // Otherwise, we will be ignoring the 'e'.
 p = location_of_e;
@@ -38571,7 +39007,7 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 // If it scientific and not fixed, we have to bail out.
 if (bool(fmt & to_underlying(chars_format::scientific)) &&
 !bool(fmt & to_underlying(chars_format::fixed))) {
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 }
@@ -38603,38 +39039,39 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch
 // Let us start again, this time, avoiding overflows.
 // We don't need to check if is_integer, since we use the
 // pre-tokenized spans from above.
- i = 0;
+ uval = 0;
 p = pns.integer.data();
 const char *int_end = p + pns.integer.size();
 constexpr uint64_t minimal_nineteen_digit_integer = 1000000000000000000ull;
- while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
- i = i * 10 + uint64_t(*p - '0');
+ while ((uval < minimal_nineteen_digit_integer) && (p != int_end)) {
+ uval = uval * 10 + uint64_t(*p - '0');
 ++p;
 }
- if (i >= minimal_nineteen_digit_integer) { // We have a big integers
+ if (uval >= minimal_nineteen_digit_integer) { // We have a big integer
 exponent = end_of_integer_part - p + exp_number;
 } else { // We have a value with a fractional component. 
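+ // The integer part alone held fewer than 19 significant digits, so keep
+ // filling uval from the fraction; every fraction digit consumed here
+ // lowers the final decimal exponent by one.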
p = pns.fraction.data(); const char *frac_end = p + pns.fraction.size(); - while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { - i = i * 10 + uint64_t(*p - '0'); + while ((uval < minimal_nineteen_digit_integer) && (p != frac_end)) { + uval = uval * 10 + uint64_t(*p - '0'); ++p; } exponent = pns.fraction.data() - p + exp_number; } - // We have now corrected both exponent and i, to a truncated value + // We have now corrected both exponent and uval, to a truncated value } } pns.exponent = exponent; - pns.mantissa = i; + + T &float_v = wr.get_float(); // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode // selected on the thread. - // We proceed optimistically, assuming that detail::rounds_to_nearest() returns - // true. + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. if (binary_format::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format::max_exponent_fast_path() && !too_many_digits) { @@ -38648,17 +39085,17 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch if (detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(pns.mantissa); + if (uval <= binary_format::max_mantissa_fast_path()) { + float_v = T(uval); if (pns.exponent < 0) { - value = - value / binary_format::exact_power_of_ten(-pns.exponent); + float_v = + float_v / binary_format::exact_power_of_ten(-pns.exponent); } else { - value = - value * binary_format::exact_power_of_ten(pns.exponent); + float_v = + float_v * binary_format::exact_power_of_ten(pns.exponent); } if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } @@ -38667,46 +39104,44 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's // proposal if (pns.exponent >= 0 && - pns.mantissa <= - binary_format::max_mantissa_fast_path(pns.exponent)) { + uval <= binary_format::max_mantissa_fast_path(pns.exponent)) { #if defined(__clang__) // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa) * - binary_format::exact_power_of_ten(pns.exponent); + float_v = + T(uval) * binary_format::exact_power_of_ten(pns.exponent); if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } } } - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (too_many_digits && am.power2 >= 0) { - if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } - // If we called compute_float>(pns.exponent, pns.mantissa) and we - // have an invalid power (am.power2 < 0), then we need to go the long way around - // again. This is very uncommon. + // If we called compute_float>(pns.exponent, uval) + // and we have an invalid power (am.power2 < 0), then we need to go the long + // way around again. This is very uncommon. 
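+ // In that case digit_comp() settles the rounding with exact big-integer
+ // arithmetic on the saved integer/fraction spans instead of the 128-bit
+ // product approximation.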
if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. - if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -38719,6 +39154,15 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch digit_count = static_cast(p - start_digits); pns.integer = span(start_digits, static_cast(digit_count)); + if constexpr (is_constant_options) { + if (fmt & to_underlying(chars_format::__json_format)) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) { + return answer; + } + } + } + INTEGER: answer.ec = std::errc(); // be optimistic answer.ptr = p; @@ -38742,41 +39186,53 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch if (digit_count > 19) { p = start; - i = __from_chars_unroll_16<10>(reinterpret_cast(p), - char_converter); + uval = __from_chars_unroll_16<10>(reinterpret_cast(p), + char_converter); p += 16; - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); exponent = end_of_integer_part - p; pns.exponent = exponent; - pns.mantissa = i; WJR_ASSUME(exponent >= 0); - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + if constexpr (is_support_integral) { + constexpr uint64_t max_quot = std::numeric_limits::max() / 10; + constexpr uint32_t max_rem = std::numeric_limits::max() % 10; + + if (!pns.negative && digit_count == 20 && + (uval < max_quot || + (uval == max_quot && static_cast(*p - '0') <= max_rem))) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; + return answer; + } + } + + T &float_v = wr.get_float(); + + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (am.power2 >= 0) { - if (am != - compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } - // If we called compute_float>(pns.exponent, pns.mantissa) - // and we have an invalid power (am.power2 < 0), then we need to go the long - // way around again. This is very uncommon. + // If we called compute_float>(pns.exponent, + // uval) and we have an invalid power (am.power2 < 0), then we + // need to go the long way around again. This is very uncommon. if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -38786,60 +39242,44 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch } pns.exponent = 0; - pns.mantissa = i; - // Unfortunately, the conventional Clinger's fast path is only possible - // when the system rounds to the nearest float. - // - // We expect the next branch to almost always be selected. - // We could check it first (before the previous branch), but - // there might be performance advantages at having the check - // be last. - if (detail::rounds_to_nearest()) { - // We have that fegetround() == FE_TONEAREST. - // Next is Clinger's fast path. - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(pns.mantissa); - if (pns.negative) { - value = -value; - } + if constexpr (is_support_integral) { + if (!pns.negative) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; return answer; - } - } else { - // We do not have that fegetround() == FE_TONEAREST. - // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's - // proposal - if (pns.mantissa <= binary_format::max_mantissa_fast_path(0)) { -#if defined(__clang__) - // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); - return answer; - } -#endif - value = T(pns.mantissa); - if (pns.negative) { - value = -value; - } + } else if (uval <= static_cast(-std::numeric_limits::min())) { + int64_t &i64_v = wr.get_i64(); + i64_v = static_cast(-uval); return answer; } } - adjusted_mantissa am = compute_float>(0, pns.mantissa); + auto &float_v = wr.get_float(); - // If we called compute_float>(pns.exponent, pns.mantissa) and we - // have an invalid power (am.power2 < 0), then we need to go the long way around - // again. This is very uncommon. - if (am.power2 < 0) { - am.power2 -= invalid_am_bias; + if (WJR_LIKELY(uval <= binary_format::max_mantissa_fast_path())) { +#if defined(__clang__) + // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); + return answer; + } +#endif - const int32_t sci_exp = scientific_exponent(0, pns.mantissa); - am = digit_comp(am, pns.integer, pns.fraction, sci_exp); + float_v = T(uval); + if (pns.negative) { + float_v = -float_v; + } + + return answer; } - to_float(pns.negative, am, value); + adjusted_mantissa am = compute_integer>(uval); + WJR_ASSERT_ASSUME(am.power2 >= 0); + + to_float(pns.negative, am, float_v); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -38847,12 +39287,6 @@ WJR_NOINLINE from_chars_result<> from_chars_advanced(const char *first, const ch return answer; } -template -from_chars_result<> from_chars(const char *first, const char *last, T &value, - chars_format fmt /*= chars_format::general*/) noexcept { - return from_chars_advanced(first, last, value, fmt); -} - } // namespace wjr::fastfloat namespace wjr { diff --git a/include/wjr/format/fastfloat.hpp b/include/wjr/format/fastfloat.hpp index d23eaf5a..cabceb99 100644 --- a/include/wjr/format/fastfloat.hpp +++ b/include/wjr/format/fastfloat.hpp @@ -6,27 +6,43 @@ namespace wjr::fastfloat { -template +template +struct default_writer { + using float_type = T; + using support_integral = std::false_type; + + WJR_INTRINSIC_CONSTEXPR T &get_float() noexcept { return value; } + + T &value; +}; + +template WJR_NOINLINE from_chars_result<> __from_chars_impl(const char *first, const char *last, - T &value, Op options) noexcept; + Writer wr, Op options) noexcept; extern template from_chars_result<> -__from_chars_impl>( - const char *first, const char *last, float &value, +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, integral_constant options) noexcept; extern template from_chars_result<> -__from_chars_impl>( - const char *first, const char *last, double &value, +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, integral_constant options) noexcept; extern template from_chars_result<> -__from_chars_impl(const char *first, const char *last, float &value, - chars_format fmt) noexcept; +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; extern template from_chars_result<> -__from_chars_impl(const char *first, const char *last, - double &value, chars_format fmt) noexcept; +__from_chars_impl, chars_format>(const char *first, + const char *last, + default_writer wr, + chars_format fmt) noexcept; /** * This function parses the character sequence [first,last) for a number. It parses @@ -54,13 +70,13 @@ __from_chars_impl(const char *first, const char *last, template from_chars_result<> from_chars(const char *first, const char *last, float &value, integral_constant fmt = {}) noexcept { - return __from_chars_impl(first, last, value, fmt); + return __from_chars_impl(first, last, default_writer{value}, fmt); } template from_chars_result<> from_chars(const char *first, const char *last, double &value, integral_constant fmt = {}) noexcept { - return __from_chars_impl(first, last, value, fmt); + return __from_chars_impl(first, last, default_writer{value}, fmt); } template )> @@ -69,13 +85,11 @@ from_chars_result<> from_chars(const char *first, const char *last, T &value, if (WJR_BUILTIN_CONSTANT_P(fmt)) { if (fmt == chars_format::general) { return from_chars(first, last, value); - } else if (fmt == chars_format::json) { - return from_chars(first, last, value, - integral_constant{}); } } - return __from_chars_impl(first, last, value, fmt); + WJR_ASSERT(!(to_underlying(fmt) & to_underlying(chars_format::__json_format))); + return __from_chars_impl(first, last, default_writer{value}, fmt); } // Compares two ASCII strings in a case insensitive manner. 
@@ -1955,17 +1969,21 @@ WJR_INTRINSIC_INLINE bool rounds_to_nearest() noexcept { struct parsed_number_string { int64_t exponent{0}; - uint64_t mantissa{0}; bool negative{false}; // contains the range of the significant digits span integer{}; // non-nullable span fraction{}; // nullable }; -template -from_chars_result<> __from_chars_impl(const char *first, const char *last, T &value, +template +from_chars_result<> __from_chars_impl(const char *first, const char *last, Writer wr, Op options) noexcept { + static_assert(!std::is_reference_v, ""); + + using T = typename Writer::float_type; + constexpr bool is_support_integral = Writer::support_integral::value; constexpr bool is_constant_options = !std::is_same_v; + from_chars_result<> answer; if (WJR_UNLIKELY(first == last)) { @@ -1977,10 +1995,6 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va const char *p = first; const auto fmt = to_underlying(static_cast(options)); - if constexpr (!is_constant_options) { - WJR_ASSERT(!(fmt & to_underlying(chars_format::__json_format))); - } - parsed_number_string pns; pns.negative = (*p == '-'); @@ -1993,7 +2007,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va } const char *const start_digits = p; - uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) + uint64_t uval = 0; // an unsigned int avoids signed overflows (which are bad) const char *end_of_integer_part; int64_t digit_count; @@ -2022,7 +2036,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va do { // a multiplication by 10 is cheaper than an arbitrary integer // multiplication - i = 10 * i + ch; // might overflow, we will handle the overflow later + uval = 10 * uval + ch; // might overflow, we will handle the overflow later if (++p == last) { goto INTEGER_AT_END; @@ -2037,6 +2051,15 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va digit_count = static_cast(p - start_digits); pns.integer = span(start_digits, static_cast(digit_count)); + if constexpr (is_constant_options) { + if (fmt & to_underlying(chars_format::__json_format)) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) { + return answer; + } + } + } + if (*p != '.') { exponent = 0; if (*p == 'e' || *p == 'E') { @@ -2051,24 +2074,37 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va // can occur at most twice without overflowing, but let it occur more, since // for integers with many digits, digit parsing is the primary bottleneck. 
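+ // is_made_of_eight_digits_fast screens eight consecutive bytes at once and
+ // parse_eight_digits_unrolled converts them with word-sized (SWAR)
+ // arithmetic rather than eight serial multiply-adds.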
while ((std::distance(p, last) >= 8) && is_made_of_eight_digits_fast(p)) {
- i = i * 100000000 +
- parse_eight_digits_unrolled(
- p); // in rare cases, this will overflow, but that's ok
+ uval = uval * 100000000 +
+ parse_eight_digits_unrolled(
+ p); // in rare cases, this will overflow, but that's ok
 p += 8;
 }
 while ((p != last) && is_integer(*p)) {
 const auto digit = uint32_t(*p - '0');
 ++p;
- i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+ uval = uval * 10 + digit; // in rare cases, this will overflow, but that's ok
 }
 exponent = before - p;
 pns.fraction = span(before, size_t(p - before));
 digit_count -= exponent;
- if (WJR_UNLIKELY(digit_count == 0)) {
- return detail::parse_infnan(first, last, value);
+ auto &float_v = wr.get_float();
+ if constexpr (is_constant_options) {
+ if (fmt & to_underlying(chars_format::__json_format)) {
+ if (WJR_UNLIKELY(exponent == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
+ }
+ } else {
+ if (WJR_UNLIKELY(digit_count == 0)) {
+ return detail::parse_infnan(first, last, float_v);
+ }
 }
 } while (0);
@@ -2089,7 +2125,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va
 if ((p == last) || !is_integer(*p)) {
 if (!bool(fmt & to_underlying(chars_format::fixed))) {
 // We are in error.
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 // Otherwise, we will be ignoring the 'e'.
 p = location_of_e;
@@ -2110,7 +2146,7 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va
 // If it scientific and not fixed, we have to bail out.
 if (bool(fmt & to_underlying(chars_format::scientific)) &&
 !bool(fmt & to_underlying(chars_format::fixed))) {
- return detail::parse_infnan(first, last, value);
+ return detail::parse_infnan(first, last, wr.get_float());
 }
 }
@@ -2142,32 +2178,33 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va
 // Let us start again, this time, avoiding overflows.
 // We don't need to check if is_integer, since we use the
 // pre-tokenized spans from above.
- i = 0;
+ uval = 0;
 p = pns.integer.data();
 const char *int_end = p + pns.integer.size();
 constexpr uint64_t minimal_nineteen_digit_integer = 1000000000000000000ull;
- while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
- i = i * 10 + uint64_t(*p - '0');
+ while ((uval < minimal_nineteen_digit_integer) && (p != int_end)) {
+ uval = uval * 10 + uint64_t(*p - '0');
 ++p;
 }
- if (i >= minimal_nineteen_digit_integer) { // We have a big integers
+ if (uval >= minimal_nineteen_digit_integer) { // We have a big integer
 exponent = end_of_integer_part - p + exp_number;
 } else { // We have a value with a fractional component. 
p = pns.fraction.data(); const char *frac_end = p + pns.fraction.size(); - while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { - i = i * 10 + uint64_t(*p - '0'); + while ((uval < minimal_nineteen_digit_integer) && (p != frac_end)) { + uval = uval * 10 + uint64_t(*p - '0'); ++p; } exponent = pns.fraction.data() - p + exp_number; } - // We have now corrected both exponent and i, to a truncated value + // We have now corrected both exponent and uval, to a truncated value } } pns.exponent = exponent; - pns.mantissa = i; + + T &float_v = wr.get_float(); // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode @@ -2187,17 +2224,17 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va if (detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(pns.mantissa); + if (uval <= binary_format::max_mantissa_fast_path()) { + float_v = T(uval); if (pns.exponent < 0) { - value = - value / binary_format::exact_power_of_ten(-pns.exponent); + float_v = + float_v / binary_format::exact_power_of_ten(-pns.exponent); } else { - value = - value * binary_format::exact_power_of_ten(pns.exponent); + float_v = + float_v * binary_format::exact_power_of_ten(pns.exponent); } if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } @@ -2206,46 +2243,44 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's // proposal if (pns.exponent >= 0 && - pns.mantissa <= - binary_format::max_mantissa_fast_path(pns.exponent)) { + uval <= binary_format::max_mantissa_fast_path(pns.exponent)) { #if defined(__clang__) // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa) * - binary_format::exact_power_of_ten(pns.exponent); + float_v = + T(uval) * binary_format::exact_power_of_ten(pns.exponent); if (pns.negative) { - value = -value; + float_v = -float_v; } return answer; } } } - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (too_many_digits && am.power2 >= 0) { - if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } - // If we called compute_float>(pns.exponent, pns.mantissa) + // If we called compute_float>(pns.exponent, uval) // and we have an invalid power (am.power2 < 0), then we need to go the long // way around again. This is very uncommon. if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -2258,6 +2293,15 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va digit_count = static_cast(p - start_digits); pns.integer = span(start_digits, static_cast(digit_count)); + if constexpr (is_constant_options) { + if (fmt & to_underlying(chars_format::__json_format)) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0 || (start_digits[0] == '0' && digit_count > 1)) { + return answer; + } + } + } + INTEGER: answer.ec = std::errc(); // be optimistic answer.ptr = p; @@ -2281,41 +2325,53 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va if (digit_count > 19) { p = start; - i = __from_chars_unroll_16<10>(reinterpret_cast(p), - char_converter); + uval = __from_chars_unroll_16<10>(reinterpret_cast(p), + char_converter); p += 16; - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); - i = i * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); + uval = uval * 10 + char_converter.template from<10>(*p++); exponent = end_of_integer_part - p; pns.exponent = exponent; - pns.mantissa = i; WJR_ASSUME(exponent >= 0); - adjusted_mantissa am = - compute_float>(pns.exponent, pns.mantissa); + if constexpr (is_support_integral) { + constexpr uint64_t max_quot = std::numeric_limits::max() / 10; + constexpr uint32_t max_rem = std::numeric_limits::max() % 10; + + if (!pns.negative && digit_count == 20 && + (uval < max_quot || + (uval == max_quot && static_cast(*p - '0') <= max_rem))) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; + return answer; + } + } + + T &float_v = wr.get_float(); + + adjusted_mantissa am = compute_float>(pns.exponent, uval); if (am.power2 >= 0) { - if (am != - compute_float>(pns.exponent, pns.mantissa + 1)) { - am = compute_error>(pns.exponent, pns.mantissa); + if (am != compute_float>(pns.exponent, uval + 1)) { + am = compute_error>(pns.exponent, uval); } } // If we called compute_float>(pns.exponent, - // pns.mantissa) and we have an invalid power (am.power2 < 0), then we + // uval) and we have an invalid power (am.power2 < 0), then we // need to go the long way around again. This is very uncommon. if (am.power2 < 0) { am.power2 -= invalid_am_bias; - const int32_t sci_exp = scientific_exponent(pns.exponent, pns.mantissa); + const int32_t sci_exp = scientific_exponent(pns.exponent, uval); am = digit_comp(am, pns.integer, pns.fraction, sci_exp); } - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. 
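+// Explicit instantiation definitions matching the extern template
+// declarations in include/wjr/format/fastfloat.hpp, so the out-of-line
+// parser is compiled only once, here.
+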
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } @@ -2325,31 +2381,44 @@ from_chars_result<> __from_chars_impl(const char *first, const char *last, T &va } pns.exponent = 0; - pns.mantissa = i; - if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { + if constexpr (is_support_integral) { + if (!pns.negative) { + uint64_t &u64_v = wr.get_u64(); + u64_v = uval; + return answer; + } else if (uval <= static_cast(-std::numeric_limits::min())) { + int64_t &i64_v = wr.get_i64(); + i64_v = static_cast(-uval); + return answer; + } + } + + auto &float_v = wr.get_float(); + + if (WJR_LIKELY(uval <= binary_format::max_mantissa_fast_path())) { #if defined(__clang__) // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if (pns.mantissa == 0) { - value = pns.negative ? T(-0.) : T(0.); + if (uval == 0) { + float_v = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa); + float_v = T(uval); if (pns.negative) { - value = -value; + float_v = -float_v; } - + return answer; } - adjusted_mantissa am = compute_integer>(pns.mantissa); + adjusted_mantissa am = compute_integer>(uval); WJR_ASSERT_ASSUME(am.power2 >= 0); - to_float(pns.negative, am, value); + to_float(pns.negative, am, float_v); // Test for over/underflow. - if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + if ((uval != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } diff --git a/src/wjr/format/fastfloat.cpp b/src/wjr/format/fastfloat.cpp index 4893a551..f97409de 100644 --- a/src/wjr/format/fastfloat.cpp +++ b/src/wjr/format/fastfloat.cpp @@ -1,4 +1,25 @@ #include -namespace wjr { -} \ No newline at end of file +namespace wjr::fastfloat { + +template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +template from_chars_result<> +__from_chars_impl, + integral_constant>( + const char *first, const char *last, default_writer wr, + integral_constant options) noexcept; + +template from_chars_result<> __from_chars_impl, chars_format>( + const char *first, const char *last, default_writer wr, + chars_format fmt) noexcept; + +template from_chars_result<> __from_chars_impl, chars_format>( + const char *first, const char *last, default_writer wr, + chars_format fmt) noexcept; + +} // namespace wjr::fastfloat \ No newline at end of file