diff --git a/include/wjr/math/mul.hpp b/include/wjr/math/mul.hpp index a4dc350d..75217cf2 100644 --- a/include/wjr/math/mul.hpp +++ b/include/wjr/math/mul.hpp @@ -210,11 +210,15 @@ WJR_INTRINSIC_CONSTEXPR_E T addmul_1(T *dst, const T *src, size_t n, } #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) - if (is_constant_evaluated()) { + if constexpr (sizeof(T) == 8) { + if (is_constant_evaluated()) { + return fallback_addmul_1(dst, src, n, ml); + } + + return asm_addmul_1(dst, src, n, ml); + } else { return fallback_addmul_1(dst, src, n, ml); } - - return asm_addmul_1(dst, src, n, ml); #else return fallback_addmul_1(dst, src, n, ml); #endif @@ -260,6 +264,42 @@ WJR_INTRINSIC_CONSTEXPR_E T submul_1(T *dst, const T *src, size_t n, #endif } +template +WJR_INTRINSIC_CONSTEXPR T fallback_addlsh_n(T *dst, const T *src0, const T *src1, + size_t n, type_identity_t cl) { + T tcl = std::numeric_limits::digits - cl; + T lo = 0, hi = 0; + T o_in = 0, c_in = 0; + + for (size_t i = 0; i < n; ++i) { + lo = src1[i] << cl; + hi = src1[i] >> tcl; + lo = addc(lo, c_in, 0u, c_in); + dst[i] = addc(lo, src0[i], 0u, o_in); + c_in += hi + o_in; + } + + return c_in; +} + +template +WJR_INTRINSIC_CONSTEXPR_E T addlsh_n(T *dst, const T *src0, const T *src1, size_t n, + type_identity_t cl) { + if (WJR_UNLIKELY(cl == 0)) { + return wjr::addc_n(dst, src0, src1, n, 0u); + } + +#if WJR_HAS_BUILTIN(ASM_ADDLSH_N) + if (is_constant_evaluated()) { + return fallback_addlsh_n(dst, src0, src1, n, cl); + } + + return asm_addlsh_n(dst, src0, src1, n, cl); +#else + return fallback_addlsh_n(dst, src0, src1, n, cl); +#endif +} + // preview : // native default threshold of toom-cook-2 diff --git a/include/wjr/preprocessor/preview.hpp b/include/wjr/preprocessor/preview.hpp index 5b93d639..2948cb47 100644 --- a/include/wjr/preprocessor/preview.hpp +++ b/include/wjr/preprocessor/preview.hpp @@ -29,8 +29,18 @@ #endif // #if defined(NDEBUG) -#define WJR_ASSERT_NOMESSAGE_I(expr) WJR_ASSUME(expr) -#define WJR_ASSERT_MESSAGE_I(expr) 
WJR_UNREACHABLE() +#define WJR_ASSERT_NOMESSAGE_I(expr) \ + do { \ + if (WJR_UNLIKELY(!(expr))) { \ + std::abort(); \ + WJR_UNREACHABLE(); \ + } \ + } while (0) +#define WJR_ASSERT_MESSAGE_I(expr) \ + do { \ + std::abort(); \ + WJR_UNREACHABLE(); \ + } while (0) #else #define WJR_ASSERT_NOMESSAGE_I(expr) assert(expr) #define WJR_ASSERT_MESSAGE_I(expr) \ @@ -58,8 +68,9 @@ WJR_ASSERT_CHECK_I_MESSAGE) \ (__VA_ARGS__) -#define WJR_ASSERT_UNCHECK_I(...) \ +#define WJR_ASSERT_UNCHECK_I(expr, ...) \ do { \ + WJR_ASSUME(expr); \ } while (0) // level = [0, 2] diff --git a/include/wjr/x86/add.hpp b/include/wjr/x86/add.hpp index 19ef8b36..2397f59e 100644 --- a/include/wjr/x86/add.hpp +++ b/include/wjr/x86/add.hpp @@ -13,12 +13,12 @@ namespace wjr { (defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC)) #define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF #define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF -#endif #define WJR_ADDSUB_I 1 - #include +#endif + } // namespace wjr #endif // WJR_X86_ADD_HPP__ \ No newline at end of file diff --git a/include/wjr/x86/gen_addrsblsh_n.hpp b/include/wjr/x86/gen_addrsblsh_n.hpp new file mode 100644 index 00000000..d2247f84 --- /dev/null +++ b/include/wjr/x86/gen_addrsblsh_n.hpp @@ -0,0 +1,210 @@ +// WJR_ADDSUB_I : +// 0 : SUB +// 1 : ADD + +#ifndef WJR_ADDSUB_I +#error "abort" +#endif + +#define WJR_addsub WJR_PP_BOOL_IF(WJR_ADDSUB_I, add, rsb) +#define WJR_adcsbb WJR_PP_BOOL_IF(WJR_ADDSUB_I, adc, sbb) + +WJR_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addsub, lsh_n))( + uint64_t *dst, const uint64_t *src0, const uint64_t *src1, size_t n, uint64_t cl) { + WJR_ASSERT(n != 0); + WJR_ASSERT(cl != 0); + + size_t cx = n / 8; + uint64_t tcl = 64 - cl; + uint64_t r8, r9 = n, r10; + + asm volatile( + "and{l $7, %k[r9]| %k[r9], 7}\n\t" + "lea{q| %[r8], [rip +} .Llookup%={(%%rip), %[r8]|]}\n\t" + "movs{lq (%[r8], %[r9], 4), %[r9]|xd %[r9], DWORD PTR [%[r8] + " + "%[r9] * 4]}\n\t" + "lea{q (%[r8], %[r9], 1), %[r9]| %[r9], [%[r9] + %[r8]]}\n\t" + 
"jmp{q *%[r9]| %[r9]}\n\t" + + ".align 8\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" + ".align 16\n\t" + + ".Ll0%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "jmp .Lb0%=\n\t" + + ".Ll2%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "lea{q -48(%[src0]), %[src0]| %[src0], [%[src0] - 48]}\n\t" + "lea{q -48(%[src1]), %[src1]| %[src1], [%[src1] - 48]}\n\t" + "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" + "jmp .Lb2%=\n\t" + + ".Ll3%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "lea{q -40(%[src0]), %[src0]| %[src0], [%[src0] - 40]}\n\t" + "lea{q -40(%[src1]), %[src1]| %[src1], [%[src1] - 40]}\n\t" + "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" + "jmp .Lb3%=\n\t" + + ".Ll4%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "lea{q -32(%[src0]), %[src0]| %[src0], [%[src0] - 32]}\n\t" + "lea{q -32(%[src1]), %[src1]| %[src1], [%[src1] - 32]}\n\t" + "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" + "jmp .Lb4%=\n\t" + + ".Ll5%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "lea{q -24(%[src0]), %[src0]| %[src0], [%[src0] - 24]}\n\t" + "lea{q -24(%[src1]), %[src1]| %[src1], [%[src1] - 24]}\n\t" + "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" + "jmp .Lb5%=\n\t" + + ".Ll6%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "lea{q -16(%[src0]), %[src0]| %[src0], [%[src0] - 16]}\n\t" + "lea{q -16(%[src1]), %[src1]| %[src1], [%[src1] - 16]}\n\t" + "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" + "jmp .Lb6%=\n\t" + + ".Ll7%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], 
[%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "lea{q -8(%[src0]), %[src0]| %[src0], [%[src0] - 8]}\n\t" + "lea{q -8(%[src1]), %[src1]| %[src1], [%[src1] - 8]}\n\t" + "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" + "jmp .Lb7%=\n\t" + + ".Ld1%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q (%[dst]), %[r8]| [%[dst]], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t" + "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q %[cx], %[r9]| %[r9], %[cx]}\n\t" + "jmp .Ldone%=\n\t" + + ".Ll1%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "jrcxz .Ld1%=\n\t" + "lea{q 8(%[src0]), %[src0]| %[src0], [%[src0] + 8]}\n\t" + "lea{q 8(%[src1]), %[src1]| %[src1], [%[src1] + 8]}\n\t" + "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" + + ".align 32\n\t" + ".Lloop%=:\n\t" + + ".Lb1%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q -8(%[src0]), %[r8]| [%[src0] - 8], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + + ".Lb0%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 8(%[src1]), %[r9]| %[r9], [%[src1] + 8]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q (%[src0]), %[r8]| [%[src0]], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" + + "lea{q -1(%[cx]), %[cx]| %[cx], [%[cx] - 1]}\n\t" + + ".Lb7%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q 16(%[src1]), %[r10]| %[r10], [%[src1] + 16]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 8(%[src0]), %[r8]| [%[src0] + 8], %[r8]}\n\t" + "shrx{q %[tcl],
%[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" + + ".Lb6%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 24(%[src1]), %[r9]| %[r9], [%[src1] + 24]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 16(%[src0]), %[r8]| [%[src0] + 16], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8], 16(%[dst])| [%[dst] + 16], %[r8]}\n\t" + + ".Lb5%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q 32(%[src1]), %[r10]| %[r10], [%[src1] + 32]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 24(%[src0]), %[r8]| [%[src0] + 24], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" + + ".Lb4%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 40(%[src1]), %[r9]| %[r9], [%[src1] + 40]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 32(%[src0]), %[r8]| [%[src0] + 32], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8], 32(%[dst])| [%[dst] + 32], %[r8]}\n\t" + + ".Lb3%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q 48(%[src1]), %[r10]| %[r10], [%[src1] + 48]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 40(%[src0]), %[r8]| [%[src0] + 40], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" + + ".Lb2%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 56(%[src1]), %[r9]| %[r9], [%[src1] + 56]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 48(%[src0]), %[r8]| [%[src0] + 48], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8],
48(%[dst])| [%[dst] + 48], %[r8]}\n\t" + + "lea{q 64(%[src0]), %[src0]| %[src0], [%[src0] + 64]}\n\t" + "lea{q 64(%[src1]), %[src1]| %[src1], [%[src1] + 64]}\n\t" + "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" + + "jrcxz .Lloop_out%=\n\t" + "jmp .Lloop%=\n\t" + ".Lloop_out%=:\n\t" + + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q -8(%[dst]), %[r8]| [%[dst] - 8], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q %[cx], %[r9]| %[r9], %[cx]}\n\t" + + ".Ldone%=:" + + : [dst] "+r"(dst), [src0] "+r"(src0), [src1] "+r"(src1), [cx] "+c"(cx), + [cl] "+r"(cl), [tcl] "+r"(tcl), [r8] "=r"(r8), [r9] "+r"(r9), + [r10] "=r"(r10) + : + : "cc", "memory"); + + WJR_ASSUME(cx == 0); + + return r9; +} + +#undef WJR_adcsbb +#undef WJR_addsub + +#undef WJR_ADDSUB_I \ No newline at end of file diff --git a/include/wjr/x86/gen_addsub.hpp b/include/wjr/x86/gen_addsub.hpp index cd6e0b67..8660515c 100644 --- a/include/wjr/x86/gen_addsub.hpp +++ b/include/wjr/x86/gen_addsub.hpp @@ -6,15 +6,12 @@ #error "abort" #endif -#define WJR_ADDSUB WJR_PP_BOOL_IF(WJR_ADDSUB_I, ADDC, SUBC) #define WJR_addcsubc WJR_PP_BOOL_IF(WJR_ADDSUB_I, addc, subc) #define WJR_adcsbb WJR_PP_BOOL_IF(WJR_ADDSUB_I, adc, sbb) -#if WJR_HAS_BUILTIN(WJR_PP_CONCAT(ASM_, WJR_ADDSUB)) - -template -WJR_INTRINSIC_INLINE T WJR_PP_CONCAT(asm_, WJR_addcsubc)(T a, T b, U c_in, U &c_out) { - static_assert(std::is_same_v, ""); +template +WJR_INTRINSIC_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_addcsubc)(uint64_t a, uint64_t b, + U c_in, U &c_out) { #if WJR_ADDSUB_I == 0 if (WJR_BUILTIN_CONSTANT_P(c_in)) { @@ -99,16 +96,11 @@ WJR_INTRINSIC_INLINE T WJR_PP_CONCAT(asm_, WJR_addcsubc)(T a, T b, U c_in, U &c_ #undef WJR_REGISTER_BUILTIN_ASM_ADDSUB_0 } -#endif - -#if WJR_HAS_BUILTIN(WJR_PP_CONCAT(ASM_, WJR_PP_CONCAT(WJR_ADDSUB, _N))) -
-template -WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const T *src0, - const T *src1, size_t n, - U c_in) { - static_assert(std::is_same_v, ""); - +template +WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(uint64_t *dst, + const uint64_t *src0, + const uint64_t *src1, + size_t n, U c_in) { if (WJR_BUILTIN_CONSTANT_P(n)) { if (n == 1) { dst[0] = WJR_PP_CONCAT(asm_, WJR_addcsubc)(src0[0], src1[0], c_in, c_in); @@ -116,30 +108,26 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const } } - const auto cdst = dst; - const auto csrc0 = src0; - const auto csrc1 = src1; - size_t cx = n / 8; - T r8 = c_in, r9, r10 = n & 7, r11; + uint64_t r8 = c_in, r9, r10 = n & 7, r11; asm volatile( "add{b $255, %b[r8]| %b[r8], 255}\n\t" - "lea{q| %[r9], [rip +} .Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * 4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r9] + %[r10]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=:\n\t" - ".long .Ll0%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll1%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll2%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll3%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll4%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll5%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll6%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll7%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" 
".align 16\n\t" ".Ll0%=:\n\t" @@ -229,7 +217,7 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const "lea{q 16(%[dst]), %[dst]| %[dst], [%[dst] + 16]}\n\t" ".align 32\n\t" - ".Lwjr_asm_" WJR_PP_STR(WJR_adcsbb) "_n_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb2%=:\n\t" "mov{q (%[src0]), %[r9]| %[r9], [%[src0]]}\n\t" @@ -277,7 +265,7 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "dec %[cx]\n\t" - "jne .Lwjr_asm_" WJR_PP_STR(WJR_adcsbb) "_n_loop%=\n\t" + "jne .Lloop%=\n\t" WJR_PP_STR(WJR_adcsbb) "{q -8(%[src1]), %[r10]| %[r10], [%[src1] - 8]}\n\t" "mov{q %[r8], -16(%[dst])| [%[dst] - 16], %[r8]}\n\t" @@ -292,19 +280,13 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src0 == csrc0 + n); - WJR_ASSUME(src1 == csrc1 + n); WJR_ASSUME(cx == 0); WJR_ASSUME(r9 == 0u || r9 == 1u); return r9; } -#endif - #undef WJR_adcsbb #undef WJR_addcsubc -#undef WJR_ADDSUB #undef WJR_ADDSUB_I \ No newline at end of file diff --git a/include/wjr/x86/mul.hpp b/include/wjr/x86/mul.hpp index b145f5e4..b88db200 100644 --- a/include/wjr/x86/mul.hpp +++ b/include/wjr/x86/mul.hpp @@ -70,30 +70,27 @@ WJR_INTRINSIC_INLINE T mulx(T a, T b, T &hi) { WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t dx) { WJR_ASSERT(n != 0); - const auto cdst = dst; - const auto csrc = src; - size_t cx = n / 8; uint64_t r8, r9, r10 = n, r11; asm volatile( "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " "%[r10] * 4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_mul_1_lookup%=:\n\t" - ".long 
.Ll0%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" ".align 16\n\t" ".Ll0%=:\n\t" @@ -153,7 +150,7 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" ".align 32\n\t" - ".Lasm_addmul_1_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb1%=:\n\t" "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" @@ -199,9 +196,9 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "dec %[cx]\n\t" - "jne .Lasm_addmul_1_loop%=\n\t" + "jne .Lloop%=\n\t" - "adc{q $0, %[r9]| %[r9], 0}\n\t" + "adc{q %[cx], %[r9]| %[r9], %[cx]}\n\t" "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" ".Ldone%=:" @@ -211,8 +208,6 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src == csrc + n); WJR_ASSUME(cx == 0); return r9; @@ -231,30 +226,27 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t dx) { WJR_ASSERT(n != 0); - const auto cdst = dst; - const auto csrc = src; - size_t cx = n / 8; uint64_t r8, r9, r10 = n, r11; asm volatile( "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " 
"%[r10] * 4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_mul_1_lookup%=:\n\t" - ".long .Ll0%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" ".align 16\n\t" ".Ll0%=:\n\t" @@ -299,7 +291,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, ".Ld1%=:\n\t" "add{q (%[dst]), %[r8]| [%[dst]], %[r8]}\n\t" - "adc{q $0, %[r9]| %[r9], 0}\n\t" + "adc{q %[cx], %[r9]| %[r9], %[cx]}\n\t" "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" "jmp .Ldone%=\n\t" @@ -310,7 +302,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" ".align 32\n\t" - ".Lasm_addmul_1_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb1%=:\n\t" "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" @@ -366,7 +358,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "jrcxz .Lloop_out%=\n\t" - "jmp .Lasm_addmul_1_loop%=\n\t" + "jmp .Lloop%=\n\t" ".Lloop_out%=:\n\t" "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" @@ -381,8 +373,6 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src == csrc + n); WJR_ASSUME(cx == 0); return r9; @@ -410,9 +400,6 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const 
uint64_t *src, size_t n, uint64_t dx) { WJR_ASSERT(n != 0); - const auto cdst = dst; - const auto csrc = src; - size_t cx = n / 8; uint64_t r8, r9, r10 = n & 7, r11; @@ -421,22 +408,22 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, "mov{b $255, %b[r11]| %b[r11], 255}\n\t" "add{b $1, %b[r11]| %b[r11], 1}\n\t" - "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * " "4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_mul_1_lookup%=:\n\t" - ".long .Ll0%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" ".align 16\n\t" ".Ll0%=:\n\t" @@ -500,7 +487,7 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" ".align 32\n\t" - ".Lasm_addmul_1_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb1%=:\n\t" "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" @@ -564,7 +551,7 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "jrcxz .Lloop_out%=\n\t" - "jmp .Lasm_addmul_1_loop%=\n\t" + "jmp .Lloop%=\n\t" ".Lloop_out%=:\n\t" "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" @@ -579,8 +566,6 @@ WJR_INLINE uint64_t 
asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src == csrc + n); WJR_ASSUME(cx == 0); return r9; @@ -588,6 +573,15 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, #endif +#if WJR_HAS_BUILTIN(ASM_MUL_1) +#define WJR_HAS_BUILTIN_ASM_ADDLSH_N WJR_HAS_DEF +#define WJR_HAS_BUILTIN_ASM_SUBLSH_N WJR_HAS_DEF + +#define WJR_ADDSUB_I 1 +#include + +#endif + } // namespace wjr #endif // WJR_X86_MUL_HPP__ \ No newline at end of file diff --git a/include/wjr/x86/sub.hpp b/include/wjr/x86/sub.hpp index fe953064..fab10c33 100644 --- a/include/wjr/x86/sub.hpp +++ b/include/wjr/x86/sub.hpp @@ -13,12 +13,12 @@ namespace wjr { (defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC)) #define WJR_HAS_BUILTIN_ASM_SUBC WJR_HAS_DEF #define WJR_HAS_BUILTIN_ASM_SUBC_N WJR_HAS_DEF -#endif #define WJR_ADDSUB_I 0 - #include +#endif + } // namespace wjr #endif // WJR_X86_SUB_HPP__ \ No newline at end of file