Skip to content

Commit

Permalink
opt
Browse files Browse the repository at this point in the history
  • Loading branch information
wjr-z committed Jan 11, 2024
1 parent 4e1ad97 commit c6015a4
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 19 deletions.
10 changes: 5 additions & 5 deletions include/wjr/math/div.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ namespace wjr {

// reference : https://ieeexplore.ieee.org/document/5487506
template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int> = 0>
WJR_CONSTEXPR_E T fallback_divmod_1(T *dst, const T *src, size_t n,
WJR_CONSTEXPR20 T fallback_divmod_1(T *dst, const T *src, size_t n,
div2by1_divider<T> div) {
uint64_t divisor = div.divisor();
uint64_t value = div.value();
Expand Down Expand Up @@ -140,7 +140,7 @@ WJR_CONSTEXPR_E T fallback_divmod_1(T *dst, const T *src, size_t n,
}

template <typename T, std::enable_if_t<is_unsigned_integral_v<T>, int> = 0>
WJR_INTRINSIC_CONSTEXPR_E T divmod_1(T *dst, const T *src, size_t n,
WJR_INTRINSIC_CONSTEXPR20 T divmod_1(T *dst, const T *src, size_t n,
div2by1_divider<T> div) {
WJR_ASSERT(n != 0);
WJR_ASSUME(n != 0);
Expand All @@ -164,7 +164,7 @@ WJR_INTRINSIC_CONSTEXPR_E T divmod_1(T *dst, const T *src, size_t n,
}

template <typename T, std::enable_if_t<is_unsigned_integral_v<T>, int> = 0>
WJR_INTRINSIC_CONSTEXPR_E T divmod_1(T *dst, const T *src, size_t n,
WJR_INTRINSIC_CONSTEXPR20 T divmod_1(T *dst, const T *src, size_t n,
type_identity_t<T> div) {
WJR_ASSERT(n != 0);
WJR_ASSUME(n != 0);
Expand Down Expand Up @@ -214,7 +214,7 @@ WJR_CONSTEXPR_E void fallback_divexact_1(T *dst, const T *src, size_t n,
}

r10 = src[n];
r10 = subc(r10, rdx, cf, cf);
r10 -= rdx + cf;
r10 = mullo(r10, value);
dst[n] = r10;
return;
Expand All @@ -237,7 +237,7 @@ WJR_CONSTEXPR_E void fallback_divexact_1(T *dst, const T *src, size_t n,
}

r10 = r10 >> shift;
r10 = subc(r10, rdx, cf, cf);
r10 -= rdx + cf;
r10 = mullo(r10, value);
dst[n] = r10;
return;
Expand Down
2 changes: 0 additions & 2 deletions include/wjr/math/mul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,6 @@ uint64_t fallback_mulhi64(uint64_t a, uint64_t b) {
template <typename T>
WJR_ATTRIBUTES(CONST, INTRINSIC_CONSTEXPR_E)
T mulhi(T a, T b) {
constexpr auto nd = std::numeric_limits<T>::digits;

T ret = 0;
(void)mul(a, b, ret);
return ret;
Expand Down
3 changes: 2 additions & 1 deletion include/wjr/type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ WJR_INTRINSIC_CONSTEXPR20 P *container_of_offset_impl(M *ptr, const M P::*member
// Expands to a call of container_of_offset_impl with `ptr` and the
// pointer-to-member `&type::member`.
// NOTE(review): presumably yields a pointer to the enclosing `type` object
// that holds `member` — confirm against container_of_offset_impl's definition.
#define WJR_CONTAINER_OF(ptr, type, member) container_of_offset_impl(ptr, &type::member)

// Returns n unchanged. The WJR_ASSUME statements only state a condition on n:
// its most significant bit is expected to be zero.
// NOTE(review): WJR_ASSUME is defined elsewhere — presumably an optimizer hint
// rather than a runtime check; confirm.
WJR_INTRINSIC_CONSTEXPR size_t abs_cast(size_t n) {
// Masks n with ptrdiff_t's minimum value converted to size_t and requires the
// result to be 0 (assumes ptrdiff_t and size_t have the same width — confirm).
// NOTE(review): this mask-based check and the shift-based check below look like
// the before/after forms of the same condition — confirm which one is kept.
WJR_ASSUME((n & static_cast<size_t>(std::numeric_limits<ptrdiff_t>::min())) == 0);
// nd is the number of value bits in size_t.
constexpr auto nd = std::numeric_limits<size_t>::digits;
// Shifting right by nd - 1 leaves only the top bit of n, which must be 0.
WJR_ASSUME((n >> (nd - 1)) == 0);
return n;
}

Expand Down
21 changes: 10 additions & 11 deletions include/wjr/x86/gen_addsub.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,17 +152,13 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const
src0 += n - 8; \
src1 += n - 8; \
dst += n - 8; \
uint64_t t1; \
uint64_t t2; \
t1 = n; \
asm volatile("add{b $255, %b[t0]| %b[t0], 255}\n\t" \
"lea{q| %[t2], [rip +} .Lasm_" \
WJR_PP_STR(WJR_addcsubc) "_n_lookup%={(%%rip), %[t2]|]}\n\t" \
"movs{lq (%[t2], %[t1], 4), %[t1]|xd %[t1], DWORD PTR [%[t2] + " \
"%[t1] * 4]}\n\t" \
"lea{q (%[t2], %[t1], 1), %[t1]| %[t1], [%[t2] + %[t1]]}\n\t" \
"jmp{q *%[t1]| %[t1]}\n\t" \
\
"lea{q| %[t0], [rip +} .Lasm_" \
WJR_PP_STR(WJR_addcsubc) "_n_lookup%={(%%rip), %[t0]|]}\n\t" \
"movs{lq (%[t0], %[n], 4), %[n]|xd %[n], DWORD PTR [%[t0] + " \
"%[n] * 4]}\n\t" \
"lea{q (%[t0], %[n], 1), %[n]| %[n], [%[t0] + %[n]]}\n\t" \
"jmp{q *%[n]| %[n]}\n\t" \
".align 4\n\t" \
".Lasm_" WJR_PP_STR(WJR_addcsubc) "_n_lookup%=:\n\t" \
".long .Lcase0%=-.Lasm_" WJR_PP_STR(WJR_addcsubc) "_n_lookup%=\n\t" \
Expand Down Expand Up @@ -251,7 +247,7 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const
"setb %b[t0]" \
\
: [dst] "+r"(dst), [src0] "+%r"(src0), [src1] "+r"(src1), \
[m] "+r"(m), [t0] "+r"(t0), [t1] "+r"(t1), [t2] "=r"(t2) \
[m] "+r"(m), [t0] "+r"(t0), [n] "+r"(n) \
: \
: "cc", "memory"); \
} else
Expand All @@ -263,6 +259,9 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const
static_assert(nd <= 64, "not support yet");
}

#undef WJR_REGISTER_ASM_ADDSUB_N_JUMP
#undef WJR_REGISTER_ASM_ADDSUB_N_PIC_JUMP
#undef WJR_REGISTER_ASM_ADDSUB_N_NOPIC_JUMP
#undef WJR_REGISTER_ASM_ADDSUB_N

return static_cast<unsigned char>(t0);
Expand Down

0 comments on commit c6015a4

Please sign in to comment.