Skip to content

Commit

Permalink
upd
Browse files Browse the repository at this point in the history
  • Loading branch information
wjr-z committed Jan 26, 2024
1 parent 7b89a9e commit 094699b
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 74 deletions.
2 changes: 1 addition & 1 deletion include/wjr/math/add.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ WJR_INTRINSIC_CONSTEXPR_E T addc(T a, T b, type_identity_t<U> c_in, U &c_out) {
#if !WJR_HAS_BUILTIN(ADDC) && !WJR_HAS_BUILTIN(ASM_ADDC)
return fallback_addc(a, b, c_in, c_out);
#else
constexpr auto is_constant_or_zero = [](const auto &x) -> int {
constexpr auto is_constant_or_zero = [](auto x) -> int {
return WJR_BUILTIN_CONSTANT_P(x == 0) && x == 0 ? 2
: WJR_BUILTIN_CONSTANT_P(x) ? 1
: 0;
Expand Down
51 changes: 28 additions & 23 deletions include/wjr/math/div.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

#include <wjr/math/divider.hpp>

#if defined(WJR_X86)
#include <wjr/x86/div.hpp>
#endif

namespace wjr {

// reference : https://ieeexplore.ieee.org/document/5487506
Expand Down Expand Up @@ -187,28 +191,47 @@ template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int> = 0>
WJR_CONSTEXPR_E T fallback_divexact_dbm1c(T *dst, const T *src, size_t n, T bd, T h) {
T a = 0, p0 = 0, p1 = 0, cf = 0;

// GCC can't optimize well
WJR_UNROLL(4)
for (size_t i = 0; i < n; i++) {
a = src[i];
p0 = mul(a, bd, p1);
cf = h < p0;
h = (h - p0);
h = subc(h, p0, 0u, cf);
dst[i] = h;
h = h - p1 - cf;
h -= p1 + cf;
}

return h;
}

/**
 * @brief Dispatcher for the divexact_dbm1c kernel.
 *
 * When the inline-assembly kernel is available (ASM_DIVEXACT_DBM1C) and the
 * call is not being constant-evaluated, the work is forwarded to
 * asm_divexact_dbm1c. In every other situation the portable
 * fallback_divexact_dbm1c is used.
 *
 * @param dst destination limbs
 * @param src source limbs
 * @param n   number of limbs
 * @param bd  multiplier forwarded unchanged to the selected kernel
 * @param h   initial accumulator forwarded unchanged to the selected kernel
 * @return whatever the selected kernel returns
 */
template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int> = 0>
WJR_CONSTEXPR_E T divexact_dbm1c(T *dst, const T *src, size_t n, T bd, T h) {
#if WJR_HAS_BUILTIN(ASM_DIVEXACT_DBM1C)
    // Inline assembly cannot run during constant evaluation, so only take the
    // asm path for genuine run-time calls.
    if (!is_constant_evaluated()) {
        return asm_divexact_dbm1c(dst, src, n, bd, h);
    }
#endif
    return fallback_divexact_dbm1c(dst, src, n, bd, h);
}

template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int>>
WJR_CONSTEXPR_E void divexact_by3(T *dst, const T *src, size_t n) {
constexpr auto max = std::numeric_limits<T>::max();
(void)fallback_divexact_dbm1c<T>(dst, src, n, max / 3, 0);
(void)divexact_dbm1c<T>(dst, src, n, max / 3, 0);
}

template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int> = 0>
WJR_CONSTEXPR_E void divexact_by5(T *dst, const T *src, size_t n) {
constexpr auto max = std::numeric_limits<T>::max();
(void)fallback_divexact_dbm1c<T>(dst, src, n, max / 5, 0);
(void)divexact_dbm1c<T>(dst, src, n, max / 5, 0);
}

/**
 * @brief Runs divexact_dbm1c over n limbs with the multiplier
 *        numeric_limits<T>::max() / 15 and an initial accumulator of 0.
 *
 * The value returned by divexact_dbm1c is intentionally discarded.
 *
 * @param dst destination limbs
 * @param src source limbs
 * @param n   number of limbs
 */
template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int> = 0>
WJR_CONSTEXPR_E void divexact_by15(T *dst, const T *src, size_t n) {
    constexpr T multiplier = std::numeric_limits<T>::max() / 15;
    (void)divexact_dbm1c<T>(dst, src, n, multiplier, 0);
}

// reference : ftp://ftp.risc.uni-linz.ac.at/pub/techreports/1992/92-35.ps.gz
Expand Down Expand Up @@ -280,16 +303,6 @@ WJR_INTRINSIC_CONSTEXPR_E void divexact_1(T *dst, const T *src, size_t n,
return;
}

if (WJR_BUILTIN_CONSTANT_P(div.shift() == 0) && div.shift() == 0) {
if (WJR_BUILTIN_CONSTANT_P(div.divisor() == 3) && div.divisor() == 3) {
return divexact_by3(dst, src, n);
}

if (WJR_BUILTIN_CONSTANT_P(div.divisor() == 5) && div.divisor() == 5) {
return divexact_by5(dst, src, n);
}
}

return fallback_divexact_1(dst, src, n, div);
}

Expand All @@ -304,14 +317,6 @@ WJR_INTRINSIC_CONSTEXPR_E void divexact_1(T *dst, const T *src, size_t n,
return;
}

if (WJR_BUILTIN_CONSTANT_P(div == 3) && div == 3) {
return divexact_by3(dst, src, n);
}

if (WJR_BUILTIN_CONSTANT_P(div == 5) && div == 5) {
return divexact_by5(dst, src, n);
}

return fallback_divexact_1(dst, src, n, divexact1_divider<T>(div));
}

Expand Down
10 changes: 5 additions & 5 deletions include/wjr/math/sub.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ namespace wjr {

template <typename T, typename U>
WJR_INTRINSIC_CONSTEXPR T fallback_subc(T a, T b, U c_in, U &c_out) {
T ret = a;
U c = ret < b;
ret -= b;
c |= ret < c_in;
T ret = a - b;
U c = ret > a;
a = ret;
ret -= c_in;
c |= ret > a;
c_out = c;
return ret;
}
Expand Down Expand Up @@ -64,7 +64,7 @@ WJR_INTRINSIC_CONSTEXPR_E T subc(T a, T b, type_identity_t<U> c_in, U &c_out) {
#if !WJR_HAS_BUILTIN(SUBC) && !WJR_HAS_BUILTIN(ASM_SUBC)
return fallback_subc(a, b, c_in, c_out);
#else
constexpr auto is_constant_or_zero = [](const auto &x) -> int {
constexpr auto is_constant_or_zero = [](auto x) -> int {
return WJR_BUILTIN_CONSTANT_P(x == 0) && x == 0 ? 2
: WJR_BUILTIN_CONSTANT_P(x) ? 1
: 0;
Expand Down
83 changes: 83 additions & 0 deletions include/wjr/x86/div.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#ifndef WJR_X86_DIV_HPP__
#define WJR_X86_DIV_HPP__

#include <wjr/type_traits.hpp>

#ifndef WJR_X86
#error "x86 required"
#endif

namespace wjr {

#if WJR_HAS_FEATURE(INLINE_ASM) && \
(defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC))
#define WJR_HAS_BUILTIN_ASM_DIVEXACT_DBM1C WJR_HAS_DEF
#endif

#if WJR_HAS_BUILTIN(ASM_DIVEXACT_DBM1C)

/**
 * @brief x86-64 inline-assembly kernel for divexact_dbm1c.
 *
 * For every limb, in increasing index order, the kernel performs:
 *   1. load src[i] into rax and multiply by bd (`mul`), giving rdx:rax;
 *   2. subtract the low half (rax) from the running accumulator;
 *   3. store the accumulator to dst[i];
 *   4. subtract the high half (rdx) plus the borrow of step 2 (`sbb`).
 *
 * The loop is unrolled four times. `n & 3` selects the entry point into the
 * unrolled body (.Lb0 / .Lb1 / .Lb2 / .Lb3) so that no separate tail loop is
 * needed. src and dst are advanced to one-past-the-end and indexed with a
 * negative counter that runs up towards zero.
 *
 * @param dst destination limbs (written)
 * @param src source limbs (read)
 * @param n   number of limbs; must be >= 1 (asserted)
 * @param bd  multiplier applied to every source limb
 * @param h   initial accumulator value
 * @return the accumulator after the last limb
 */
WJR_INLINE uint64_t asm_divexact_dbm1c(uint64_t *dst, const uint64_t *src, size_t n,
                                       uint64_t bd, uint64_t h) {
    WJR_ASSERT(n >= 1);

    // r8  : running accumulator
    // r9  : limb counter, turned into a negative index below
    // r10 : bound to rax ("=a"), receives the loaded limb / low product half
    // r11 : bound to rdx ("+d"), first holds n (then n & 3), later the high
    //       product half written by `mul`
    uint64_t r8 = h, r9 = n, r10, r11 = static_cast<uint32_t>(n);

    // Point one past the end and index with a negative offset.
    src += r9;
    dst += r9;
    r9 = -r9;

    asm volatile(
        // Select the entry point from n & 3.
        "and{l $3, %k[r11]| %k[r11], 3}\n\t"
        "je .Lb0%=\n\t"
        // Bias the index so the chosen entry addresses the first limb.
        "lea{q -4(%[r9], %[r11], 1), %[r9]| %[r9], [%[r9] + %[r11] * 1 - 4]}\n\t"
        "cmp{l $2, %k[r11]| %k[r11], 2}\n\t"
        "jb .Lb1%=\n\t"
        "je .Lb2%=\n\t"
        "jmp .Lb3%=\n\t"

        ".Lloop%=:\n\t"

        ".Lb0%=:\n\t"
        "mov{q (%[src], %[r9], 8), %[r10]| %[r10], [%[src] + %[r9] * 8]}\n\t"
        "mul{q %[bd]| %[bd]}\n\t"
        "sub{q %[r10], %[r8]| %[r8], %[r10]}\n\t"
        "mov{q %[r8], (%[dst], %[r9], 8)| [%[dst] + %[r9] * 8], %[r8]}\n\t"
        "sbb{q %[r11], %[r8]| %[r8], %[r11]}\n\t"

        ".Lb3%=:\n\t"
        "mov{q 8(%[src], %[r9], 8), %[r10]| %[r10], [%[src] + %[r9] * 8 + 8]}\n\t"
        "mul{q %[bd]| %[bd]}\n\t"
        "sub{q %[r10], %[r8]| %[r8], %[r10]}\n\t"
        "mov{q %[r8], 8(%[dst], %[r9], 8)| [%[dst] + %[r9] * 8 + 8], %[r8]}\n\t"
        "sbb{q %[r11], %[r8]| %[r8], %[r11]}\n\t"

        ".Lb2%=:\n\t"
        "mov{q 16(%[src], %[r9], 8), %[r10]| %[r10], [%[src] + %[r9] * 8 + 16]}\n\t"
        "mul{q %[bd]| %[bd]}\n\t"
        "sub{q %[r10], %[r8]| %[r8], %[r10]}\n\t"
        "mov{q %[r8], 16(%[dst], %[r9], 8)| [%[dst] + %[r9] * 8 + 16], %[r8]}\n\t"
        "sbb{q %[r11], %[r8]| %[r8], %[r11]}\n\t"

        ".Lb1%=:\n\t"
        "mov{q 24(%[src], %[r9], 8), %[r10]| %[r10], [%[src] + %[r9] * 8 + 24]}\n\t"
        "mul{q %[bd]| %[bd]}\n\t"
        "sub{q %[r10], %[r8]| %[r8], %[r10]}\n\t"
        "mov{q %[r8], 24(%[dst], %[r9], 8)| [%[dst] + %[r9] * 8 + 24], %[r8]}\n\t"
        "sbb{q %[r11], %[r8]| %[r8], %[r11]}\n\t"

        // Advance by one unrolled group; the loop ends when the negative
        // index reaches zero. Both assembler dialects are spelled out, like
        // every other instruction in this template, so the code also
        // assembles when Intel syntax is selected.
        "add{q $4, %[r9]| %[r9], 4}\n\t"
        "jne .Lloop%=\n\t"

        : [dst] "+r"(dst), [src] "+r"(src), [bd] "+r"(bd), [r8] "+r"(r8), [r9] "+r"(r9),
          [r10] "=a"(r10), [r11] "+d"(r11)
        :
        : "cc", "memory");

    return r8;
}

#endif

} // namespace wjr

#endif // WJR_X86_DIV_HPP__
67 changes: 22 additions & 45 deletions include/wjr/x86/gen_addsub.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,18 @@
template <typename U>
WJR_INTRINSIC_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_addcsubc)(uint64_t a, uint64_t b,
U c_in, U &c_out) {

// c_in == 0, handled at addc/subc
#if WJR_ADDSUB_I == 0
if (WJR_BUILTIN_CONSTANT_P(c_in)) {
if (c_in == 0) {
// GCC seems to have some optimization issues
#if defined(WJR_COMPILER_GCC)
asm("sub{q %2, %0| %0, %2}\n\t"
"setb %b1"
: "=r"(a), "+r"(c_in)
: "ri"(b), "0"(a)
: "cc");
c_out = c_in;
return a;
#else
c_out = a < b;
a -= b;
return a;
#endif
} else {
c_in = 0;
asm("stc\n\t"
"sbb{q %2, %0| %0, %2}\n\t"
"setb %b1"
: "=r"(a), "+r"(c_in)
: "ri"(b), "0"(a)
: "cc");
c_out = c_in;
return a;
}
if (WJR_BUILTIN_CONSTANT_P(c_in == 1) && c_in == 1) {
c_in = 0;
asm("stc\n\t"
"sbb{q %2, %0| %0, %2}\n\t"
"setb %b1"
: "=r"(a), "+r"(c_in)
: "ri"(b), "0"(a)
: "cc");
c_out = c_in;
return a;
}

asm("add{b $255, %b1| %b1, 255}\n\t"
Expand All @@ -52,23 +35,17 @@ WJR_INTRINSIC_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_addcsubc)(uint64_t a, uint
c_out = c_in;
return a;
#else
if (WJR_BUILTIN_CONSTANT_P(c_in)) {
if (c_in == 0) {
a += b;
c_out = a < b;
return a;
} else {
c_in = 0;
asm("stc\n\t"
"adc{q"
" %2, %0| %0, %2}\n\t"
"setb %b1"
: "=r"(a), "+r"(c_in)
: "ri"(b), "0"(a)
: "cc");
c_out = c_in;
return a;
}
if (WJR_BUILTIN_CONSTANT_P(c_in == 1) && c_in == 1) {
c_in = 0;
asm("stc\n\t"
"adc{q"
" %2, %0| %0, %2}\n\t"
"setb %b1"
: "=r"(a), "+r"(c_in)
: "ri"(b), "0"(a)
: "cc");
c_out = c_in;
return a;
}

if (WJR_BUILTIN_CONSTANT_P(a)) {
Expand Down

0 comments on commit 094699b

Please sign in to comment.