diff --git a/include/intx/int128.hpp b/include/intx/int128.hpp index 1f422d55..5721f1a7 100644 --- a/include/intx/int128.hpp +++ b/include/intx/int128.hpp @@ -1,5 +1,5 @@ // intx: extended precision integer library. -// Copyright 2019 Pawel Bylica. +// Copyright 2019-2020 Pawel Bylica. // Licensed under the Apache License, Version 2.0. #pragma once @@ -506,29 +506,20 @@ constexpr uint16_t reciprocal_table[] = {REPEAT256()}; /// Based on Algorithm 2 from "Improved division by invariant integers". inline uint64_t reciprocal_2by1(uint64_t d) noexcept { - auto d9 = uint8_t(d >> 55); - auto v0 = uint64_t{internal::reciprocal_table[d9]}; + const uint64_t d9 = d >> 55; + const uint32_t v0 = internal::reciprocal_table[d9 - 256]; - auto d40 = (d >> 24) + 1; - auto v1 = (v0 << 11) - (v0 * v0 * d40 >> 40) - 1; + const uint64_t d40 = (d >> 24) + 1; + const uint64_t v1 = (v0 << 11) - uint32_t(v0 * v0 * d40 >> 40) - 1; - auto v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47); + const uint64_t v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47); - auto d0 = d % 2; - auto d63 = d / 2 + d0; // ceil(d/2) - auto nd0 = uint64_t(-int64_t(d0)); - auto e = ((v2 / 2) & nd0) - v2 * d63; - auto mh = umul(v2, e).hi; - auto v3 = (v2 << 31) + (mh >> 1); - - // OPT: The compiler tries a bit too much with 128 + 64 addition and ends up using subtraction. - // Compare with __int128. - auto mf = umul(v3, d); - auto m = fast_add(mf, d); - auto v3a = m.hi + d; - - auto v4 = v3 - v3a; + const uint64_t d0 = d & 1; + const uint64_t d63 = (d >> 1) + d0; // ceil(d/2) + const uint64_t e = ((v2 >> 1) & (0 - d0)) - v2 * d63; + const uint64_t v3 = (umul(v2, e).hi >> 1) + (v2 << 31); + const uint64_t v4 = v3 - (umul(v3, d) + d).hi - d; return v4; } @@ -548,7 +539,7 @@ inline uint64_t reciprocal_3by2(uint128 d) noexcept p -= d.hi; } - auto t = umul(v, d.lo); + const auto t = umul(v, d.lo); p += t.hi; if (p < t.hi) diff --git a/test/benchmarks/bench_div.cpp b/test/benchmarks/bench_div.cpp index d3324675..528706d3 100644 --- a/test/benchmarks/bench_div.cpp +++ b/test/benchmarks/bench_div.cpp @@ -12,6 +12,8 @@ uint64_t nop(uint64_t x, uint64_t y) noexcept; uint64_t soft_div_unr_unrolled(uint64_t x, uint64_t y) noexcept; uint64_t soft_div_unr(uint64_t x, uint64_t y) noexcept; +uint64_t reciprocal_2by1_noinline(uint64_t d) noexcept; + using namespace intx; inline uint64_t udiv_by_reciprocal(uint64_t uu, uint64_t du) noexcept @@ -80,6 +82,7 @@ static void div_unary(benchmark::State& state) } BENCHMARK_TEMPLATE(div_unary, neg); BENCHMARK_TEMPLATE(div_unary, reciprocal_2by1); +BENCHMARK_TEMPLATE(div_unary, reciprocal_2by1_noinline); BENCHMARK_TEMPLATE(div_unary, reciprocal_naive); template diff --git a/test/benchmarks/noinline.cpp b/test/benchmarks/noinline.cpp index 57b12154..9f9024d9 100644 --- a/test/benchmarks/noinline.cpp +++ b/test/benchmarks/noinline.cpp @@ -1,5 +1,5 @@ // intx: extended precision integer library. -// Copyright 2019 Pawel Bylica. +// Copyright 2019-2020 Pawel Bylica. // Licensed under the Apache License, Version 2.0. #include @@ -30,3 +30,8 @@ auto exp(const uint256& x, const uint256& y) noexcept { return intx::exp(x, y); } + +auto reciprocal_2by1_noinline(uint64_t d) noexcept +{ + return reciprocal_2by1(d); +}