Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize 64-bit division-by-constant for x86 platforms #73

Merged
merged 4 commits into from
Aug 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ryu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ cc_library(
"d2s.c",
"d2s.h",
"d2s_full_table.h",
"d2s_intrinsics.h",
"digit_table.h",
"mulshift128.h",
"common.h",
],
hdrs = ["ryu.h"],
Expand Down
92 changes: 56 additions & 36 deletions ryu/d2s.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,14 @@
#include "ryu/common.h"
#include "ryu/digit_table.h"
#include "ryu/d2s.h"
#include "ryu/d2s_intrinsics.h"

static inline uint32_t pow5Factor(uint64_t value) {
uint32_t count = 0;
for (;;) {
assert(value != 0);
const uint64_t q = value / 5;
const uint64_t r = value % 5;
const uint64_t q = div5(value);
const uint32_t r = (uint32_t) (value - 5 * q);
if (r != 0) {
break;
}
Expand Down Expand Up @@ -290,7 +291,8 @@ static inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_
// This should use q <= 22, but I think 21 is also safe. Smaller values
// may still be safe, but it's more difficult to reason about them.
// Only one of mp, mv, and mm can be a multiple of 5, if any.
if (mv % 5 == 0) {
const uint32_t mvMod5 = (uint32_t) (mv - 5 * div5(mv));
if (mvMod5 == 0) {
vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
} else if (acceptBounds) {
// Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
Expand Down Expand Up @@ -358,32 +360,42 @@ static inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_
// On average, we remove ~2 digits.
if (vmIsTrailingZeros || vrIsTrailingZeros) {
// General case, which happens rarely (~0.7%).
while (vp / 10 > vm / 10) {
#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=23106
// The compiler does not realize that vm % 10 can be computed from vm / 10
// as vm - (vm / 10) * 10.
vmIsTrailingZeros &= vm - (vm / 10) * 10 == 0;
#else
vmIsTrailingZeros &= vm % 10 == 0;
#endif
for (;;) {
const uint64_t vpDiv10 = div10(vp);
const uint64_t vmDiv10 = div10(vm);
if (vpDiv10 <= vmDiv10) {
break;
}
const uint32_t vmMod10 = (uint32_t) (vm - 10 * vmDiv10);
const uint64_t vrDiv10 = div10(vr);
const uint32_t vrMod10 = (uint32_t) (vr - 10 * vrDiv10);
vmIsTrailingZeros &= vmMod10 == 0;
vrIsTrailingZeros &= lastRemovedDigit == 0;
lastRemovedDigit = (uint8_t) (vr % 10);
vr /= 10;
vp /= 10;
vm /= 10;
lastRemovedDigit = (uint8_t) vrMod10;
vr = vrDiv10;
vp = vpDiv10;
vm = vmDiv10;
++removed;
}
#ifdef RYU_DEBUG
printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
printf("d-10=%s\n", vmIsTrailingZeros ? "true" : "false");
#endif
if (vmIsTrailingZeros) {
while (vm % 10 == 0) {
for (;;) {
const uint64_t vmDiv10 = div10(vm);
const uint32_t vmMod10 = (uint32_t) (vm - 10 * vmDiv10);
if (vmMod10 != 0) {
break;
}
const uint64_t vpDiv10 = div10(vp);
const uint64_t vrDiv10 = div10(vr);
const uint32_t vrMod10 = (uint32_t) (vr - 10 * vrDiv10);
vrIsTrailingZeros &= lastRemovedDigit == 0;
lastRemovedDigit = (uint8_t) (vr % 10);
vr /= 10;
vp /= 10;
vm /= 10;
lastRemovedDigit = (uint8_t) vrMod10;
vr = vrDiv10;
vp = vpDiv10;
vm = vmDiv10;
++removed;
}
}
Expand All @@ -401,22 +413,33 @@ static inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_
} else {
// Specialized for the common case (~99.3%). Percentages below are relative to this.
bool roundUp = false;
if (vp / 100 > vm / 100) { // Optimization: remove two digits at a time (~86.2%).
roundUp = (vr % 100) >= 50;
vr /= 100;
vp /= 100;
vm /= 100;
const uint64_t vpDiv100 = div100(vp);
const uint64_t vmDiv100 = div100(vm);
if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
const uint64_t vrDiv100 = div100(vr);
const uint32_t vrMod100 = (uint32_t) (vr - 100 * vrDiv100);
roundUp = vrMod100 >= 50;
vr = vrDiv100;
vp = vpDiv100;
vm = vmDiv100;
removed += 2;
}
// Loop iterations below (approximately), without optimization above:
// 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
// Loop iterations below (approximately), with optimization above:
// 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
while (vp / 10 > vm / 10) {
roundUp = vr % 10 >= 5;
vr /= 10;
vp /= 10;
vm /= 10;
for (;;) {
const uint64_t vpDiv10 = div10(vp);
const uint64_t vmDiv10 = div10(vm);
if (vpDiv10 <= vmDiv10) {
break;
}
const uint64_t vrDiv10 = div10(vr);
const uint32_t vrMod10 = (uint32_t) (vr - 10 * vrDiv10);
roundUp = vrMod10 >= 5;
vr = vrDiv10;
vp = vpDiv10;
vm = vmDiv10;
++removed;
}
#ifdef RYU_DEBUG
Expand Down Expand Up @@ -471,12 +494,9 @@ static inline int to_chars(const floating_decimal_64 v, const bool sign, char* c
// so the rest will fit into uint32_t.
if ((output >> 32) != 0) {
// Expensive 64-bit division.
#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
uint32_t output2 = (uint32_t) (output - 100000000 * (output / 100000000));
#else
uint32_t output2 = (uint32_t) (output % 100000000);
#endif
output /= 100000000;
const uint64_t q = div100000000(output);
uint32_t output2 = (uint32_t) (output - 100000000 * q);
output = q;

const uint32_t c = output2 % 10000;
output2 /= 10000;
Expand Down
2 changes: 1 addition & 1 deletion ryu/d2s.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#if defined(HAS_UINT128)
typedef __uint128_t uint128_t;
#else
#include "ryu/mulshift128.h"
#include "ryu/d2s_intrinsics.h"
#endif

#define DOUBLE_MANTISSA_BITS 52
Expand Down
69 changes: 66 additions & 3 deletions ryu/mulshift128.h → ryu/d2s_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
// Unless required by applicable law or agreed to in writing, this software
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
#ifndef RYU_MULSHIFT128_H
#define RYU_MULSHIFT128_H
#ifndef RYU_D2S_INTRINSICS_H
#define RYU_D2S_INTRINSICS_H

#include <assert.h>
#include <stdint.h>
Expand Down Expand Up @@ -82,4 +82,67 @@ static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const

#endif // defined(HAS_64_BIT_INTRINSICS)

#endif // RYU_MULSHIFT128_H
#if defined(_M_IX86)

// Returns the high 64 bits of the 128-bit product of a and b.
static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
  // Delegate to the existing umul128 implementation and keep only the
  // upper half of the product. The optimizer is expected to discard the
  // instructions that compute the unused low half.
  uint64_t high;
  (void) umul128(a, b, &high);
  return high;
}

// On x86 platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.
//
// E.g.:
// https://bugs.llvm.org/show_bug.cgi?id=37932
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
//
// The functions here perform division-by-constant using multiplications
// in the same way as 64-bit compilers would do.
//
// NB:
// The multipliers and shift values are the ones generated by clang x64
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have an upstream bug that we can cite here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For clang I just found this: https://bugs.llvm.org/show_bug.cgi?id=37932
I haven't found something similar for gcc or msvc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've filed VSO#664754 "On x86, division by 64-bit constants could be dramatically more efficient" in MSVC's internal database, with your code as a minimal repro. (This is separate from VSO#634761 "C2 emits code that's 34.3% slower than Clang/LLVM for the Ryu algorithm" which is the overall bug for Ryu codegen quality.)

// for expressions like x/5, x/10, etc.

static inline uint64_t div5(const uint64_t x) {
  // x / 5 via fixed-point reciprocal: multiply by ceil(2^66 / 5) and
  // take the top bits (high 64 of the product, then shift by 2).
  const uint64_t hi = umulh(x, 0xCCCCCCCCCCCCCCCDu);
  return hi >> 2;
}

static inline uint64_t div10(const uint64_t x) {
  // x / 10 via fixed-point reciprocal: same multiplier as div5, one
  // extra shift (high 64 bits of the product, then shift by 3).
  const uint64_t hi = umulh(x, 0xCCCCCCCCCCCCCCCDu);
  return hi >> 3;
}

static inline uint64_t div100(const uint64_t x) {
  // x / 100: first divide by 4 with a shift, then divide by 25 via a
  // fixed-point reciprocal multiply (high 64 bits, then shift by 2).
  const uint64_t quarter = x >> 2;
  const uint64_t hi = umulh(quarter, 0x28F5C28F5C28F5C3u);
  return hi >> 2;
}

static inline uint64_t div100000000(const uint64_t x) {
  // x / 10^8 via fixed-point reciprocal multiply (high 64 bits of the
  // product, then shift by 26), matching clang x64 codegen for x / 1e8.
  const uint64_t hi = umulh(x, 0xABCC77118461CEFDu);
  return hi >> 26;
}

#else

static inline uint64_t div5(const uint64_t x) {
  // Plain division; 64-bit compilers already lower division by a
  // constant to a multiply-and-shift sequence on their own.
  const uint64_t quotient = x / 5;
  return quotient;
}

static inline uint64_t div10(const uint64_t x) {
  // Plain division; the optimizer handles the constant-divisor case.
  const uint64_t quotient = x / 10;
  return quotient;
}

static inline uint64_t div100(const uint64_t x) {
  // Plain division; the optimizer handles the constant-divisor case.
  const uint64_t quotient = x / 100;
  return quotient;
}

static inline uint64_t div100000000(const uint64_t x) {
  // Plain division by 10^8; the optimizer handles the constant divisor.
  const uint64_t quotient = x / 100000000;
  return quotient;
}

#endif

#endif // RYU_D2S_INTRINSICS_H