Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adopting Alexhuszagh's decimal comparison approach for long input strings #104

Merged
merged 2 commits into from
Sep 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 32 additions & 123 deletions include/fast_float/ascii_number.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,16 +91,20 @@ CXX20_CONSTEXPR fastfloat_really_inline bool is_made_of_eight_digits_fast(const
return is_made_of_eight_digits_fast(read_u64(chars));
}

typedef span<const char> byte_span;

// Outcome of tokenizing a number string; parse_number_string fills it in.
// Every member carries a default member initializer, so a default-constructed
// value has no indeterminate fields.
struct parsed_number_string {
  // NOTE(review): the assignments to the next five members are not visible
  // in this excerpt; their descriptions are inferred from the names -- confirm.
  int64_t exponent{0};            // presumably the decimal exponent paired with `mantissa`
  uint64_t mantissa{0};           // presumably the digits accumulated as an integer
  const char *lastmatch{nullptr}; // presumably where parsing stopped in the input
  bool negative{false};           // presumably true when the input starts with '-'
  bool valid{false};              // presumably true when a number was recognized
  // set by parse_number_string when it counted more than 19 digits
  bool too_many_digits{false};
  // contains the range of the significant digits
  // integer:  the digits before the decimal point
  // fraction: the digits after the decimal point; only assigned when the
  //           input contains a decimal point, otherwise left default-constructed
  byte_span integer{};  // non-nullable
  byte_span fraction{}; // nullable
};


// Assuming that you use no more than 19 digits, this will
// parse an ASCII string.
CXX20_CONSTEXPR fastfloat_really_inline
Expand All @@ -125,6 +129,10 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_

uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)

while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
p += 8;
}
while ((p != pend) && is_integer(*p)) {
// a multiplication by 10 is cheaper than an arbitrary integer
// multiplication
Expand All @@ -134,24 +142,24 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
}
const char *const end_of_integer_part = p;
int64_t digit_count = int64_t(end_of_integer_part - start_digits);
answer.integer = byte_span(start_digits, size_t(digit_count));
int64_t exponent = 0;
if ((p != pend) && (*p == decimal_point)) {
++p;
// Fast approach only tested under little endian systems
if ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
p += 8;
if ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
const char* before = p;
// can occur at most twice without overflowing, but let it occur more, since
// for integers with many digits, digit parsing is the primary bottleneck.
while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
p += 8;
}
}
while ((p != pend) && is_integer(*p)) {
uint8_t digit = uint8_t(*p - '0');
++p;
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
}
exponent = end_of_integer_part + 1 - p;
exponent = before - p;
answer.fraction = byte_span(before, size_t(p - before));
digit_count -= exponent;
}
// we must have encountered at least one integer!
Expand Down Expand Up @@ -179,7 +187,7 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
} else {
while ((p != pend) && is_integer(*p)) {
uint8_t digit = uint8_t(*p - '0');
if (exp_number < 0x10000) {
if (exp_number < 0x10000000) {
exp_number = 10 * exp_number + digit;
}
++p;
Expand Down Expand Up @@ -212,23 +220,26 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
if (digit_count > 19) {
answer.too_many_digits = true;
// Let us start again, this time, avoiding overflows.
// We don't need to check if is_integer, since we use the
// pre-tokenized spans from above.
i = 0;
p = start_digits;
p = answer.integer.ptr;
const char* int_end = p + answer.integer.len();
const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
while((i < minimal_nineteen_digit_integer) && (p != pend) && is_integer(*p)) {
while((i < minimal_nineteen_digit_integer) && (p != int_end)) {
i = i * 10 + uint64_t(*p - '0');
++p;
}
if (i >= minimal_nineteen_digit_integer) { // We have a big integers
exponent = end_of_integer_part - p + exp_number;
} else { // We have a value with a fractional component.
p++; // skip the dot
const char *first_after_period = p;
while((i < minimal_nineteen_digit_integer) && (p != pend) && is_integer(*p)) {
p = answer.fraction.ptr;
const char* frac_end = p + answer.fraction.len();
while((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
i = i * 10 + uint64_t(*p - '0');
++p;
}
exponent = first_after_period - p + exp_number;
exponent = answer.fraction.ptr - p + exp_number;
}
// We have now corrected both exponent and i, to a truncated value
}
Expand All @@ -238,108 +249,6 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
return answer;
}


// Converts the characters in [p, pend) into a `decimal`: the digit values are
// stored in answer.digits (at most max_digits of them), and the sign, the
// digit count, the decimal-point position and the truncation flag are filled in.
//
//   p       : first character of the number. It is dereferenced unconditionally,
//             so the range must not be empty.
//   pend    : one past the last character that may be read.
//   options : only options.decimal_point (the decimal separator) is used here.
//
// Returns the populated `decimal`. On return the value reads
// 0.d1d2d3... x 10^decimal_point, e.g. "12.5" yields digits 1,2,5 and
// decimal_point == 2.
//
// No validation is performed here:
// this should always succeed since it follows a call to parse_number_string.
// This function could be optimized. In particular, we could stop after 19 digits
// and try to bail out. Furthermore, we should be able to recover the computed
// exponent from the pass in parse_number_string.
CXX20_CONSTEXPR fastfloat_really_inline decimal parse_decimal(const char *p, const char *pend, parse_options options) noexcept {
const char decimal_point = options.decimal_point;

decimal answer;
answer.num_digits = 0;
answer.decimal_point = 0;
answer.truncated = false;
answer.negative = (*p == '-');
if (*p == '-') { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
++p;
}
// skip leading zeroes
while ((p != pend) && (*p == '0')) {
++p;
}
// Integer part: every digit is counted in num_digits, but only the first
// max_digits digit values are actually stored.
while ((p != pend) && is_integer(*p)) {
if (answer.num_digits < max_digits) {
answer.digits[answer.num_digits] = uint8_t(*p - '0');
}
answer.num_digits++;
++p;
}
if ((p != pend) && (*p == decimal_point)) {
++p;
// Remember where the fraction starts: the distance from here to the end of
// the fraction (skipped zeros included) gives the decimal-point offset below.
const char *first_after_period = p;
// If no significant digit has been recorded yet, the zeros that directly
// follow the decimal point are still leading zeros: skip them as well.
if(answer.num_digits == 0) {
// skip zeros
while ((p != pend) && (*p == '0')) {
++p;
}
}
// We expect that this loop will often take the bulk of the running time
// when a value has lots of digits: it handles them eight at a time, as long
// as at least 8 characters remain and 8 more digits stay below max_digits.
while ((std::distance(p, pend) >= 8) && (answer.num_digits + 8 < max_digits)) {
uint64_t val = read_u64(p);
if(! is_made_of_eight_digits_fast(val)) { break; }
// We have eight digits, process them in one go!
// Subtracting 0x30 (ASCII '0') from each of the 8 bytes turns the
// characters into digit values, which are then stored with one write.
val -= 0x3030303030303030;
write_u64(answer.digits + answer.num_digits, val);
answer.num_digits += 8;
p += 8;
}
// Remaining fractional digits, one at a time; as in the integer part, digits
// beyond max_digits are counted but not stored.
while ((p != pend) && is_integer(*p)) {
if (answer.num_digits < max_digits) {
answer.digits[answer.num_digits] = uint8_t(*p - '0');
}
answer.num_digits++;
++p;
}
// Zero or negative: minus the number of characters consumed after the
// decimal point. The digit count is added to it further down.
answer.decimal_point = int32_t(first_after_period - p);
}
// We want num_digits to be the number of significant digits, excluding
// leading *and* trailing zeros! Otherwise the truncated flag later is
// going to be misleading.
if(answer.num_digits > 0) {
// We potentially need the answer.num_digits > 0 guard because we
// prune leading zeros. So with answer.num_digits > 0, we know that
// we have at least one non-zero digit.
// The backward scan below has no lower bound of its own: it relies on that
// non-zero digit to stop. The decimal separator is stepped over, not counted.
const char *preverse = p - 1;
int32_t trailing_zeros = 0;
while ((*preverse == '0') || (*preverse == decimal_point)) {
if(*preverse == '0') { trailing_zeros++; };
--preverse;
}
// The full digit count (trailing zeros included) is added to decimal_point
// before those zeros are removed from num_digits, so the value is unchanged.
answer.decimal_point += int32_t(answer.num_digits);
answer.num_digits -= uint32_t(trailing_zeros);
}
// num_digits can exceed max_digits because the scalar loops keep counting
// once the buffer is full: clamp it and record that digits were dropped.
if(answer.num_digits > max_digits) {
answer.truncated = true;
answer.num_digits = max_digits;
}
// Optional exponent: 'e' or 'E', an optional sign, then digits.
if ((p != pend) && (('e' == *p) || ('E' == *p))) {
++p;
bool neg_exp = false;
if ((p != pend) && ('-' == *p)) {
neg_exp = true;
++p;
} else if ((p != pend) && ('+' == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
++p;
}
int32_t exp_number = 0; // exponential part
while ((p != pend) && is_integer(*p)) {
uint8_t digit = uint8_t(*p - '0');
// Stop accumulating once the value reaches 0x10000 so that it stays far
// below the int32_t range; the remaining digits are still consumed.
if (exp_number < 0x10000) {
exp_number = 10 * exp_number + digit;
}
++p;
}
answer.decimal_point += (neg_exp ? -exp_number : exp_number);
}
// In very rare cases, we may have fewer than 19 digits, we want to be able to reliably
// assume that all digits up to max_digit_without_overflow have been initialized.
for(uint32_t i = answer.num_digits; i < max_digit_without_overflow; i++) { answer.digits[i] = 0; }

return answer;
}
} // namespace fast_float

#endif
Loading