Merge pull request #96 from Alexhuszagh/bigint

Implement the big-integer arithmetic algorithm.
fastfloat · Sep 14, 2021 · 3f0ba09 · 3f0ba09
2 parents 25b240a + fc0c868
commit 3f0ba09
Show file tree

Hide file tree

Showing 8 changed files with 1,140 additions and 189 deletions.
diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
@@ -91,16 +91,20 @@ CXX20_CONSTEXPR fastfloat_really_inline bool is_made_of_eight_digits_fast(const
   return is_made_of_eight_digits_fast(read_u64(chars));
 }
 
+typedef span<const char> byte_span;
+
 struct parsed_number_string {
-  int64_t exponent;
-  uint64_t mantissa;
-  const char *lastmatch;
-  bool negative;
-  bool valid;
-  bool too_many_digits;
+  int64_t exponent{0};
+  uint64_t mantissa{0};
+  const char *lastmatch{nullptr};
+  bool negative{false};
+  bool valid{false};
+  bool too_many_digits{false};
+  // contains the range of the significant digits
+  byte_span integer{};  // non-nullable
+  byte_span fraction{}; // nullable
 };
 
-
 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
 CXX20_CONSTEXPR fastfloat_really_inline
@@ -125,6 +129,10 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
+  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
+    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
   while ((p != pend) && is_integer(*p)) {
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
@@ -134,24 +142,24 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
   }
   const char *const end_of_integer_part = p;
   int64_t digit_count = int64_t(end_of_integer_part - start_digits);
+  answer.integer = byte_span(start_digits, size_t(digit_count));
   int64_t exponent = 0;
   if ((p != pend) && (*p == decimal_point)) {
     ++p;
-  // Fast approach only tested under little endian systems
-  if ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
-    p += 8;
-    if ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
+    const char* before = p;
+    // can occur at most twice without overflowing, but let it occur more, since
+    // for integers with many digits, digit parsing is the primary bottleneck.
+    while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
       i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
       p += 8;
     }
-  }
     while ((p != pend) && is_integer(*p)) {
       uint8_t digit = uint8_t(*p - '0');
       ++p;
       i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
     }
-    exponent = end_of_integer_part + 1 - p;
+    exponent = before - p;
+    answer.fraction = byte_span(before, size_t(p - before));
     digit_count -= exponent;
   }
   // we must have encountered at least one integer!
@@ -179,7 +187,7 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
     } else {
       while ((p != pend) && is_integer(*p)) {
         uint8_t digit = uint8_t(*p - '0');
-        if (exp_number < 0x10000) {
+        if (exp_number < 0x10000000) {
           exp_number = 10 * exp_number + digit;
         }
         ++p;
@@ -212,23 +220,26 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
     if (digit_count > 19) {
       answer.too_many_digits = true;
       // Let us start again, this time, avoiding overflows.
+      // We don't need to check if is_integer, since we use the
+      // pre-tokenized spans from above.
       i = 0;
-      p = start_digits;
+      p = answer.integer.ptr;
+      const char* int_end = p + answer.integer.len();
       const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
-      while((i < minimal_nineteen_digit_integer) && (p != pend) && is_integer(*p)) {
+      while((i < minimal_nineteen_digit_integer) && (p != int_end)) {
         i = i * 10 + uint64_t(*p - '0');
         ++p;
       }
       if (i >= minimal_nineteen_digit_integer) { // We have a big integers
         exponent = end_of_integer_part - p + exp_number;
       } else { // We have a value with a fractional component.
-          p++; // skip the dot
-          const char *first_after_period = p;
-          while((i < minimal_nineteen_digit_integer) && (p != pend) && is_integer(*p)) {
+          p = answer.fraction.ptr;
+          const char* frac_end = p + answer.fraction.len();
+          while((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
             i = i * 10 + uint64_t(*p - '0');
             ++p;
           }
-          exponent = first_after_period - p + exp_number;
+          exponent = answer.fraction.ptr - p + exp_number;
       }
       // We have now corrected both exponent and i, to a truncated value
     }
@@ -238,108 +249,6 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_
   return answer;
 }
 
-
-// This should always succeed since it follows a call to parse_number_string
-// This function could be optimized. In particular, we could stop after 19 digits
-// and try to bail out. Furthermore, we should be able to recover the computed
-// exponent from the pass in parse_number_string.
-CXX20_CONSTEXPR fastfloat_really_inline decimal parse_decimal(const char *p, const char *pend, parse_options options) noexcept {
-  const char decimal_point = options.decimal_point;
-
-  decimal answer;
-  answer.num_digits = 0;
-  answer.decimal_point = 0;
-  answer.truncated = false;
-  answer.negative = (*p == '-');
-  if (*p == '-') { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
-    ++p;
-  }
-  // skip leading zeroes
-  while ((p != pend) && (*p == '0')) {
-    ++p;
-  }
-  while ((p != pend) && is_integer(*p)) {
-    if (answer.num_digits < max_digits) {
-      answer.digits[answer.num_digits] = uint8_t(*p - '0');
-    }
-    answer.num_digits++;
-    ++p;
-  }
-  if ((p != pend) && (*p == decimal_point)) {
-    ++p;
-    const char *first_after_period = p;
-    // if we have not yet encountered a zero, we have to skip it as well
-    if(answer.num_digits == 0) {
-      // skip zeros
-      while ((p != pend) && (*p == '0')) {
-       ++p;
-      }
-    }
-    // We expect that this loop will often take the bulk of the running time
-    // because when a value has lots of digits, these digits often
-    while ((std::distance(p, pend) >= 8) && (answer.num_digits + 8 < max_digits)) {
-      uint64_t val = read_u64(p);
-      if(! is_made_of_eight_digits_fast(val)) { break; }
-      // We have eight digits, process them in one go!
-      val -= 0x3030303030303030;
-      write_u64(answer.digits + answer.num_digits, val);
-      answer.num_digits += 8;
-      p += 8;
-    }
-    while ((p != pend) && is_integer(*p)) {
-      if (answer.num_digits < max_digits) {
-        answer.digits[answer.num_digits] = uint8_t(*p - '0');
-      }
-      answer.num_digits++;
-      ++p;
-    }
-    answer.decimal_point = int32_t(first_after_period - p);
-  }
-  // We want num_digits to be the number of significant digits, excluding
-  // leading *and* trailing zeros! Otherwise the truncated flag later is
-  // going to be misleading.
-  if(answer.num_digits > 0) {
-    // We potentially need the answer.num_digits > 0 guard because we
-    // prune leading zeros. So with answer.num_digits > 0, we know that
-    // we have at least one non-zero digit.
-    const char *preverse = p - 1;
-    int32_t trailing_zeros = 0;
-    while ((*preverse == '0') || (*preverse == decimal_point)) {
-      if(*preverse == '0') { trailing_zeros++; };
-      --preverse;
-    }
-    answer.decimal_point += int32_t(answer.num_digits);
-    answer.num_digits -= uint32_t(trailing_zeros);
-  }
-  if(answer.num_digits > max_digits) {
-    answer.truncated = true;
-    answer.num_digits = max_digits;
-  }
-  if ((p != pend) && (('e' == *p) || ('E' == *p))) {
-    ++p;
-    bool neg_exp = false;
-    if ((p != pend) && ('-' == *p)) {
-      neg_exp = true;
-      ++p;
-    } else if ((p != pend) && ('+' == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
-      ++p;
-    }
-    int32_t exp_number = 0; // exponential part
-    while ((p != pend) && is_integer(*p)) {
-      uint8_t digit = uint8_t(*p - '0');
-      if (exp_number < 0x10000) {
-        exp_number = 10 * exp_number + digit;
-      }
-      ++p;
-    }
-    answer.decimal_point += (neg_exp ? -exp_number : exp_number);
-  }
-  // In very rare cases, we may have fewer than 19 digits, we want to be able to reliably
-  // assume that all digits up to max_digit_without_overflow have been initialized.
-  for(uint32_t i = answer.num_digits; i < max_digit_without_overflow; i++) { answer.digits[i] = 0; }
-
-  return answer;
-}
 } // namespace fast_float
 
 #endif