Skip to content

Commit

Permalink
Fixes the is_contiguous check on digits iterators.
Browse files Browse the repository at this point in the history
This no longer does the check on the byte which is the correct check.

Fixes #157
  • Loading branch information
Alexhuszagh committed Sep 20, 2024
1 parent 2cf88c8 commit 0c9456a
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 62 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- A correctness regression.
- Regressions where parsing digit separators without the `compact` panicked.

## [1.0.0] 2024-09-14

Expand Down
2 changes: 2 additions & 0 deletions ci/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ test() {
cargo test $test_features $DOCTESTS
cargo test $test_features $DOCTESTS --release
cargo test --features=radix,format,compact $DOCTESTS --release
# NOTE: This tests a regressions, related to #96.
cargo test --features=format $DOCTESTS
}

# Dry-run bench target
Expand Down
162 changes: 100 additions & 62 deletions lexical-util/src/skip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,21 +141,24 @@ macro_rules! is_ilt {
/// Peeks the next token that's not a digit separator.
macro_rules! peek_1 {
($self:ident, $is_skip:ident) => {{
// This will consume consecutive digit separators.
let value = $self.byte.slc.get($self.byte.index)?;
// This will consume a single, non-consecutive digit separators.
let mut index = $self.cursor();
let buffer = $self.get_buffer();
let value = buffer.get(index)?;
let is_digit_separator = $self.is_digit_separator(*value);
if is_digit_separator && $is_skip!($self) {
// Have a skippable digit separator: keep incrementing until we find
// a non-digit separator character. Don't need any complex checks
// here, since we've already done them above.
let mut index = $self.byte.index + 1;
while index < $self.buffer_length()
&& $self.byte.slc.get(index).map_or(false, |&x| $self.is_digit_separator(x))
index += 1;
while index < buffer.len()
&& buffer.get(index).map_or(false, |&x| $self.is_digit_separator(x))
{
index += 1;
}
$self.byte.index = index;
$self.byte.slc.get($self.byte.index)
// SAFETY: Safe since `index < buffer.len()`.
unsafe { $self.set_cursor(index) };
buffer.get(index)
} else {
// Have 1 of 2 conditions:
// 1. A non-digit separator character.
Expand All @@ -170,20 +173,23 @@ macro_rules! peek_1 {
macro_rules! peek_n {
($self:ident, $is_skip:ident) => {{
// This will consume consecutive digit separators.
let value = $self.byte.slc.get($self.byte.index)?;
let mut index = $self.cursor();
let buffer = $self.get_buffer();
let value = buffer.get(index)?;
let is_digit_separator = $self.is_digit_separator(*value);
if is_digit_separator && $is_skip!($self) {
// Have a skippable digit separator: keep incrementing until we find
// a non-digit separator character. Don't need any complex checks
// here, since we've already done them above.
let mut index = $self.byte.index + 1;
while index < $self.byte.slc.len()
&& $self.byte.slc.get(index).map_or(false, |&x| $self.is_digit_separator(x))
index += 1;
while index < buffer.len()
&& buffer.get(index).map_or(false, |&x| $self.is_digit_separator(x))
{
index += 1;
}
$self.byte.index = index;
$self.byte.slc.get($self.byte.index)
// SAFETY: Safe since `index < buffer.len()`.
unsafe { $self.set_cursor(index) };
buffer.get(index)
} else {
// Have 1 of 2 conditions:
// 1. A non-digit separator character.
Expand Down Expand Up @@ -371,11 +377,12 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> {
/// For this reason, since it's easy to get wrong, we only
/// expose it to our `DigitsIterator`s and nothing else.
///
/// This is only ever used for contiguous arrays.
/// This is only ever used for contiguous iterators. However,
/// it's not guaranteed to only valid for our contiguous
/// iterators.
#[inline(always)]
const unsafe fn from_parts(slc: &'a [u8], index: usize) -> Self {
debug_assert!(index <= slc.len());
debug_assert!(Self::IS_CONTIGUOUS);
Self {
slc,
index,
Expand Down Expand Up @@ -414,6 +421,58 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> {
byte: self,
}
}

/// Internal implementation that handles if it's contiguous.
///
/// # Safety
///
/// Safe if the buffer has at least `N` elements.
#[inline(always)]
unsafe fn step_by_unchecked_impl(&mut self, count: usize, is_contiguous: bool) {
// NOTE: THIS IS NOT a duplicate calling `step_by_unchecked` from a digits
// iterator on the byte, since they can have different contiguousness.
if is_contiguous {
// Contiguous, can skip most of these checks.
debug_assert!(self.as_slice().len() >= count);
} else {
// Since this isn't contiguous, it only works
// if the value is in the range `[0, 1]`.
// We also need to make sure the **current** value
// isn't a digit separator.
let format = NumberFormat::<{ FORMAT }> {};
debug_assert!(self.as_slice().len() >= count);
debug_assert!(count == 0 || count == 1);
debug_assert!(
count == 0 || self.slc.get(self.index) != Some(&format.digit_separator())
);
}
self.index += count;
if !is_contiguous {
// Only increment the count if it's not contiguous, otherwise,
// this is an unnecessary performance penalty.
self.count += count;
}
}

/// Internal implementation that handles if it's contiguous.
///
/// If it's contiguous or not does not affect the safety guarantees,
/// however, it can affect correctness.
///
/// # Safety
///
/// Safe if the buffer has at least `size_of::<V>` elements.
#[inline(always)]
unsafe fn peek_many_unchecked_impl<V>(&self, is_contiguous: bool) -> V {
// NOTE: THIS IS NOT a duplicate calling `peek_many_unchecked` from a digits
// iterator on the byte, since they can have different contiguousness.
debug_assert!(is_contiguous);
debug_assert!(self.as_slice().len() >= mem::size_of::<V>());

let slc = self.as_slice();
// SAFETY: safe as long as the slice has at least count elements.
unsafe { ptr::read_unaligned::<V>(slc.as_ptr() as *const _) }
}
}

unsafe impl<'a, const FORMAT: u128> Iter<'a> for Bytes<'a, FORMAT> {
Expand Down Expand Up @@ -456,37 +515,14 @@ unsafe impl<'a, const FORMAT: u128> Iter<'a> for Bytes<'a, FORMAT> {

#[inline(always)]
unsafe fn step_by_unchecked(&mut self, count: usize) {
if Self::IS_CONTIGUOUS {
// Contiguous, can skip most of these checks.
debug_assert!(self.as_slice().len() >= count);
} else {
// Since this isn't contiguous, it only works
// if the value is in the range `[0, 1]`.
// We also need to make sure the **current** value
// isn't a digit separator.
let format = NumberFormat::<{ FORMAT }> {};
debug_assert!(self.as_slice().len() >= count);
debug_assert!(count == 0 || count == 1);
debug_assert!(
count == 0 || self.slc.get(self.index) != Some(&format.digit_separator())
);
}
self.index += count;
if !Self::IS_CONTIGUOUS {
// Only increment the count if it's not contiguous, otherwise,
// this is an unnecessary performance penalty.
self.count += count;
}
// SAFETY: Safe if the buffer has at least `N` elements.
unsafe { self.step_by_unchecked_impl(count, Self::IS_CONTIGUOUS) }
}

#[inline(always)]
unsafe fn peek_many_unchecked<V>(&self) -> V {
debug_assert!(Self::IS_CONTIGUOUS);
debug_assert!(self.as_slice().len() >= mem::size_of::<V>());

let slc = self.as_slice();
// SAFETY: safe as long as the slice has at least count elements.
unsafe { ptr::read_unaligned::<V>(slc.as_ptr() as *const _) }
// SAFETY: Safe if the buffer has at least `size_of::<V>` elements.
unsafe { self.peek_many_unchecked_impl(Self::IS_CONTIGUOUS) }
}
}

Expand Down Expand Up @@ -619,16 +655,14 @@ macro_rules! skip_iterator_iter_base {

#[inline(always)]
unsafe fn step_by_unchecked(&mut self, count: usize) {
debug_assert!(self.as_slice().len() >= count);
// SAFETY: safe as long as `slc.len() >= count`.
unsafe { self.byte.step_by_unchecked(count) }
// SAFETY: Safe if the buffer has at least `N` elements.
unsafe { self.byte.step_by_unchecked_impl(count, Self::IS_CONTIGUOUS) }
}

#[inline(always)]
unsafe fn peek_many_unchecked<V>(&self) -> V {
debug_assert!(self.as_slice().len() >= mem::size_of::<V>());
// SAFETY: safe as long as the slice has at least count elements.
unsafe { self.byte.peek_many_unchecked() }
// SAFETY: Safe if the buffer has at least `size_of::<V>` elements.
unsafe { self.byte.peek_many_unchecked_impl(Self::IS_CONTIGUOUS) }
}
};
}
Expand Down Expand Up @@ -657,23 +691,27 @@ macro_rules! skip_iterator_bytesiter_impl {
#[inline(always)]
fn peek(&mut self) -> Option<<Self as Iterator>::Item> {
let format = NumberFormat::<{ FORMAT }> {};
const IL: u128 = flags::$i | flags::$l;
const IT: u128 = flags::$i | flags::$t;
const LT: u128 = flags::$l | flags::$t;
const ILT: u128 = flags::$i | flags::$l | flags::$t;
const IC: u128 = flags::$i | flags::$c;
const LC: u128 = flags::$l | flags::$c;
const TC: u128 = flags::$t | flags::$c;
const ILC: u128 = IL | flags::$c;
const ITC: u128 = IT | flags::$c;
const LTC: u128 = LT | flags::$c;
const ILTC: u128 = ILT | flags::$c;
const I: u128 = flags::$i;
const L: u128 = flags::$l;
const T: u128 = flags::$t;
const C: u128 = flags::$c;
const IL: u128 = I | L;
const IT: u128 = I | T;
const LT: u128 = L | T;
const ILT: u128 = I | L | T;
const IC: u128 = I | C;
const LC: u128 = L | C;
const TC: u128 = T | C;
const ILC: u128 = IL | C;
const ITC: u128 = IT | C;
const LTC: u128 = LT | C;
const ILTC: u128 = ILT | C;

match format.digit_separator_flags() & flags::$mask {
0 => peek_noskip!(self),
flags::$i => peek_i!(self),
flags::$l => peek_l!(self),
flags::$t => peek_t!(self),
I => peek_i!(self),
L => peek_l!(self),
T => peek_t!(self),
IL => peek_il!(self),
IT => peek_it!(self),
LT => peek_lt!(self),
Expand Down

0 comments on commit 0c9456a

Please sign in to comment.