Skip to content

Commit

Permalink
Correct first-byte computation for interval
Browse files Browse the repository at this point in the history
When computing a predicate to match the first byte, we incorrectly
assumed that the set of first bytes of a closed interval (in UTF8) are
contiguous. But of course this is false. Reimplement this algorithm and
add tests.

Fixes #73
  • Loading branch information
ridiculousfish committed Nov 19, 2023
1 parent df9730f commit 4858a1b
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 31 deletions.
2 changes: 1 addition & 1 deletion src/bytesearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ impl ByteSet for AsciiBitmap {
}

/// A bitmap covering all bytes.
#[derive(Default, Copy, Clone)]
#[derive(Default, Copy, Clone, PartialEq, Eq)]
#[repr(align(4))]
pub struct ByteBitmap([u16; 16]);

Expand Down
30 changes: 3 additions & 27 deletions src/startpredicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@ use crate::codepointset;
use crate::insn::StartPredicate;
use crate::ir;
use crate::ir::Node;
use crate::util::utf8_first_byte;
use crate::util::{add_utf8_first_bytes_to_bitmap, utf8_first_byte};
#[cfg(not(feature = "std"))]
use alloc::{boxed::Box, vec::Vec};
use core::cmp;
use core::convert::TryInto;

/// Support for quickly finding potential match locations.
Expand All @@ -15,32 +14,9 @@ use core::convert::TryInto;
/// That is, make a list of all of the possible first bytes of every contained
/// code point, and store that in a bitmap.
fn cps_to_first_byte_bitmap(input: &codepointset::CodePointSet) -> Box<ByteBitmap> {
// Rather than walking over all contained codepoints, which could be very
// expensive, we perform searches over all bytes.
// Note that UTF-8 first-bytes increase monotone and contiguously.
// Increasing a code point will either increase the first byte by 1 or 0.
let mut bitmap = Box::<ByteBitmap>::default();
let mut ivs = input.intervals();
for target_byte in 0..=255 {
let search = ivs.binary_search_by(|iv| {
let left_byte = utf8_first_byte(iv.first);
let right_byte = utf8_first_byte(iv.last);
debug_assert!(left_byte <= right_byte);
if left_byte <= target_byte && target_byte <= right_byte {
cmp::Ordering::Equal
} else if left_byte > target_byte {
cmp::Ordering::Greater
} else {
debug_assert!(right_byte < target_byte, "Binary search logic is wrong");
cmp::Ordering::Less
}
});
if let Ok(idx) = search {
bitmap.set(target_byte);
// Since our bytes are strictly increasing, previous intervals will never hold
// the next byte. Help out our binary search by removing previous intervals.
ivs = &ivs[idx..];
}
for iv in input.intervals() {
add_utf8_first_bytes_to_bitmap(*iv, &mut bitmap);
}
bitmap
}
Expand Down
2 changes: 1 addition & 1 deletion src/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ mod tests {
for c in 0..=0x10FFFF {
let fc = fold(c);
if fc != c {
unfold_map.entry(fc).or_insert_with(Vec::new).push(c);
unfold_map.entry(fc).or_default().push(c);
}
}

Expand Down
48 changes: 46 additions & 2 deletions src/util.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use crate::bytesearch::ByteBitmap;
use crate::codepointset::Interval;
use crate::codepointset::CODE_POINT_MAX;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
Expand Down Expand Up @@ -89,6 +91,7 @@ pub fn to_char_sat(c: u32) -> char {

/// \return the first byte of a UTF-8 encoded code point.
/// We do not use char because we don't want to deal with failing on surrogates.
#[inline(always)]
pub fn utf8_first_byte(cp: u32) -> u8 {
debug_assert!(cp <= CODE_POINT_MAX);
if cp < 0x80 {
Expand All @@ -106,6 +109,25 @@ pub fn utf8_first_byte(cp: u32) -> u8 {
}
}

/// Add all of the first bytes of a code point interval to a byte bitmap.
pub fn add_utf8_first_bytes_to_bitmap(interval: Interval, bitmap: &mut ByteBitmap) {
// Note this is an inclusive interval.
let Interval { first, last } = interval;
let ranges = [
(first, last.min(0x7F)), // 1 byte range
(first.max(0x80), last.min(0x7FF)), // 2 byte range
(first.max(0x800), last.min(0xFFFF)), // 3 byte range
(first.max(0x10000), last), // 4 byte range
];
for (first, last) in ranges.into_iter() {
if first <= last {
for byte in utf8_first_byte(first)..=utf8_first_byte(last) {
bitmap.set(byte);
}
}
}
}

pub trait SliceHelp {
type Item;

Expand Down Expand Up @@ -184,9 +206,10 @@ pub fn utf8_w4(b0: u8, b1: u8, b2: u8, b3: u8) -> u32 {

#[cfg(test)]
mod tests {
use super::{add_utf8_first_bytes_to_bitmap, utf8_first_byte, ByteBitmap, Interval, SliceHelp};

#[test]
fn ranges() {
use super::SliceHelp;
let vals = [0, 1, 2, 3, 4, 4, 4, 7, 8, 9, 9];
let fast_er = |needle: usize| vals.equal_range_by(|v| v.cmp(&needle));
let slow_er = |needle: usize| {
Expand Down Expand Up @@ -228,7 +251,7 @@ mod tests {
let mut buff = [0; 4];
let s = core::char::from_u32(cp).unwrap().encode_utf8(&mut buff);
let bytes = s.as_bytes();
assert_eq!(bytes[0], super::utf8_first_byte(cp));
assert_eq!(bytes[0], utf8_first_byte(cp));

match bytes.len() {
1 => assert_eq!(bytes[0] as u32, cp),
Expand All @@ -239,4 +262,25 @@ mod tests {
}
}
}

#[test]
fn test_add_utf8_first_bytes_to_bitmap() {
let locs = [
0, 1, 2, 3, 17, 25, 34, 99, 127, 128, 150, 255, 256, 257, 511, 512, 513, 0x7FF, 0x800,
0x905, 0xA123, 0xFF0F, 0xFFFF, 0x10000, 0x10001, 0x1FFFF, 0x20001, 0x2FFF7, 0x50001,
0x100000, 0x10FFFE, 0x10FFFF,
];
for (idx, &first) in locs.iter().enumerate() {
let mut expected = ByteBitmap::default();
for &last in locs[idx..].iter() {
assert!(first <= last);
for cp in first..=last {
expected.set(utf8_first_byte(cp));
}
let mut bitmap = ByteBitmap::default();
add_utf8_first_bytes_to_bitmap(Interval { first, last }, &mut bitmap);
assert_eq!(bitmap, expected);
}
}
}
}

0 comments on commit 4858a1b

Please sign in to comment.