From 21fc173b6d13f4da980f232c8e6f6d954da753cb Mon Sep 17 00:00:00 2001 From: Alex Rutar Date: Tue, 19 Nov 2024 17:29:28 +0200 Subject: [PATCH] Improvements to grapheme truncation for Utf32Str(ing) This commit corrects the internal handling of grapheme truncation. Most notably, it fixes two bugs with the previous implementation of Utf32Str(ing): 1. Fixes a bug where an Ascii variant could have been returned even though the original string was not ASCII. (The converse, where a Unicode variant consists only of ASCII, is totally fine). 2. Fixes the handling of windows-style newline (i.e. `\r\n`) since these are single graphemes. Moreover, the `\r\n` grapheme is now mapped to `\n` rather than `\r`. In particular, Utf32Str(ing)s constructed from text containing windows-style newlines will result in Unicode variants, even if the string is entirely valid Ascii. --- matcher/src/chars.rs | 15 ++-- matcher/src/utf32_str.rs | 125 ++++++++++++++++++++++++++------- matcher/src/utf32_str/tests.rs | 44 ++++++++++++ 3 files changed, 154 insertions(+), 30 deletions(-) create mode 100644 matcher/src/utf32_str/tests.rs diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs index 53555f5..d13a246 100644 --- a/matcher/src/chars.rs +++ b/matcher/src/chars.rs @@ -189,10 +189,17 @@ pub(crate) enum CharClass { pub fn graphemes(text: &str) -> impl Iterator + '_ { #[cfg(feature = "unicode-segmentation")] let res = text.graphemes(true).map(|grapheme| { - grapheme - .chars() - .next() - .expect("graphemes must be non-empty") + // we need to special-case this check since `\r\n` is a single grapheme and is + // therefore the exception to the rule that normalization of a grapheme should + // map to the first character. + if grapheme == "\r\n" { + '\n' + } else { + grapheme + .chars() + .next() + .expect("graphemes must be non-empty") + } }); #[cfg(not(feature = "unicode-segmentation"))] let res = text.chars(); diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs index 8e4fd24..664dae7 100644 --- a/matcher/src/utf32_str.rs +++ b/matcher/src/utf32_str.rs @@ -1,56 +1,115 @@ +#[cfg(test)] +mod tests; + use std::borrow::Cow; use std::ops::{Bound, RangeBounds}; use std::{fmt, slice}; +use memchr::memmem; + use crate::chars; -/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching. +/// Check if a given string can be represented internally as the `Ascii` variant in a +/// [`Utf32String`] or a [`Utf32Str`]. +/// +/// This returns true if the string is ASCII and does not contain a windows-style newline +/// `'\r'`. +/// The additional carriage return check is required since even for strings consisting only +/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme. +#[inline] +fn has_ascii_graphemes(string: &str) -> bool { + string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none() +} + +/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching. +/// +/// This is mostly intended as an internal string type, but some methods are exposed for +/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string +/// using one of its `From` constructors for string types `T` or from the +/// [`Utf32Str::new`] method. +/// +/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string +/// slice. +/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical +/// to the original string. +/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string. +/// +/// Since `Utf32Str(ing)`s variants may be constructed directly, you **must not** make these +/// assumptions when handling `Utf32Str(ing)`s of unknown origin. +/// +/// ## Caveats +/// Despite the name, this type is quite far from being a true string type. Here are some +/// examples demonstrating this. /// -/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching -/// operates on codepoints (it should operate on graphemes but that's too much -/// hassle to deal with). We want to quickly iterate these codepoints between -/// (up to 5 times) during matching. +/// ### String conversions are not round-trip +/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u + +/// COMBINING_DIAERESIS`), the trailing codepoints are truncated. +/// ``` +/// # use nucleo_matcher::Utf32String; +/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u"); +/// ``` +/// +/// ### Indexing is done by grapheme +/// Indexing into a string is done by grapheme rather than by codepoint. +/// ``` +/// # use nucleo_matcher::Utf32String; +/// assert!(Utf32String::from("au\u{0308}").len() == 2); +/// ``` +/// +/// ### A `Unicode` variant may be produced by all-ASCII characters. +/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme, +/// strings containing `\r\n` will still result in a `Unicode` variant. +/// ``` +/// # use nucleo_matcher::Utf32String; +/// let s = Utf32String::from("\r\n"); +/// assert!(!s.slice(..).is_ascii()); +/// assert!(s.len() == 1); +/// assert!(s.slice(..).get(0) == '\n'); +/// ``` +/// +/// ## Design rationale +/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching +/// operates on codepoints (ideally, it should operate on graphemes but that's too +/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5 +/// times) during matching. /// /// Doing codepoint segmentation on the fly not only blows trough the cache -/// (lookuptables and Icache) but also has nontrivial runtime compared to the -/// matching itself. Furthermore there are a lot of exta optimizations available -/// for ascii only text (but checking during each match has too much overhead). +/// (lookup tables and I-cache) but also has nontrivial runtime compared to the +/// matching itself. Furthermore there are many extra optimizations available +/// for ASCII only text, but checking each match has too much overhead. /// -/// Ofcourse this comes at exta memory cost as we usually still need the ut8 -/// encoded variant for rendering. In the (dominant) case of ascii-only text +/// Of course, this comes at extra memory cost as we usually still need the UTF-8 +/// encoded variant for rendering. In the (dominant) case of ASCII-only text /// we don't require a copy. Furthermore fuzzy matching usually is applied while /// the user is typing on the fly so the same item is potentially matched many -/// times (making the the upfront cost more worth it). That means that its -/// basically always worth it to presegment the string. +/// times (making the the up-front cost more worth it). That means that its +/// basically always worth it to pre-segment the string. /// /// For usecases that only match (a lot of) strings once its possible to keep -/// char buffer around that is filled with the presegmented chars +/// char buffer around that is filled with the presegmented chars. /// /// Another advantage of this approach is that the matcher will naturally -/// produce char indices (instead of utf8 offsets) anyway. With a +/// produce grapheme indices (instead of utf8 offsets) anyway. With a /// codepoint basic representation like this the indices can be used /// directly #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] pub enum Utf32Str<'a> { /// A string represented as ASCII encoded bytes. - /// Correctness invariant: must only contain valid ASCII (<=127) + /// Correctness invariant: must only contain valid ASCII (`<= 127`) Ascii(&'a [u8]), /// A string represented as an array of unicode codepoints (basically UTF-32). Unicode(&'a [char]), } impl<'a> Utf32Str<'a> { - /// Convenience method to construct a `Utf32Str` from a normal utf8 str + /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str pub fn new(str: &'a str, buf: &'a mut Vec) -> Self { - if str.is_ascii() { + if has_ascii_graphemes(str) { Utf32Str::Ascii(str.as_bytes()) } else { buf.clear(); buf.extend(crate::chars::graphemes(str)); - if buf.iter().all(|c| c.is_ascii()) { - return Utf32Str::Ascii(str.as_bytes()); - } - Utf32Str::Unicode(&*buf) + Utf32Str::Unicode(buf) } } @@ -107,7 +166,7 @@ impl<'a> Utf32Str<'a> { } } - /// Returns the number of leading whitespaces in this string + /// Returns the number of trailing whitespaces in this string #[inline] pub(crate) fn trailing_white_space(self) -> usize { match self { @@ -144,18 +203,26 @@ impl<'a> Utf32Str<'a> { } } - /// Returns whether this string only contains ascii text. + /// Returns whether this string only contains graphemes which are single ASCII chars. + /// + /// This is almost equivalent to the string being ASCII, except with the additional requirement + /// that the string cannot contain a windows-style newline `\r\n` which is treated as a single + /// grapheme. pub fn is_ascii(self) -> bool { matches!(self, Utf32Str::Ascii(_)) } - /// Returns the `n`th character in this string. + /// Returns the `n`th character in this string, zero-indexed pub fn get(self, n: u32) -> char { match self { Utf32Str::Ascii(bytes) => bytes[n as usize] as char, Utf32Str::Unicode(codepoints) => codepoints[n as usize], } } + + /// Returns the last character in this string. + /// + /// Panics if the string is empty. pub(crate) fn last(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, @@ -163,6 +230,9 @@ impl<'a> Utf32Str<'a> { } } + /// Returns the first character in this string. + /// + /// Panics if the string is empty. pub(crate) fn first(self) -> char { match self { Utf32Str::Ascii(bytes) => bytes[0] as char, @@ -204,6 +274,7 @@ pub enum Chars<'a> { Ascii(slice::Iter<'a, u8>), Unicode(slice::Iter<'a, char>), } + impl Iterator for Chars<'_> { type Item = char; @@ -226,6 +297,8 @@ impl DoubleEndedIterator for Chars<'_> { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] /// An owned version of [`Utf32Str`]. +/// +/// See the API documentation for [`Utf32Str`] for more detail. pub enum Utf32String { /// A string represented as ASCII encoded bytes. /// Correctness invariant: must only contain valid ASCII (<=127) @@ -307,7 +380,7 @@ impl Utf32String { impl From<&str> for Utf32String { #[inline] fn from(value: &str) -> Self { - if value.is_ascii() { + if has_ascii_graphemes(value) { Self::Ascii(value.to_owned().into_boxed_str()) } else { Self::Unicode(chars::graphemes(value).collect()) @@ -317,7 +390,7 @@ impl From<&str> for Utf32String { impl From> for Utf32String { fn from(value: Box) -> Self { - if value.is_ascii() { + if has_ascii_graphemes(&value) { Self::Ascii(value) } else { Self::Unicode(chars::graphemes(&value).collect()) diff --git a/matcher/src/utf32_str/tests.rs b/matcher/src/utf32_str/tests.rs new file mode 100644 index 0000000..a38c887 --- /dev/null +++ b/matcher/src/utf32_str/tests.rs @@ -0,0 +1,44 @@ +use crate::{Utf32Str, Utf32String}; + +#[test] +fn test_utf32str_ascii() { + /// Helper function for testing + fn expect_ascii(src: &str, is_ascii: bool) { + let mut buffer = Vec::new(); + assert!(Utf32Str::new(src, &mut buffer).is_ascii() == is_ascii); + assert!(Utf32String::from(src).slice(..).is_ascii() == is_ascii); + assert!(Utf32String::from(src.to_owned()).slice(..).is_ascii() == is_ascii); + } + + // ascii + expect_ascii("", true); + expect_ascii("a", true); + expect_ascii("a\nb", true); + expect_ascii("\n\r", true); + + // not ascii + expect_ascii("aü", false); + expect_ascii("au\u{0308}", false); + + // windows-style newline + expect_ascii("a\r\nb", false); + expect_ascii("ü\r\n", false); + expect_ascii("\r\n", false); +} + +#[test] +fn test_grapheme_truncation() { + // ascii is preserved + let s = Utf32String::from("ab"); + assert_eq!(s.slice(..).get(0), 'a'); + assert_eq!(s.slice(..).get(1), 'b'); + + // windows-style newline is truncated to '\n' + let s = Utf32String::from("\r\n"); + assert_eq!(s.slice(..).get(0), '\n'); + + // normal graphemes are truncated to the first character + let s = Utf32String::from("u\u{0308}\r\n"); + assert_eq!(s.slice(..).get(0), 'u'); + assert_eq!(s.slice(..).get(1), '\n'); +}