From ef3da315952d1513b6a02ff05e81d64bd86a47a7 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 29 Mar 2024 11:24:17 -0700 Subject: [PATCH] Return the sibling count in ZeroTrie probe function (#4735) Follow-up to #4725 --- utils/zerotrie/src/cursor.rs | 56 ++++++++++++++++++++++++------------ utils/zerotrie/src/lib.rs | 3 +- utils/zerotrie/src/reader.rs | 15 ++++++++-- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/utils/zerotrie/src/cursor.rs b/utils/zerotrie/src/cursor.rs index cd378f68682..9a947422d60 100644 --- a/utils/zerotrie/src/cursor.rs +++ b/utils/zerotrie/src/cursor.rs @@ -2,6 +2,11 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +//! Types for walking stepwise through a trie. +//! +//! For examples, see the `.cursor()` functions +//! and the `Cursor` types in this module. + use crate::reader; use crate::ZeroAsciiIgnoreCaseTrie; use crate::ZeroTrieSimpleAscii; @@ -144,6 +149,16 @@ pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> { trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>, } +/// Information about a probed edge. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] // no need to destructure or construct this in userland +pub struct AsciiProbeResult { + /// The character's byte value between this node and its parent. + pub byte: u8, + /// The number of siblings of this node, _including itself_. + pub total_siblings: u8, +} + impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// Steps the cursor one character into the trie based on the character's byte value. 
/// @@ -241,7 +256,7 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// /// let data: &[(String, usize)] = &[ /// ("ab".to_string(), 111), - /// ("abc".to_string(), 22), + /// ("abcxyz".to_string(), 22), /// ("abde".to_string(), 333), /// ("afg".to_string(), 44), /// ]; @@ -255,8 +270,8 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// if let Some(value) = cursor.take_value() { /// break value; /// } - /// let ch = cursor.probe(0).unwrap(); - /// key.push(char::from(ch)); + /// let probe_result = cursor.probe(0).unwrap(); + /// key.push(char::from(probe_result.byte)); /// }; /// /// assert_eq!(key, "ab"); @@ -269,7 +284,7 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// # use zerotrie::ZeroTrieSimpleAscii; /// # let data: &[(String, usize)] = &[ /// # ("ab".to_string(), 111), - /// # ("abc".to_string(), 22), + /// # ("abcxyz".to_string(), 22), /// # ("abde".to_string(), 333), /// # ("afg".to_string(), 44) /// # ]; @@ -280,11 +295,11 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// // (trie built as in previous example) /// /// // Initialize the iteration at the first child of the trie. - /// let mut stack = Vec::from([(trie.cursor(), 0)]); + /// let mut stack = Vec::from([(trie.cursor(), 0, 0)]); /// let mut key = Vec::new(); /// let mut results = Vec::new(); /// loop { - /// let Some((ref mut cursor, ref mut index)) = stack.last_mut() else { + /// let Some((mut cursor, index, suffix_len)) = stack.pop() else { /// // Nothing left in the trie. /// break; /// }; @@ -294,23 +309,28 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// } /// // Now check for children of the current node. /// let mut sub_cursor = cursor.clone(); - /// if let Some(ch) = sub_cursor.probe(*index) { - /// // Found a child. Add the child to the stack, and also - /// // increment the index, so that next time we visit the - /// // current node, we check the next child. 
- /// *index += 1; - /// stack.push((sub_cursor, 0)); - /// key.push(ch); + /// if let Some(probe_result) = sub_cursor.probe(index) { + /// // Found a child. Add the current byte edge to the key. + /// key.push(probe_result.byte); + /// // Add the child to the stack, and also add back the current + /// // node if there are more siblings to visit. + /// if index + 1 < probe_result.total_siblings as usize { + /// stack.push((cursor, index + 1, suffix_len)); + /// stack.push((sub_cursor, 0, 1)); + /// } else { + /// stack.push((sub_cursor, 0, suffix_len + 1)); + /// } /// } else { - /// // No more children. Pop this node from the stack. - /// stack.pop(); - /// key.pop(); + /// // No more children. Pop this node's bytes from the key. + /// for _ in 0..suffix_len { + /// key.pop(); + /// } /// } /// } /// /// assert_eq!(&results, data); /// ``` - pub fn probe(&mut self, index: usize) -> Option { + pub fn probe(&mut self, index: usize) -> Option { reader::probe_parameterized::>(&mut self.trie.store, index) } @@ -379,7 +399,7 @@ impl<'a> ZeroAsciiIgnoreCaseTrieCursor<'a> { /// Probes the next byte in the cursor. /// /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`]. 
- pub fn probe(&mut self, index: usize) -> Option<u8> { + pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> { reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index) } diff --git a/utils/zerotrie/src/lib.rs b/utils/zerotrie/src/lib.rs index edac14da291..e4ff90c498f 100644 --- a/utils/zerotrie/src/lib.rs +++ b/utils/zerotrie/src/lib.rs @@ -56,7 +56,7 @@ extern crate alloc; mod builder; mod byte_phf; -mod cursor; +pub mod cursor; mod error; #[macro_use] mod helpers; @@ -67,7 +67,6 @@ mod serde; mod varint; mod zerotrie; -pub use crate::cursor::ZeroTrieSimpleAsciiCursor; pub use crate::zerotrie::ZeroAsciiIgnoreCaseTrie; pub use crate::zerotrie::ZeroTrie; pub use crate::zerotrie::ZeroTrieExtendedCapacity; diff --git a/utils/zerotrie/src/reader.rs b/utils/zerotrie/src/reader.rs index acf8dae2e09..df055ce7263 100644 --- a/utils/zerotrie/src/reader.rs +++ b/utils/zerotrie/src/reader.rs @@ -204,6 +204,7 @@ //! ``` use crate::byte_phf::PerfectByteHashMap; +use crate::cursor::AsciiProbeResult; use crate::helpers::*; use crate::options::*; use crate::varint::read_varint_meta2; @@ -516,7 +517,7 @@ pub(crate) fn step_parameterized<T: ZeroTrieWithOptions + ?Sized>( pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>( trie: &mut &[u8], index: usize, -) -> Option<u8> { +) -> Option<AsciiProbeResult> { // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`. // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie. // If a span node is encountered, `None` is returned later in this function. 
@@ -549,7 +550,10 @@ pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>( *trie = &[]; return None; } - return Some(*b); + return Some(AsciiProbeResult { + byte: *b, + total_siblings: 1, + }); } NodeType::Branch => { // Proceed to the branch node logic below @@ -571,6 +575,8 @@ pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>( } // Branch node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; + debug_assert!(u8::try_from(x).is_ok()); + let total_siblings = x as u8; // See comment above regarding this assertion debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); let w = w & 0x3; @@ -585,7 +591,10 @@ pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>( } else { get_branch(trie, index, x, w) }; - Some(search[index]) + Some(AsciiProbeResult { + byte: search[index], + total_siblings, + }) } /// Steps one node into the trie if the head node is a value node, returning the value.