Skip to content

Commit

Permalink
Return the sibling count in ZeroTrie probe function (#4735)
Browse files Browse the repository at this point in the history
Follow-up to #4725
  • Loading branch information
sffc authored Mar 29, 2024
1 parent f693067 commit ef3da31
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 23 deletions.
56 changes: 38 additions & 18 deletions utils/zerotrie/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Types for walking stepwise through a trie.
//!
//! For examples, see the `.cursor()` functions
//! and the `Cursor` types in this module.

use crate::reader;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrieSimpleAscii;
Expand Down Expand Up @@ -144,6 +149,16 @@ pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> {
trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>,
}

/// The outcome of probing a single edge of a trie node, as returned by the
/// cursor `probe` functions.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
// Non-exhaustive: downstream code has no need to construct or destructure
// this type, only to read its fields.
#[non_exhaustive]
pub struct AsciiProbeResult {
    /// The byte value of the character on the edge between this node and its parent.
    pub byte: u8,
    /// How many siblings this node has, counting the node _itself_.
    pub total_siblings: u8,
}

impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
/// Steps the cursor one character into the trie based on the character's byte value.
///
Expand Down Expand Up @@ -241,7 +256,7 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
///
/// let data: &[(String, usize)] = &[
/// ("ab".to_string(), 111),
/// ("abc".to_string(), 22),
/// ("abcxyz".to_string(), 22),
/// ("abde".to_string(), 333),
/// ("afg".to_string(), 44),
/// ];
Expand All @@ -255,8 +270,8 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
/// if let Some(value) = cursor.take_value() {
/// break value;
/// }
/// let ch = cursor.probe(0).unwrap();
/// key.push(char::from(ch));
/// let probe_result = cursor.probe(0).unwrap();
/// key.push(char::from(probe_result.byte));
/// };
///
/// assert_eq!(key, "ab");
Expand All @@ -269,7 +284,7 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
/// # use zerotrie::ZeroTrieSimpleAscii;
/// # let data: &[(String, usize)] = &[
/// # ("ab".to_string(), 111),
/// # ("abc".to_string(), 22),
/// # ("abcxyz".to_string(), 22),
/// # ("abde".to_string(), 333),
/// # ("afg".to_string(), 44)
/// # ];
Expand All @@ -280,11 +295,11 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
/// // (trie built as in previous example)
///
/// // Initialize the iteration at the first child of the trie.
/// let mut stack = Vec::from([(trie.cursor(), 0)]);
/// let mut stack = Vec::from([(trie.cursor(), 0, 0)]);
/// let mut key = Vec::new();
/// let mut results = Vec::new();
/// loop {
/// let Some((ref mut cursor, ref mut index)) = stack.last_mut() else {
/// let Some((mut cursor, index, suffix_len)) = stack.pop() else {
/// // Nothing left in the trie.
/// break;
/// };
Expand All @@ -294,23 +309,28 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
/// }
/// // Now check for children of the current node.
/// let mut sub_cursor = cursor.clone();
/// if let Some(ch) = sub_cursor.probe(*index) {
/// // Found a child. Add the child to the stack, and also
/// // increment the index, so that next time we visit the
/// // current node, we check the next child.
/// *index += 1;
/// stack.push((sub_cursor, 0));
/// key.push(ch);
/// if let Some(probe_result) = sub_cursor.probe(index) {
/// // Found a child. Add the current byte edge to the key.
/// key.push(probe_result.byte);
/// // Add the child to the stack, and also add back the current
/// // node if there are more siblings to visit.
/// if index + 1 < probe_result.total_siblings as usize {
/// stack.push((cursor, index + 1, suffix_len));
/// stack.push((sub_cursor, 0, 1));
/// } else {
/// stack.push((sub_cursor, 0, suffix_len + 1));
/// }
/// } else {
/// // No more children. Pop this node from the stack.
/// stack.pop();
/// key.pop();
/// // No more children. Pop this node's bytes from the key.
/// for _ in 0..suffix_len {
/// key.pop();
/// }
/// }
/// }
///
/// assert_eq!(&results, data);
/// ```
pub fn probe(&mut self, index: usize) -> Option<u8> {
pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
}

Expand Down Expand Up @@ -379,7 +399,7 @@ impl<'a> ZeroAsciiIgnoreCaseTrieCursor<'a> {
/// Probes the next byte in the cursor.
///
/// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
pub fn probe(&mut self, index: usize) -> Option<u8> {
pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
}

Expand Down
3 changes: 1 addition & 2 deletions utils/zerotrie/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ extern crate alloc;

mod builder;
mod byte_phf;
mod cursor;
pub mod cursor;
mod error;
#[macro_use]
mod helpers;
Expand All @@ -67,7 +67,6 @@ mod serde;
mod varint;
mod zerotrie;

pub use crate::cursor::ZeroTrieSimpleAsciiCursor;
pub use crate::zerotrie::ZeroAsciiIgnoreCaseTrie;
pub use crate::zerotrie::ZeroTrie;
pub use crate::zerotrie::ZeroTrieExtendedCapacity;
Expand Down
15 changes: 12 additions & 3 deletions utils/zerotrie/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@
//! ```

use crate::byte_phf::PerfectByteHashMap;
use crate::cursor::AsciiProbeResult;
use crate::helpers::*;
use crate::options::*;
use crate::varint::read_varint_meta2;
Expand Down Expand Up @@ -516,7 +517,7 @@ pub(crate) fn step_parameterized<T: ZeroTrieWithOptions + ?Sized>(
pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
trie: &mut &[u8],
index: usize,
) -> Option<u8> {
) -> Option<AsciiProbeResult> {
// Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`.
// `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie.
// If a span node is encountered, `None` is returned later in this function.
Expand Down Expand Up @@ -549,7 +550,10 @@ pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
*trie = &[];
return None;
}
return Some(*b);
return Some(AsciiProbeResult {
byte: *b,
total_siblings: 1,
});
}
NodeType::Branch => {
// Proceed to the branch node logic below
Expand All @@ -571,6 +575,8 @@ pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
}
// Branch node
let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
debug_assert!(u8::try_from(x).is_ok());
let total_siblings = x as u8;
// See comment above regarding this assertion
debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
let w = w & 0x3;
Expand All @@ -585,7 +591,10 @@ pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
} else {
get_branch(trie, index, x, w)
};
Some(search[index])
Some(AsciiProbeResult {
byte: search[index],
total_siblings,
})
}

/// Steps one node into the trie if the head node is a value node, returning the value.
Expand Down

0 comments on commit ef3da31

Please sign in to comment.