unicode-org · sffc · Mar 26, 2024 · Mar 23, 2024 · Mar 26, 2024 · Manishearth
@@ -222,6 +222,97 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
         reader::take_value(&mut self.trie.store)
     }
 
+    /// Probes the next byte in the cursor, stepping the cursor and returning the byte.
+    ///
+    /// This function is similar to [`Self::step()`], but it takes an index instead of a byte.
+    /// This enables stepwise iteration over the contents of the trie.
+    ///
+    /// If there are multiple possibilities for the next byte, the `index` argument allows
+    /// visiting them in order. Since this function steps the cursor, the cursor must be
+    /// cloned (a cheap operation) in order to visit multiple children.
+    ///
+    /// Returns `None` if there is no child at `index`; in that case the cursor is left
+    /// pointing at an empty trie.
+    ///
+    /// # Examples
+    ///
+    /// Continually query index 0 to extract the first item from a trie:
+    ///
+    /// ```
+    /// use zerotrie::ZeroTrieSimpleAscii;
+    ///
+    /// let data: &[(String, usize)] = &[
+    ///     ("ab".to_string(), 111),
+    ///     ("abc".to_string(), 22),
+    ///     ("abde".to_string(), 333),
+    ///     ("afg".to_string(), 44),
+    /// ];
+    ///
+    /// let trie: ZeroTrieSimpleAscii<Vec<u8>> =
+    ///     data.iter().map(|(s, v)| (s.as_str(), *v)).collect();
+    ///
+    /// let mut cursor = trie.cursor();
+    /// let mut key = String::new();
+    /// let value = loop {
+    ///     if let Some(value) = cursor.take_value() {
+    ///         break value;
+    ///     }
+    ///     let ch = cursor.probe(0).unwrap();
+    ///     key.push(char::from(ch));
+    /// };
+    ///
+    /// assert_eq!(key, "ab");
+    /// assert_eq!(value, 111);
+    /// ```
+    ///
+    /// Stepwise iterate over all entries in the trie:
+    ///
+    /// ```
+    /// # use zerotrie::ZeroTrieSimpleAscii;
+    /// # let data: &[(String, usize)] = &[
+    /// #     ("ab".to_string(), 111),
+    /// #     ("abc".to_string(), 22),
+    /// #     ("abde".to_string(), 333),
+    /// #     ("afg".to_string(), 44)
+    /// # ];
+    /// # let trie: ZeroTrieSimpleAscii<Vec<u8>> = data
+    /// #     .iter()
+    /// #     .map(|(s, v)| (s.as_str(), *v))
+    /// #     .collect();
+    /// // (trie built as in previous example)
+    ///
+    /// // Initialize the iteration at the first child of the trie.
+    /// let mut stack = Vec::from([(trie.cursor(), 0)]);
+    /// let mut key = Vec::new();
+    /// let mut results = Vec::new();
+    /// loop {
+    ///     let Some((ref mut cursor, ref mut index)) = stack.last_mut() else {
+    ///         // Nothing left in the trie.
+    ///         break;
+    ///     };
+    ///     // Check to see if there is a value at the current node.
+    ///     if let Some(value) = cursor.take_value() {
+    ///         results.push((String::from_utf8(key.clone()).unwrap(), value));
+    ///     }
+    ///     // Now check for children of the current node.
+    ///     let mut sub_cursor = cursor.clone();
+    ///     if let Some(ch) = sub_cursor.probe(*index) {
+    ///         // Found a child. Add the child to the stack, and also
+    ///         // increment the index, so that next time we visit the
+    ///         // current node, we check the next child.
+    ///         *index += 1;
+    ///         stack.push((sub_cursor, 0));
+    ///         key.push(ch);
+    ///     } else {
+    ///         // No more children. Pop this node from the stack.
+    ///         stack.pop();
+    ///         key.pop();
+    ///     }
+    /// }
+    ///
+    /// assert_eq!(&results, data);
+    /// ```
+    #[inline]
+    pub fn probe(&mut self, index: usize) -> Option<u8> {
+        reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
+    }
+
     /// Checks whether the cursor points to an empty trie.
     ///
     /// Use this to determine when to stop iterating.
@@ -270,23 +361,30 @@ impl<'a> ZeroAsciiIgnoreCaseTrieCursor<'a> {
     /// assert_eq!(&*key_str, "aBc".as_bytes());
     /// ```
     ///
-    /// For more examples, see [`ZeroAsciiIgnoreCaseTrieCursor::step`].
+    /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`].
     #[inline]
     pub fn step(&mut self, byte: u8) -> Option<u8> {
         reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte)
     }
 
     /// Takes the value at the current position.
     ///
-    /// For more details, see [`ZeroAsciiIgnoreCaseTrieCursor::take_value`].
+    /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`].
     #[inline]
     pub fn take_value(&mut self) -> Option<usize> {
         reader::take_value(&mut self.trie.store)
     }
 
+    /// Probes the next byte in the cursor.
+    ///
+    /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
+    #[inline]
+    pub fn probe(&mut self, index: usize) -> Option<u8> {
+        reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
+    }
+
     /// Checks whether the cursor points to an empty trie.
     ///
-    /// For more details, see [`ZeroAsciiIgnoreCaseTrieCursor::is_empty`].
+    /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`].
     #[inline]
     pub fn is_empty(&self) -> bool {
         self.trie.is_empty()

@@ -507,6 +507,87 @@ pub(crate) fn step_parameterized<T: ZeroTrieWithOptions + ?Sized>(
     }
 }
 
+/// Steps one node into the trie by child index rather than by byte, assuming all branch
+/// nodes are binary search and that there are no span nodes.
+///
+/// The input-output argument `trie` starts at the original trie and ends pointing to
+/// the sub-trie indexed by `index`.
+///
+/// Any value nodes at the head of `trie` are skipped first. An ASCII node has a single
+/// child (`index == 0`); a branch node has one child per search byte. Returns the byte
+/// of the child at `index`, or `None` with `trie` set to the empty slice if there is no
+/// child at `index`.
+#[inline]
+pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
+    trie: &mut &[u8],
+    index: usize,
+) -> Option<u8> {
+    // Like `step_parameterized`, this function only supports `AsciiMode::AsciiOnly`.
+    // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie.
+    // If a span node is encountered, `None` is returned later in this function.
+    debug_assert!(
+        matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly),
+        "Spans not yet implemented in probe function"
+    );
+    // PHF can be easily implemented but the code is not yet reachable
+    debug_assert!(
+        matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly),
+        "PHF not yet implemented in probe function"
+    );
+    // Extended Capacity can be easily implemented but the code is not yet reachable
+    debug_assert!(
+        matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal),
+        "Extended capacity not yet implemented in probe function"
+    );
+    let (mut b, x, search);
+    loop {
+        (b, *trie) = match trie.split_first() {
+            Some(v) => v,
+            None => {
+                // Empty trie or only a value node
+                return None;
+            }
+        };
+        match byte_type(*b) {
+            NodeType::Ascii => {
+                // An ASCII node has exactly one child, so only index 0 is valid.
+                if index > 0 {
+                    *trie = &[];
+                    return None;
+                }
+                return Some(*b);
+            }
+            NodeType::Branch => {
+                // Proceed to the branch node logic below
+                (x, *trie) = read_varint_meta2(*b, trie);
+                break;
+            }
+            NodeType::Span => {
+                // Question: Should we put the trie back into a valid state?
+                // Currently this code is unreachable so let's not worry about it.
+                debug_assert!(false, "Span node found in ASCII trie!");
+                return None;
+            }
+            NodeType::Value => {
+                // Skip the value node and go to the next node
+                (_, *trie) = read_varint_meta3(*b, trie);
+                continue;
+            }
+        };
+    }
+    // Branch node
+    let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
+    // The mask below keeps `w` within 0..=3 even when this assertion is compiled out.
+    debug_assert!(w <= 3, "probe: w > 3 but we assume w <= 3");
+    let w = w & 0x3;
+    let x = if x == 0 { 256 } else { x };
+    if index >= x {
+        *trie = &[];
+        return None;
+    }
+    (search, *trie) = trie.debug_split_at(x);
+    // `index < x` was checked above, but use a checked lookup so that a `search` slice
+    // shorter than `x` yields `None` instead of a panic.
+    // NOTE(review): presumably `debug_split_at` can return a short slice on malformed
+    // data when debug assertions are off; confirm against its definition.
+    let byte = match search.get(index) {
+        Some(byte) => *byte,
+        None => {
+            *trie = &[];
+            return None;
+        }
+    };
+    *trie = if w == 0 {
+        get_branch_w0(trie, index, x)
+    } else {
+        get_branch(trie, index, x, w)
+    };
+    Some(byte)
+}
+
 /// Steps one node into the trie if the head node is a value node, returning the value.
 /// If the head node is not a value node, no change is made.
 ///