diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index e962c2d1f64..dd9dab4406d 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -2,6 +2,41 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +//! # Byte Perfect Hash Function Internals +//! +//! This module contains a perfect hash function (PHF) optimized for... TODO +//! +//! Reading a `key` from the PHF uses the following algorithm: +//! +//! 1. Let `t`, the bucket index, be `f1(key, p)`. +//! 2. Let `i`, the key index, be `f2(key, q_t)`. +//! 3. If `key == k_i`, return `Some(i)`; else return `None`. +//! +//! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across +//! serialization versions of `ZeroTrie`. +//! +//! ``` +//! let phf_example_bytes = [ +//! // `p` parameter +//! 1, +//! // `q` parameters, one for each of the N buckets +//! 0, 0, 1, 1, +//! // Exact keys to be compared with the input +//! b'e', b'a', b'c', b'g' +//! ]; +//! +//! let phf = zerotrie::byte_phf::PerfectByteHashMap::from_bytes(&phf_example_bytes); +//! +//! // The PHF returns the index of the key or `None` if not found. +//! assert_eq!(phf.get(b'a'), Some(1)); +//! assert_eq!(phf.get(b'b'), None); +//! assert_eq!(phf.get(b'c'), Some(2)); +//! assert_eq!(phf.get(b'd'), None); +//! assert_eq!(phf.get(b'e'), Some(0)); +//! assert_eq!(phf.get(b'f'), None); +//! assert_eq!(phf.get(b'g'), Some(3)); +//! ``` + #[cfg(feature = "alloc")] mod builder; #[cfg(feature = "alloc")] @@ -12,11 +47,17 @@ pub use builder::find; #[cfg(feature = "alloc")] pub use cached_owned::PerfectByteHashMapCacheOwned; +/// The cutoff for the fast version of [`f1`]. const P_FAST_MAX: u8 = 11; + +/// The cutoff for the fast version of [`f2`]. const Q_FAST_MAX: u8 = 95; +/// The maximum allowable value of `p`. This could be raised if found to be necessary. #[cfg(feature = "alloc")] // used in the builder code const P_REAL_MAX: u8 = 15; + +/// The maximum allowable value of `q`. This could be raised if found to be necessary. #[cfg(feature = "alloc")] // used in the builder code const Q_REAL_MAX: u8 = 127; @@ -44,7 +85,41 @@ fn debug_get(slice: &[u8], index: usize) -> Option { } } +/// Calculates the function `f1` for the PHF. For the exact formula, please read the code. +/// +/// When `p == 0`, the operation is a simple modulus. +/// +/// The argument `n` is used only for taking the modulus so that the return value is +/// in the range `[0, n)`. +/// /// Invariant: n > 0 +/// +/// # Examples +/// +/// ``` +/// use zerotrie::byte_phf::f1; +/// const N: usize = 10; +/// +/// // With p = 0: +/// assert_eq!(0, f1(0, 0, N)); +/// assert_eq!(1, f1(1, 0, N)); +/// assert_eq!(2, f1(2, 0, N)); +/// assert_eq!(9, f1(9, 0, N)); +/// assert_eq!(0, f1(10, 0, N)); +/// assert_eq!(1, f1(11, 0, N)); +/// assert_eq!(2, f1(12, 0, N)); +/// assert_eq!(9, f1(19, 0, N)); +/// +/// // With p = 1: +/// assert_eq!(1, f1(0, 1, N)); +/// assert_eq!(0, f1(1, 1, N)); +/// assert_eq!(2, f1(2, 1, N)); +/// assert_eq!(2, f1(9, 1, N)); +/// assert_eq!(4, f1(10, 1, N)); +/// assert_eq!(5, f1(11, 1, N)); +/// assert_eq!(1, f1(12, 1, N)); +/// assert_eq!(7, f1(19, 1, N)); +/// ``` #[inline] pub fn f1(byte: u8, p: u8, n: usize) -> usize { let n = if n > 0 { @@ -57,6 +132,9 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize { byte as usize % n } else { let mut result = byte ^ p ^ byte.wrapping_shr(p as u32); + // In almost all cases, the PHF works with the above constant-time operation. + // However, to crack a few difficult cases, we fall back to the linear-time + // operation shown below. for _ in P_FAST_MAX..p { result = result ^ (result << 1) ^ (result >> 1); } @@ -64,7 +142,41 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize { } } +/// Calculates the function `f2` for the PHF. For the exact formula, please read the code. +/// +/// When `q == 0`, the operation is a simple modulus. +/// +/// The argument `n` is used only for taking the modulus so that the return value is +/// in the range `[0, n)`. +/// /// Invariant: n > 0 +/// +/// # Examples +/// +/// ``` +/// use zerotrie::byte_phf::f2; +/// const N: usize = 10; +/// +/// // With q = 0: +/// assert_eq!(0, f2(0, 0, N)); +/// assert_eq!(1, f2(1, 0, N)); +/// assert_eq!(2, f2(2, 0, N)); +/// assert_eq!(9, f2(9, 0, N)); +/// assert_eq!(0, f2(10, 0, N)); +/// assert_eq!(1, f2(11, 0, N)); +/// assert_eq!(2, f2(12, 0, N)); +/// assert_eq!(9, f2(19, 0, N)); +/// +/// // With q = 1: +/// assert_eq!(1, f2(0, 1, N)); +/// assert_eq!(0, f2(1, 1, N)); +/// assert_eq!(3, f2(2, 1, N)); +/// assert_eq!(8, f2(9, 1, N)); +/// assert_eq!(1, f2(10, 1, N)); +/// assert_eq!(0, f2(11, 1, N)); +/// assert_eq!(3, f2(12, 1, N)); +/// assert_eq!(8, f2(19, 1, N)); +/// ``` #[inline] pub fn f2(byte: u8, q: u8, n: usize) -> usize { let n = if n > 0 { @@ -278,6 +390,11 @@ mod tests { expected: &[1, 0, 1, b'c', b'a'], reordered_keys: "ca", }, + TestCase { + keys: "aceg", + expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'], + reordered_keys: "eacg", + }, TestCase { keys: "abd", expected: &[0, 0, 1, 3, b'a', b'b', b'd'], diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 73ca7cf0de9..1da8e7b9cfd 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -66,6 +66,9 @@ //! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256 //! - Bits 9 and 10: width of the offset table (`W`) //! +//! Note that N is always in the range [1, 256]. There can't be more than 256 edges because +//! there are only 256 unique u8 values. +//! //! A few examples of the head node of the branch: //! //! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0 @@ -78,8 +81,32 @@ //! //! ### Binary Search Branch Nodes //! -//! Here, the head branch node is followed by N sorted bytes and then (W+1)*(N-1) bytes -//! for the offset table. +//! A binary search branch node is used when: +//! +//! 1. The trie is a `ZeroTrieSimpleAscii`, OR +//! 2. There are 15 or fewer items in the branch. +//! +//! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte +//! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search), +//! the index `i` of the byte within the list is used to index into the offset table (described +//! below). If the byte is not in the list, the string is not in the trie, so return `None`. +//! +//! ### Perfect Hash Branch Nodes +//! +//! A perfect hash branch node is used when: +//! +//! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND +//! 2. There are 16 or more items in the branch. +//! +//! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing +//! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an +//! index within the hash table `i` is resolved and used as input to index into the offset +//! table (described below), or the value is determined to not be present and `None` is +//! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`]. +//! +//! ### Offset Tables +//! +//! Both types of branch node are followed by an offset table. use crate::byte_phf::PerfectByteHashMap; use crate::varint::read_varint_meta2;