Skip to content

Commit

Permalink
Add some more docs
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc committed Jul 16, 2023
1 parent 9739ce1 commit 1d55557
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 2 deletions.
117 changes: 117 additions & 0 deletions experimental/zerotrie/src/byte_phf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,41 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! # Byte Perfect Hash Function Internals
//!
//! This module contains a perfect hash function (PHF) optimized for... TODO
//!
//! Reading a `key` from the PHF uses the following algorithm:
//!
//! 1. Let `t`, the bucket index, be `f1(key, p)`.
//! 2. Let `i`, the key index, be `f2(key, q_t)`.
//! 3. If `key == k_i`, return `Some(i)`; else return `None`.
//!
//! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across
//! serialization versions of `ZeroTrie`.
//!
//! ```
//! let phf_example_bytes = [
//! // `p` parameter
//! 1,
//! // `q` parameters, one for each of the N buckets
//! 0, 0, 1, 1,
//! // Exact keys to be compared with the input
//! b'e', b'a', b'c', b'g'
//! ];
//!
//! let phf = zerotrie::byte_phf::PerfectByteHashMap::from_bytes(&phf_example_bytes);
//!
//! // The PHF returns the index of the key or `None` if not found.
//! assert_eq!(phf.get(b'a'), Some(1));
//! assert_eq!(phf.get(b'b'), None);
//! assert_eq!(phf.get(b'c'), Some(2));
//! assert_eq!(phf.get(b'd'), None);
//! assert_eq!(phf.get(b'e'), Some(0));
//! assert_eq!(phf.get(b'f'), None);
//! assert_eq!(phf.get(b'g'), Some(3));
//! ```

#[cfg(feature = "alloc")]
mod builder;
#[cfg(feature = "alloc")]
Expand All @@ -12,11 +47,17 @@ pub use builder::find;
#[cfg(feature = "alloc")]
pub use cached_owned::PerfectByteHashMapCacheOwned;

/// The cutoff for the fast version of [`f1`].
const P_FAST_MAX: u8 = 11;

/// The cutoff for the fast version of [`f2`].
const Q_FAST_MAX: u8 = 95;

/// The maximum allowable value of `p`. This could be raised if found to be necessary.
#[cfg(feature = "alloc")] // used in the builder code
const P_REAL_MAX: u8 = 15;

/// The maximum allowable value of `q`. This could be raised if found to be necessary.
#[cfg(feature = "alloc")] // used in the builder code
const Q_REAL_MAX: u8 = 127;

Expand Down Expand Up @@ -44,7 +85,41 @@ fn debug_get(slice: &[u8], index: usize) -> Option<u8> {
}
}

/// Calculates the function `f1` for the PHF. For the exact formula, please read the code.
///
/// When `p == 0`, the operation is a simple modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// Invariant: n > 0
///
/// # Examples
///
/// ```
/// use zerotrie::byte_phf::f1;
/// const N: usize = 10;
///
/// // With p = 0:
/// assert_eq!(0, f1(0, 0, N));
/// assert_eq!(1, f1(1, 0, N));
/// assert_eq!(2, f1(2, 0, N));
/// assert_eq!(9, f1(9, 0, N));
/// assert_eq!(0, f1(10, 0, N));
/// assert_eq!(1, f1(11, 0, N));
/// assert_eq!(2, f1(12, 0, N));
/// assert_eq!(9, f1(19, 0, N));
///
/// // With p = 1:
/// assert_eq!(1, f1(0, 1, N));
/// assert_eq!(0, f1(1, 1, N));
/// assert_eq!(2, f1(2, 1, N));
/// assert_eq!(2, f1(9, 1, N));
/// assert_eq!(4, f1(10, 1, N));
/// assert_eq!(5, f1(11, 1, N));
/// assert_eq!(1, f1(12, 1, N));
/// assert_eq!(7, f1(19, 1, N));
/// ```
#[inline]
pub fn f1(byte: u8, p: u8, n: usize) -> usize {
let n = if n > 0 {
Expand All @@ -57,14 +132,51 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize {
byte as usize % n
} else {
let mut result = byte ^ p ^ byte.wrapping_shr(p as u32);
// In almost all cases, the PHF works with the above constant-time operation.
// However, to crack a few difficult cases, we fall back to the linear-time
// operation shown below.
for _ in P_FAST_MAX..p {
result = result ^ (result << 1) ^ (result >> 1);
}
result as usize % n
}
}

/// Calculates the function `f2` for the PHF. For the exact formula, please read the code.
///
/// When `q == 0`, the operation is a simple modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// Invariant: n > 0
///
/// # Examples
///
/// ```
/// use zerotrie::byte_phf::f2;
/// const N: usize = 10;
///
/// // With q = 0:
/// assert_eq!(0, f2(0, 0, N));
/// assert_eq!(1, f2(1, 0, N));
/// assert_eq!(2, f2(2, 0, N));
/// assert_eq!(9, f2(9, 0, N));
/// assert_eq!(0, f2(10, 0, N));
/// assert_eq!(1, f2(11, 0, N));
/// assert_eq!(2, f2(12, 0, N));
/// assert_eq!(9, f2(19, 0, N));
///
/// // With q = 1:
/// assert_eq!(1, f2(0, 1, N));
/// assert_eq!(0, f2(1, 1, N));
/// assert_eq!(3, f2(2, 1, N));
/// assert_eq!(8, f2(9, 1, N));
/// assert_eq!(1, f2(10, 1, N));
/// assert_eq!(0, f2(11, 1, N));
/// assert_eq!(3, f2(12, 1, N));
/// assert_eq!(8, f2(19, 1, N));
/// ```
#[inline]
pub fn f2(byte: u8, q: u8, n: usize) -> usize {
let n = if n > 0 {
Expand Down Expand Up @@ -278,6 +390,11 @@ mod tests {
expected: &[1, 0, 1, b'c', b'a'],
reordered_keys: "ca",
},
TestCase {
keys: "aceg",
expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'],
reordered_keys: "eacg",
},
TestCase {
keys: "abd",
expected: &[0, 0, 1, 3, b'a', b'b', b'd'],
Expand Down
31 changes: 29 additions & 2 deletions experimental/zerotrie/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@
//! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256
//! - Bits 9 and 10: width of the offset table (`W`)
//!
//! Note that N is always in the range [1, 256]. There can't be more than 256 edges because
//! there are only 256 unique u8 values.
//!
//! A few examples of the head node of the branch:
//!
//! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0
Expand All @@ -78,8 +81,32 @@
//!
//! ### Binary Search Branch Nodes
//!
//! Here, the head branch node is followed by N sorted bytes and then (W+1)*(N-1) bytes
//! for the offset table.
//! A binary search branch node is used when:
//!
//! 1. The trie is a `ZeroTrieSimpleAscii`, OR
//! 2. There are 15 or fewer items in the branch.
//!
//! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte
//! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search),
//! the index `i` of the byte within the list is used to index into the offset table (described
//! below). If the byte is not in the list, the string is not in the trie, so return `None`.
//!
//! ### Perfect Hash Branch Nodes
//!
//! A perfect hash branch node is used when:
//!
//! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND
//! 2. There are 16 or more items in the branch.
//!
//! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing
//! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an
//! index within the hash table `i` is resolved and used as input to index into the offset
//! table (described below), or the value is determined to not be present and `None` is
//! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`].
//!
//! ### Offset Tables
//!
//! Both types of branch node are followed by an offset table.

use crate::byte_phf::PerfectByteHashMap;
use crate::varint::read_varint_meta2;
Expand Down

0 comments on commit 1d55557

Please sign in to comment.