From 66f0bfe4bf063ca59bc9425fa6b30e23202ffb7b Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Mon, 9 Sep 2024 16:04:49 +0100 Subject: [PATCH] try reworking LazyIndexMap to fix lifetimes & other bugs --- crates/jiter/Cargo.toml | 1 + crates/jiter/src/lazy_index_map.rs | 366 ++++++++++++++++++++++------- crates/jiter/tests/main.rs | 8 +- 3 files changed, 286 insertions(+), 89 deletions(-) diff --git a/crates/jiter/Cargo.toml b/crates/jiter/Cargo.toml index ddcef5b..9d137cc 100644 --- a/crates/jiter/Cargo.toml +++ b/crates/jiter/Cargo.toml @@ -19,6 +19,7 @@ smallvec = "2.0.0-alpha.7" pyo3 = { workspace = true, optional = true, features = ["num-bigint"] } lexical-parse-float = { version = "0.8.5", features = ["format"] } bitvec = "1.0.1" +indexmap = "2.5.0" [features] python = ["dep:pyo3", "dep:pyo3-build-config"] diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index 5b40fea..922f8f9 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -1,18 +1,25 @@ use std::borrow::{Borrow, Cow}; +use std::cell::Cell; use std::fmt; -use std::hash::Hash; +use std::hash::{DefaultHasher, Hash, Hasher}; +use std::mem::MaybeUninit; use std::slice::Iter as SliceIter; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::OnceLock; -use ahash::AHashMap; -use smallvec::SmallVec; +use ahash::RandomState; +use bitvec::order::Lsb0; +use indexmap::IndexMap; /// Like [IndexMap](https://docs.rs/indexmap/latest/indexmap/) but only builds the lookup map when it's needed. +#[derive(Clone)] pub struct LazyIndexMap { - vec: SmallVec<(K, V), 8>, - map: OnceLock>, - last_find: AtomicUsize, + inner: LazyIndexMapInner, +} + +#[derive(Clone)] +enum LazyIndexMapInner { + Array(LazyIndexMapArray), + Map(IndexMap), } impl Default for LazyIndexMap @@ -25,23 +32,13 @@ where } } -impl Clone for LazyIndexMap { - fn clone(&self) -> Self { - Self { - vec: self.vec.clone(), - map: self.map.clone(), - last_find: AtomicUsize::new(0), - } - } -} - impl fmt::Debug for LazyIndexMap where K: Clone + fmt::Debug + Eq + Hash, V: fmt::Debug, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_map().entries(self.iter_unique()).finish() + f.debug_map().entries(self.iter()).finish() } } @@ -51,30 +48,68 @@ const HASHMAP_THRESHOLD: usize = 16; /// Like [IndexMap](https://docs.rs/indexmap/latest/indexmap/) but only builds the lookup map when it's needed. impl LazyIndexMap where - K: Clone + fmt::Debug + Eq + Hash, + K: fmt::Debug + Eq + Hash, V: fmt::Debug, { pub fn new() -> Self { Self { - vec: SmallVec::new(), - map: OnceLock::new(), - last_find: AtomicUsize::new(0), + inner: LazyIndexMapInner::Array(LazyIndexMapArray::new()), } } pub fn insert(&mut self, key: K, value: V) { - if let Some(map) = self.map.get_mut() { - map.insert(key.clone(), self.vec.len()); + match self { + Self { + inner: LazyIndexMapInner::Array(vec), + } => { + if let Some((key, value)) = vec.insert(key, value) { + // array is full, convert to map + let LazyIndexMapInner::Array(vec) = std::mem::replace( + &mut self.inner, + LazyIndexMapInner::Map(IndexMap::with_capacity_and_hasher( + HASHMAP_THRESHOLD + 1, + RandomState::default(), + )), + ) else { + unreachable!("known to be a vec"); + }; + let LazyIndexMapInner::Map(map) = &mut self.inner else { + unreachable!("just set to be a map"); + }; + for (k, v) in vec.into_complete_data() { + map.insert(k, v); + } + map.insert(key, value); + } + } + Self { + inner: LazyIndexMapInner::Map(map), + } => { + map.insert(key, value); + } } - self.vec.push((key, value)); } pub fn len(&self) -> usize { - self.get_map().len() + match self { + Self { + inner: LazyIndexMapInner::Array(vec), + } => vec.duplicates_mask()[..vec.data().len()].count_ones(), + Self { + inner: LazyIndexMapInner::Map(map), + } => map.len(), + } } pub fn is_empty(&self) -> bool { - self.vec.is_empty() + match self { + Self { + inner: LazyIndexMapInner::Array(vec), + } => vec.data().is_empty(), + Self { + inner: LazyIndexMapInner::Map(map), + } => map.is_empty(), + } } pub fn get(&self, key: &Q) -> Option<&V> @@ -82,93 +117,254 @@ where K: Borrow + PartialEq, Q: Hash + Eq + ?Sized, { - let vec_len = self.vec.len(); - // if the vec is longer than the threshold, we use the hashmap for lookups - if vec_len > HASHMAP_THRESHOLD { - self.get_map().get(key).map(|&i| &self.vec[i].1) - } else { - // otherwise we find the value in the vec - // we assume the most likely position for the match is at `last_find + 1` + match &self.inner { + LazyIndexMapInner::Array(vec) => vec.get(key), + LazyIndexMapInner::Map(map) => map.get(key), + } + } + + pub fn keys(&self) -> impl Iterator { + self.iter().map(|(k, _)| k) + } + + pub fn iter(&self) -> impl Iterator { + match &self.inner { + LazyIndexMapInner::Array(vec) => { + // SAFETY: data is known to be initialized up to len + let data = vec.data(); + let mask = vec.duplicates_mask().clone(); + LazyIndexMapIter::Vec { + iter: data.iter(), + mask: mask.into_iter(), + } + } + LazyIndexMapInner::Map(map) => LazyIndexMapIter::Map(map.iter()), + } + } +} + +mod index_map_vec { + use super::*; + + pub(super) struct LazyIndexMapArray { + data: Box<[MaybeUninit<(K, V)>; HASHMAP_THRESHOLD]>, + len: usize, + last_find: AtomicUsize, + duplicates_mask: DuplicatesMask, + } + + type DuplicatesMask = bitvec::BitArr!(for HASHMAP_THRESHOLD, in Cell); + + impl LazyIndexMapArray { + pub fn new() -> Self { + Self { + data: boxed_uninit_array(), + len: 0, + last_find: AtomicUsize::new(0), + duplicates_mask: DuplicatesMask::ZERO, + } + } + + pub fn data(&self) -> &[(K, V)] { + // SAFETY: data is known to be initialized up to len + unsafe { std::slice::from_raw_parts(self.data.as_ptr().cast::<(K, V)>(), self.len) } + } + } + + impl LazyIndexMapArray { + /// If the vec is full, returns the key-value pair that was not inserted. + pub fn insert(&mut self, key: K, value: V) -> Option<(K, V)> { + if self.len >= HASHMAP_THRESHOLD { + return Some((key, value)); + } + self.data[self.len].write((key, value)); + self.len += 1; + // clear cached mask + self.duplicates_mask = DuplicatesMask::ZERO; + None + } + + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow + PartialEq, + Q: Hash + Eq + ?Sized, + { + if self.len == 0 { + return None; + } + + let data = self.data(); + let mask = self.duplicates_mask(); + let first_try = self.last_find.load(Ordering::Relaxed) + 1; - for i in first_try..first_try + vec_len { - let index = i % vec_len; - let (k, v) = &self.vec[index]; - if k == key { + for i in first_try..first_try + data.len() { + let index = i % data.len(); + if !mask[index] { + continue; + } + let (k, v) = &data[index]; + if k.borrow() == key { self.last_find.store(index, Ordering::Relaxed); return Some(v); } } None } - } - pub fn keys(&self) -> impl Iterator { - self.vec.iter().map(|(k, _)| k) - } + pub fn into_complete_data(self) -> impl IntoIterator { + self.data + .into_iter() + .take(self.len) + // SAFETY: reading initialized section only + .map(|x| unsafe { x.assume_init() }) + } - pub fn iter(&self) -> SliceIter<'_, (K, V)> { - self.vec.iter() + pub fn duplicates_mask(&self) -> &DuplicatesMask { + let data = self.data(); + if self.duplicates_mask == DuplicatesMask::ZERO { + let new_mask = build_duplicates_mask(data); + // FIXME: is there a way to write the whole thing at once? + for i in 0..data.len() { + self.duplicates_mask.set_aliased(i, new_mask[i]); + } + } + &self.duplicates_mask + } } - pub fn iter_unique(&self) -> impl Iterator { - IterUnique { - vec: &self.vec, - map: self.get_map(), - index: 0, + impl<'j> LazyIndexMapArray, crate::JsonValue<'j>> { + pub fn to_static(&self) -> LazyIndexMapArray, crate::JsonValue<'static>> { + let mut new_data = boxed_uninit_array(); + for (i, (k, v)) in self.data().iter().enumerate() { + new_data[i] = MaybeUninit::new((Cow::Owned(k.to_string()), v.to_static())); + } + LazyIndexMapArray { + data: new_data, + len: self.len, + last_find: AtomicUsize::new(self.last_find.load(Ordering::Relaxed)), + duplicates_mask: self.duplicates_mask.clone(), + } } } - fn get_map(&self) -> &AHashMap { - self.map.get_or_init(|| { - self.vec - .iter() - .enumerate() - .map(|(index, (key, _))| (key.clone(), index)) - .collect() - }) + impl Clone for LazyIndexMapArray { + fn clone(&self) -> Self { + let mut new_data = boxed_uninit_array(); + for (i, value) in self.data().iter().enumerate() { + // SAFETY: initialized up to i + new_data[i] = MaybeUninit::new(value.clone()); + } + LazyIndexMapArray { + data: new_data, + len: self.len, + last_find: AtomicUsize::new(self.last_find.load(Ordering::Relaxed)), + duplicates_mask: self.duplicates_mask.clone(), + } + } } -} -impl<'j> LazyIndexMap, crate::JsonValue<'j>> { - pub(crate) fn to_static(&self) -> LazyIndexMap, crate::JsonValue<'static>> { - LazyIndexMap { - vec: self - .vec - .iter() - .map(|(k, v)| (k.to_string().into(), v.to_static())) - .collect(), - map: OnceLock::new(), - last_find: AtomicUsize::new(0), + fn build_duplicates_mask(data: &[(K, V)]) -> bitvec::BitArr!(for HASHMAP_THRESHOLD, in u16) { + let hashes_and_indices: &mut [(u64, usize)] = &mut [(0u64, 0usize); HASHMAP_THRESHOLD][..data.len()]; + let mut mask = bitvec::bitarr![u16, Lsb0; 1; HASHMAP_THRESHOLD]; + for (i, (k, _)) in data.iter().enumerate() { + // SAFETY: data is known to be initialized + let mut hasher = DefaultHasher::new(); + k.hash(&mut hasher); + let hash = hasher.finish(); + hashes_and_indices[i] = (hash, i); } + hashes_and_indices.sort_unstable(); + + for i in 0..data.len() { + let (hash, index) = hashes_and_indices[i]; + for (next_hash, next_index) in hashes_and_indices.iter().skip(i + 1) { + if *next_hash != hash { + break; + } + // is a duplicate key; prefer the later element + if data[*next_index].0 == data[index].0 { + mask.set(index, false); + break; + } + } + } + mask } -} -impl PartialEq for LazyIndexMap { - fn eq(&self, other: &Self) -> bool { - self.vec == other.vec + // in the future this should be Box::new([const { MaybeUninit::::uninit() }; 16]); + // waiting on inline const expressions to be on stable + fn boxed_uninit_array() -> Box<[MaybeUninit; 16]> { + Box::new([ + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + ]) } } -struct IterUnique<'a, K, V> { - vec: &'a SmallVec<(K, V), 8>, - map: &'a AHashMap, - index: usize, +use index_map_vec::LazyIndexMapArray; + +enum LazyIndexMapIter<'a, K, V> { + Vec { + iter: SliceIter<'a, (K, V)>, + // to mask duplicate entries + mask: ) as IntoIterator>::IntoIter, + }, + Map(indexmap::map::Iter<'a, K, V>), } -impl<'a, K: Hash + Eq, V> Iterator for IterUnique<'a, K, V> { +impl<'a, K, V> Iterator for LazyIndexMapIter<'a, K, V> { type Item = (&'a K, &'a V); fn next(&mut self) -> Option { - while self.index < self.vec.len() { - let (k, v) = &self.vec[self.index]; - if let Some(map_index) = self.map.get(k) { - if *map_index == self.index { - self.index += 1; - return Some((k, v)); + match self { + LazyIndexMapIter::Vec { iter, mask } => { + while let Some((k, v)) = iter.next() { + let is_not_duplicate = mask.next().expect("mask covers array length"); + if is_not_duplicate { + return Some((k, v)); + } } + None } - self.index += 1; + LazyIndexMapIter::Map(iter) => iter.next(), } - None + } +} + +impl<'j> LazyIndexMap, crate::JsonValue<'j>> { + pub(crate) fn to_static(&self) -> LazyIndexMap, crate::JsonValue<'static>> { + let inner = match &self.inner { + LazyIndexMapInner::Array(vec) => LazyIndexMapInner::Array(vec.to_static()), + LazyIndexMapInner::Map(map) => LazyIndexMapInner::Map( + map.iter() + .map(|(k, v)| (Cow::Owned(k.to_string()), v.to_static())) + .collect(), + ), + }; + LazyIndexMap { inner } + } +} + +impl PartialEq for LazyIndexMap +where + K: fmt::Debug + Eq + Hash, + V: fmt::Debug, +{ + fn eq(&self, other: &Self) -> bool { + self.iter().eq(other.iter()) } } diff --git a/crates/jiter/tests/main.rs b/crates/jiter/tests/main.rs index 65d8770..4284395 100644 --- a/crates/jiter/tests/main.rs +++ b/crates/jiter/tests/main.rs @@ -919,11 +919,11 @@ fn test_crazy_massive_int() { } #[test] -fn unique_iter_object() { +fn iter_object() { let value = JsonValue::parse(br#" {"x": 1, "x": 2} "#, false).unwrap(); if let JsonValue::Object(obj) = value { assert_eq!(obj.len(), 1); - let mut unique = obj.iter_unique(); + let mut unique = obj.iter(); let first = unique.next().unwrap(); assert_eq!(first.0, "x"); assert_eq!(first.1, &JsonValue::Int(2)); @@ -934,11 +934,11 @@ fn unique_iter_object() { } #[test] -fn unique_iter_object_repeat() { +fn iter_object_repeat() { let value = JsonValue::parse(br#" {"x": 1, "x": 1} "#, false).unwrap(); if let JsonValue::Object(obj) = value { assert_eq!(obj.len(), 1); - let mut unique = obj.iter_unique(); + let mut unique = obj.iter(); let first = unique.next().unwrap(); assert_eq!(first.0, "x"); assert_eq!(first.1, &JsonValue::Int(1));