Skip to content

Commit

Permalink
Add spacetimedb_primitives::col_list::ColSet (#1691)
Browse files Browse the repository at this point in the history
Signed-off-by: james gilles <jameshgilles@gmail.com>
Co-authored-by: Mazdak Farrokhzad <twingoow@gmail.com>
  • Loading branch information
kazimuth and Centril authored Sep 11, 2024
1 parent 3530498 commit 11283c3
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 5 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ spacetimedb
│ ├── spacetimedb_primitives
│ │ ├── bitflags
│ │ ├── either
│ │ ├── itertools
│ │ │ └── either
│ │ └── nohash_hasher
│ └── syn
│ ├── proc_macro2 (*)
Expand All @@ -56,8 +58,7 @@ spacetimedb
│ │ ├── quote (*)
│ │ └── syn (*)
│ ├── hex
│ ├── itertools
│ │ └── either
│ ├── itertools (*)
│ ├── spacetimedb_bindings_macro (*)
│ ├── spacetimedb_data_structures
│ │ ├── hashbrown
Expand Down
1 change: 1 addition & 0 deletions crates/primitives/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ description = "Primitives such as TableId and ColumnIndexAttribute"
bitflags.workspace = true
either.workspace = true
nohash-hasher.workspace = true
itertools.workspace = true

[dev-dependencies]
proptest.workspace = true
142 changes: 139 additions & 3 deletions crates/primitives/src/col_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ use core::{
hash::{Hash, Hasher},
iter,
mem::{size_of, ManuallyDrop},
ops::Deref,
ops::{Deref, DerefMut},
ptr::NonNull,
slice::from_raw_parts,
slice::{from_raw_parts, from_raw_parts_mut},
};
use either::Either;
use itertools::Itertools;

/// Constructs a `ColList` like so `col_list![0, 2]`.
///
Expand All @@ -31,7 +32,12 @@ macro_rules! col_list {
/// but packed into a `u64` in a way that takes advantage of the fact that
/// in almost all cases, we won't store a `ColId` larger than 62.
/// In the rare case that we store larger ids, we fall back to a thin vec approach.
/// We also fall back to a thin vec if the ids stored are not in sorted order, from low to high.
///
/// We also fall back to a thin vec if the ids stored are not in sorted order, from low to high,
/// or if the list contains duplicates.
///
/// If you want a set of columns, use [`ColSet`] instead. It is more likely to be compressed,
/// and so is a better choice if you don't require ordering information.
#[repr(C)]
pub union ColList {
/// Used to determine whether the list is stored inline or not.
Expand Down Expand Up @@ -194,6 +200,22 @@ impl ColList {
self.push_inner(col, self.last().map_or(true, |l| l < col));
}

/// Sorts the list in ascending order and removes duplicate columns.
///
/// Only the heap representation is ever touched: when `as_inline_mut` reports an inline list,
/// this is a no-op, since a list is only stored inline when it is already sorted and duplicate-free.
///
/// For a heap list, the elements are sorted in place and the list is rebuilt when either
/// - it still contains duplicates after sorting, or
/// - its largest column is below `FIRST_HEAP_COL_U16` (an empty list counts as such),
///   in which case the rebuilt list can typically be stored inline.
///
/// Otherwise the existing allocation is kept as is.
fn sort_dedup(&mut self) {
    if let Err(heap) = self.as_inline_mut() {
        // Equal `ColId`s are indistinguishable, so a stable sort buys nothing.
        // `sort_unstable` works in place, without the scratch buffer `sort` allocates.
        heap.sort_unstable();

        // Don't reallocate if the list is already deduplicated and must stay on the heap anyway.
        let is_deduped = is_sorted_and_deduped(heap);
        // The list is sorted, so `last()` is the maximum; an empty list is treated as column 0.
        let wants_inline = heap.last().map_or(0, |col| col.0) < Self::FIRST_HEAP_COL_U16;
        if !is_deduped || wants_inline {
            // Duplicates are adjacent after sorting, so a single `dedup` pass removes them all.
            *self = Self::from_iter(heap.iter().copied().dedup());
        }
    }
}

/// Push `col` onto the list.
///
/// If `col >= 63` or `!preserves_set_order`,
Expand Down Expand Up @@ -311,6 +333,58 @@ impl fmt::Debug for ColList {
}
}

impl From<ColSet> for ColList {
fn from(value: ColSet) -> Self {
value.0
}
}

/// A compressed set of columns. Like a `ColList`, but guaranteed to be sorted and to contain no duplicate entries.
/// Dereferences to a `ColList` for convenience.
///
/// The wrapped list is a private field, and the `From<ColList>` conversion
/// (which `FromIterator` also goes through) sorts and deduplicates it via `ColList::sort_dedup`.
/// Use `ColList::from(set)` to get the underlying list back.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ColSet(ColList);

impl ColSet {
    /// Returns `true` if `needle` is a member of this set.
    ///
    /// An inline set defers to the inline representation's own `contains`;
    /// a heap set is binary-searched.
    pub fn contains(&self, needle: ColId) -> bool {
        self.as_inline().map_or_else(
            // Binary search is valid here because the heap vector of a `ColSet` is guaranteed to be sorted.
            |heap| heap.binary_search(&needle).is_ok(),
            |inline| inline.contains(needle),
        )
    }

    // Don't implement `insert` because repeated insertions will be O(n^2) if we want to keep the set sorted on the heap.
    // Use iterator methods to create a new `ColSet` instead.
}

impl<C: Into<ColId>> FromIterator<C> for ColSet {
    /// Collects the items into a `ColList` first, then converts that list into a set,
    /// which sorts and deduplicates it.
    fn from_iter<T: IntoIterator<Item = C>>(iter: T) -> Self {
        let list: ColList = iter.into_iter().collect();
        list.into()
    }
}

impl From<ColList> for ColSet {
fn from(mut list: ColList) -> Self {
list.sort_dedup();
Self(list)
}
}

impl Deref for ColSet {
type Target = ColList;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl fmt::Debug for ColSet {
    /// Formats the set with set syntax, one entry per column yielded by `iter()`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut set = f.debug_set();
        set.entries(self.iter());
        set.finish()
    }
}

/// The inline version of a [`ColList`].
///
/// A `Copy` newtype over a single `u64`.
/// NOTE(review): presumably this is the packed `u64` form described in the [`ColList`] docs;
/// the exact bit layout is defined by this type's methods — confirm there.
#[derive(Clone, Copy, PartialEq)]
struct ColListInline(u64);
Expand Down Expand Up @@ -508,6 +582,20 @@ impl Deref for ColListVec {
}
}

impl DerefMut for ColListVec {
/// Exposes the `self.len()` stored `ColId`s as a mutable slice.
///
/// The slice starts two elements past the base pointer `self.0`.
/// NOTE(review): presumably those two leading elements are the length/capacity header
/// of the thin vec — confirm against the allocation layout of `ColListVec`.
fn deref_mut(&mut self) -> &mut Self::Target {
let len = self.len() as usize;
let ptr = self.0.as_ptr();
// Step over the first two elements of the allocation, then view the rest as `ColId`s.
// SAFETY: `ptr + 2` is always in bounds of the allocation and `ptr <= isize::MAX`.
let ptr = unsafe { ptr.add(2) }.cast::<ColId>();
// SAFETY:
// - `ptr` is valid for reads and writes for `len * size_of::<ColId>` and it is properly aligned.
// - `len` elements are initialized.
// - `len * size_of::<ColId> <= isize::MAX` holds.
unsafe { from_raw_parts_mut(ptr, len) }
}
}

impl Drop for ColListVec {
fn drop(&mut self) {
let capacity = self.capacity();
Expand All @@ -529,6 +617,18 @@ impl Clone for ColListVec {
}
}

/// Returns `true` if `data` is strictly increasing,
/// i.e. sorted from low to high with no two equal neighbours.
///
/// Empty and single-element buffers trivially qualify.
fn is_sorted_and_deduped(data: &[ColId]) -> bool {
    // `windows(2)` yields nothing for fewer than two elements, so `all` is vacuously true there.
    data.windows(2).all(|pair| pair[0] < pair[1])
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -616,5 +716,41 @@ mod tests {
_ => prop_assert_eq!(list.as_singleton(), None),
}
}

#[test]
fn test_set_inlines(mut cols in vec((0..ColList::FIRST_HEAP_COL_U16).prop_map_into(), 1..100)) {
    // Only inputs that are out of order or contain duplicates are kept,
    // so the plain list below is expected to land on the heap.
    prop_assume!(!is_sorted_and_deduped(&cols[..]));

    let list = ColList::from_iter(cols.iter().copied());
    prop_assert!(!list.is_inline());

    // Every column is below `FIRST_HEAP_COL_U16`, so the set is expected to be inline.
    let set = ColSet::from(list);
    prop_assert!(set.is_inline());

    // No column may be lost by the conversion.
    for &col in &cols {
        prop_assert!(set.contains(col));
    }

    // The set must iterate exactly like the sorted, deduplicated input.
    cols.sort();
    cols.dedup();
    let actual: Vec<_> = set.iter().collect();
    prop_assert_eq!(actual, cols);
}

#[test]
fn test_set_heap(mut cols in vec((ColList::FIRST_HEAP_COL_U16..).prop_map_into(), 1..100)) {
    // Keep only inputs that are out of order or contain duplicates.
    prop_assume!(!is_sorted_and_deduped(&cols[..]));

    let list = ColList::from_iter(cols.iter().copied());
    prop_assert!(!list.is_inline());

    // Every column is at least `FIRST_HEAP_COL_U16`, so the set is expected to stay on the heap.
    let set = ColSet::from(list);
    prop_assert!(!set.is_inline());

    // No column may be lost by the conversion.
    for &col in &cols {
        prop_assert!(set.contains(col));
    }

    // The set must iterate exactly like the sorted, deduplicated input.
    cols.sort();
    cols.dedup();
    let actual: Vec<_> = set.iter().collect();
    prop_assert_eq!(actual, cols);
}
}
}

2 comments on commit 11283c3

@github-actions
Copy link

@github-actions github-actions bot commented on 11283c3 Sep 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarking failed. Please check the workflow run for details.

@github-actions
Copy link

@github-actions github-actions bot commented on 11283c3 Sep 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarking failed. Please check the workflow run for details.

Please sign in to comment.