Skip to content

Commit

Permalink
Datastore revamp 1: new indexing model & core datastructures
Browse files Browse the repository at this point in the history
  • Loading branch information
teh-cmc committed Apr 12, 2023
1 parent dd6f03e commit 1f1ca89
Show file tree
Hide file tree
Showing 37 changed files with 2,058 additions and 3,450 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,10 @@ polars-core = "0.27.1"
polars-lazy = "0.27.1"
polars-ops = "0.27.1"
puffin = "0.14"
smallvec = { version = "1.0", features = ["const_generics", "union"] }
thiserror = "1.0"
time = { version = "0.3", features = ["wasm-bindgen"] }
tinyvec = { version = "1.6", features = ["alloc", "rustc_1_55"] }
tokio = "1.24"
wgpu = { version = "0.15.1", default-features = false }
wgpu-core = { version = "0.15.1", default-features = false }
Expand Down
8 changes: 4 additions & 4 deletions crates/re_arrow_store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,17 @@ re_log.workspace = true

# External dependencies:
ahash.workspace = true
anyhow.workspace = true
arrow2 = { workspace = true, features = [
"compute_concatenate",
"compute_aggregate",
] }
arrow2_convert.workspace = true
document-features = "0.2"
indent = "0.1"
itertools = { workspace = true }
nohash-hasher = "0.2"
parking_lot.workspace = true
smallvec.workspace = true
static_assertions = "1.1"
thiserror.workspace = true

Expand All @@ -73,6 +74,7 @@ polars-ops = { workspace = true, optional = true, features = [


[dev-dependencies]
anyhow.workspace = true
criterion = "0.4"
mimalloc.workspace = true
polars-core = { workspace = true, features = [
Expand All @@ -85,9 +87,7 @@ polars-core = { workspace = true, features = [
"sort_multiple",
] }
rand = "0.8"
smallvec = { version = "1.0", features = ["const_generics", "union"] }
tinyvec = { version = "1.6", features = ["alloc", "rustc_1_55"] }

tinyvec.workspace = true

[lib]
bench = false
Expand Down
53 changes: 22 additions & 31 deletions crates/re_arrow_store/benches/data_store.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

use arrow2::array::{Array, UnionArray};
use arrow2::array::UnionArray;
use criterion::{criterion_group, criterion_main, Criterion};

use re_arrow_store::{DataStore, DataStoreConfig, LatestAtQuery, RangeQuery, TimeInt, TimeRange};
use re_log_types::{
component_types::{InstanceKey, Rect2D},
datagen::{build_frame_nr, build_some_instances, build_some_rects},
Component as _, ComponentName, DataRow, DataTable, EntityPath, MsgId, TimeType, Timeline,
Component as _, ComponentName, DataCell, DataRow, DataTable, EntityPath, MsgId, TimeType,
Timeline,
};

criterion_group!(benches, insert, latest_at, latest_at_missing, range);
Expand Down Expand Up @@ -73,10 +74,7 @@ fn insert(c: &mut Criterion) {
b.iter(|| {
insert_table(
DataStoreConfig {
index_bucket_nb_rows: num_rows_per_bucket,
component_bucket_nb_rows: num_rows_per_bucket,
index_bucket_size_bytes: u64::MAX,
component_bucket_size_bytes: u64::MAX,
indexed_bucket_num_rows: num_rows_per_bucket,
..Default::default()
},
InstanceKey::name(),
Expand All @@ -101,10 +99,11 @@ fn latest_at(c: &mut Criterion) {
group.bench_function("default", |b| {
let store = insert_table(Default::default(), InstanceKey::name(), &table);
b.iter(|| {
let results = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
let rects = results[0]
let cells = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
let rects = cells[0]
.as_ref()
.unwrap()
.as_arrow_ref()
.as_any()
.downcast_ref::<UnionArray>()
.unwrap();
Expand All @@ -116,21 +115,19 @@ fn latest_at(c: &mut Criterion) {
for &num_rows_per_bucket in num_rows_per_bucket() {
let store = insert_table(
DataStoreConfig {
index_bucket_nb_rows: num_rows_per_bucket,
component_bucket_nb_rows: num_rows_per_bucket,
index_bucket_size_bytes: u64::MAX,
component_bucket_size_bytes: u64::MAX,
indexed_bucket_num_rows: num_rows_per_bucket,
..Default::default()
},
InstanceKey::name(),
&table,
);
group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| {
b.iter(|| {
let results = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
let rects = results[0]
let cells = latest_data_at(&store, Rect2D::name(), &[Rect2D::name()]);
let rects = cells[0]
.as_ref()
.unwrap()
.as_arrow_ref()
.as_any()
.downcast_ref::<UnionArray>()
.unwrap();
Expand Down Expand Up @@ -180,10 +177,7 @@ fn latest_at_missing(c: &mut Criterion) {
for &num_rows_per_bucket in num_rows_per_bucket() {
let store = insert_table(
DataStoreConfig {
index_bucket_nb_rows: num_rows_per_bucket,
component_bucket_nb_rows: num_rows_per_bucket,
index_bucket_size_bytes: u64::MAX,
component_bucket_size_bytes: u64::MAX,
indexed_bucket_num_rows: num_rows_per_bucket,
..Default::default()
},
InstanceKey::name(),
Expand Down Expand Up @@ -236,25 +230,23 @@ fn range(c: &mut Criterion) {
for &num_rows_per_bucket in num_rows_per_bucket() {
let store = insert_table(
DataStoreConfig {
index_bucket_nb_rows: num_rows_per_bucket,
component_bucket_nb_rows: num_rows_per_bucket,
index_bucket_size_bytes: u64::MAX,
component_bucket_size_bytes: u64::MAX,
indexed_bucket_num_rows: num_rows_per_bucket,
..Default::default()
},
InstanceKey::name(),
&table,
);
group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| {
b.iter(|| {
let msgs = range_data(&store, [Rect2D::name()]);
for (cur_time, (time, results)) in msgs.enumerate() {
let rows = range_data(&store, [Rect2D::name()]);
for (cur_time, (time, cells)) in rows.enumerate() {
let time = time.unwrap();
assert_eq!(cur_time as i64, time.as_i64());

let rects = results[0]
let rects = cells[0]
.as_ref()
.unwrap()
.as_arrow_ref()
.as_any()
.downcast_ref::<UnionArray>()
.unwrap();
Expand Down Expand Up @@ -305,26 +297,25 @@ fn latest_data_at<const N: usize>(
store: &DataStore,
primary: ComponentName,
secondaries: &[ComponentName; N],
) -> [Option<Box<dyn Array>>; N] {
) -> [Option<DataCell>; N] {
let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence);
let timeline_query = LatestAtQuery::new(timeline_frame_nr, (NUM_ROWS / 2).into());
let ent_path = EntityPath::from("rects");

let row_indices = store
store
.latest_at(&timeline_query, &ent_path, primary, secondaries)
.unwrap_or_else(|| [(); N].map(|_| None));
store.get(secondaries, &row_indices)
.unwrap_or_else(|| [(); N].map(|_| None))
}

fn range_data<const N: usize>(
store: &DataStore,
components: [ComponentName; N],
) -> impl Iterator<Item = (Option<TimeInt>, [Option<Box<dyn Array>>; N])> + '_ {
) -> impl Iterator<Item = (Option<TimeInt>, [Option<DataCell>; N])> + '_ {
let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence);
let query = RangeQuery::new(timeline_frame_nr, TimeRange::new(0.into(), NUM_ROWS.into()));
let ent_path = EntityPath::from("rects");

store
.range(&query, &ent_path, components)
.map(move |(time, _, row_indices)| (time, store.get(&components, &row_indices)))
.map(move |(time, _, cells)| (time, cells))
}
7 changes: 2 additions & 5 deletions crates/re_arrow_store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,14 @@ pub mod polars_util;
pub mod test_util;

pub use self::arrow_util::ArrayExt;
pub use self::store::{
DataStore, DataStoreConfig, IndexBucket, IndexRowNr, IndexTable, RowIndex, RowIndexKind,
};
pub use self::store::{DataStore, DataStoreConfig};
pub use self::store_gc::GarbageCollectionTarget;
pub use self::store_read::{LatestAtQuery, RangeQuery};
pub use self::store_stats::DataStoreStats;
pub use self::store_write::{WriteError, WriteResult};

pub(crate) use self::store::{
ComponentBucket, ComponentTable, IndexBucketIndices, PersistentComponentTable,
PersistentIndexTable, SecondaryIndex, TimeIndex,
IndexedBucket, IndexedBucketInner, IndexedTable, PersistentIndexedTable,
};

// Re-exports
Expand Down
36 changes: 18 additions & 18 deletions crates/re_arrow_store/src/polars_util.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use arrow2::array::Array;
use itertools::Itertools;
use polars_core::{prelude::*, series::Series};
use polars_ops::prelude::*;
use re_log_types::{ComponentName, EntityPath, TimeInt};
use re_log_types::{ComponentName, DataCell, EntityPath, TimeInt};

use crate::{ArrayExt, DataStore, LatestAtQuery, RangeQuery};

Expand Down Expand Up @@ -38,12 +37,11 @@ pub fn latest_component(
let cluster_key = store.cluster_key();

let components = &[cluster_key, primary];
let row_indices = store
let cells = store
.latest_at(query, ent_path, primary, components)
.unwrap_or([None; 2]);
let results = store.get(components, &row_indices);
.unwrap_or([(); 2].map(|_| None));

dataframe_from_results(components, results)
dataframe_from_cells(&cells)
}

/// Queries any number of components and their cluster keys from their respective point-of-views,
Expand Down Expand Up @@ -161,12 +159,11 @@ pub fn range_components<'a, const N: usize>(
.chain(
store
.range(query, ent_path, components)
.map(move |(time, _, row_indices)| {
let results = store.get(&components, &row_indices);
.map(move |(time, _, cells)| {
(
time,
row_indices[primary_col].is_some(), // is_primary
dataframe_from_results(&components, results),
cells[primary_col].is_some(), // is_primary
dataframe_from_cells(&cells),
)
}),
)
Expand Down Expand Up @@ -200,16 +197,19 @@ pub fn range_components<'a, const N: usize>(

// --- Joins ---

pub fn dataframe_from_results<const N: usize>(
components: &[ComponentName; N],
results: [Option<Box<dyn Array>>; N],
// TODO(#1619): none of this mess should be here

pub fn dataframe_from_cells<const N: usize>(
cells: &[Option<DataCell>; N],
) -> SharedResult<DataFrame> {
let series: Result<Vec<_>, _> = components
let series: Result<Vec<_>, _> = cells
.iter()
.zip(results)
.filter_map(|(component, col)| col.map(|col| (component, col)))
.map(|(&component, col)| {
Series::try_from((component.as_str(), col.as_ref().clean_for_polars()))
.flatten()
.map(|cell| {
Series::try_from((
cell.component_name().as_str(),
cell.as_arrow_ref().clean_for_polars(),
))
})
.collect();

Expand Down
Loading

0 comments on commit 1f1ca89

Please sign in to comment.