diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 5b7d0e5e7ac7..49eaecbb22c3 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -39,7 +39,7 @@ NOTE: `.rrd` files do not yet guarantee any backwards or forwards compatibility. ## Technologies we use ### Apache Arrow -[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data. We use it to encode the log data when transmitting it over the network or storing it in an `.rrd` file. We also use it in our in-RAM data store, [`re_data_store`](crates/re_data_store/README.md). +[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data. We use it to encode the log data when transmitting it over the network or storing it in an `.rrd` file. We also use it in our in-RAM data store, [`re_chunk_store`](crates/re_chunk_store/README.md). In Rust, we use the [`arrow2` crate](https://crates.io/crates/arrow2). @@ -88,11 +88,11 @@ Of course, this will only take us so far. In the future we plan on caching queri Here is an overview of the crates included in the project: - - - - - + + + + + @@ -160,7 +160,7 @@ Update instructions: | Crate | Description | |----------------------|--------------------------------------------------------------------------| | re_entity_db | In-memory storage of Rerun entities | -| re_query | Querying data in the re_data_store | +| re_query | Querying data in the re_chunk_store | | re_types | The built-in Rerun data types, component types, and archetypes. | | re_types_blueprint | The core traits and types that power Rerun's Blueprint sub-system. | | re_log_encoding | Helpers for encoding and transporting Rerun log messages | @@ -171,7 +171,7 @@ Update instructions: | Crate | Description | |-----------------|-----------------------------------------------------------------------------------------------| | re_chunk | A chunk of Rerun data, encoded using Arrow. Used for logging, transport, storage and compute. | -| re_data_store | An in-memory time series database for Rerun log data, based on Apache Arrow. | +| re_chunk_store | An in-memory time series database for Rerun log data, based on Apache Arrow. | | re_log_types | The basic building blocks of the Rerun data types and tables. | | re_types_core | The core traits and types that power Rerun's data model. | | re_format_arrow | Formatting of Apache Arrow tables. 
| diff --git a/Cargo.lock b/Cargo.lock index 9502048a834c..ba77905d55b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4305,12 +4305,46 @@ dependencies = [ "re_tracing", "re_tuid", "re_types_core", + "serde", + "serde_bytes", "similar-asserts", "smallvec", "static_assertions", "thiserror", ] +[[package]] +name = "re_chunk_store" +version = "0.17.0-alpha.9" +dependencies = [ + "ahash", + "anyhow", + "criterion", + "document-features", + "indent", + "insta", + "itertools 0.13.0", + "mimalloc", + "nohash-hasher", + "once_cell", + "parking_lot", + "rand", + "re_arrow2", + "re_chunk", + "re_format", + "re_format_arrow", + "re_log", + "re_log_types", + "re_tracing", + "re_types", + "re_types_core", + "similar-asserts", + "smallvec", + "thiserror", + "tinyvec", + "web-time", +] + [[package]] name = "re_context_menu" version = "0.17.0-alpha.9" @@ -4356,6 +4390,7 @@ dependencies = [ "rayon", "re_build_info", "re_build_tools", + "re_chunk", "re_log", "re_log_encoding", "re_log_types", @@ -4383,37 +4418,6 @@ dependencies = [ "re_ws_comms", ] -[[package]] -name = "re_data_store" -version = "0.17.0-alpha.9" -dependencies = [ - "ahash", - "anyhow", - "criterion", - "document-features", - "indent", - "insta", - "itertools 0.13.0", - "mimalloc", - "nohash-hasher", - "once_cell", - "parking_lot", - "rand", - "re_arrow2", - "re_format", - "re_format_arrow", - "re_log", - "re_log_types", - "re_tracing", - "re_types", - "re_types_core", - "similar-asserts", - "smallvec", - "thiserror", - "tinyvec", - "web-time", -] - [[package]] name = "re_data_ui" version = "0.17.0-alpha.9" @@ -4426,7 +4430,7 @@ dependencies = [ "egui_plot", "image", "itertools 0.13.0", - "re_data_store", + "re_chunk_store", "re_entity_db", "re_error", "re_format", @@ -4498,7 +4502,8 @@ dependencies = [ "parking_lot", "rand", "re_build_info", - "re_data_store", + "re_chunk", + "re_chunk_store", "re_format", "re_int_histogram", "re_log", @@ -4574,6 +4579,7 @@ dependencies = [ "mimalloc", "parking_lot", "re_build_info", + "re_chunk", "re_log", "re_log_types", "re_smart_channel", @@ -4666,7 +4672,8 @@ dependencies = [ "paste", "rand", "re_arrow2", - "re_data_store", + "re_chunk", + "re_chunk_store", "re_error", "re_format", "re_log", @@ -4770,8 +4777,8 @@ dependencies = [ "re_build_info", "re_build_tools", "re_chunk", + "re_chunk_store", "re_data_loader", - "re_data_store", "re_log", "re_log_encoding", "re_log_types", @@ -4811,8 +4818,9 @@ dependencies = [ "itertools 0.13.0", "nohash-hasher", "once_cell", + "re_chunk", + "re_chunk_store", "re_context_menu", - "re_data_store", "re_data_ui", "re_entity_db", "re_log", @@ -4850,7 +4858,7 @@ dependencies = [ "ahash", "egui", "nohash-hasher", - "re_data_store", + "re_chunk_store", "re_entity_db", "re_log", "re_log_types", @@ -4868,7 +4876,7 @@ version = "0.17.0-alpha.9" dependencies = [ "egui", "egui_plot", - "re_data_store", + "re_chunk_store", "re_entity_db", "re_log", "re_log_types", @@ -4887,7 +4895,7 @@ version = "0.17.0-alpha.9" dependencies = [ "egui", "egui_extras", - "re_data_store", + "re_chunk_store", "re_data_ui", "re_entity_db", "re_log_types", @@ -4914,7 +4922,7 @@ dependencies = [ "mimalloc", "nohash-hasher", "once_cell", - "re_data_store", + "re_chunk_store", "re_data_ui", "re_entity_db", "re_error", @@ -4944,7 +4952,7 @@ dependencies = [ "egui", "half 2.3.1", "ndarray", - "re_data_store", + "re_chunk_store", "re_data_ui", "re_entity_db", "re_log", @@ -4967,7 +4975,7 @@ version = "0.17.0-alpha.9" dependencies = [ "egui", "egui_commonmark", - "re_data_store", + "re_chunk_store", 
"re_renderer", "re_space_view", "re_tracing", @@ -4982,7 +4990,7 @@ version = "0.17.0-alpha.9" dependencies = [ "egui", "egui_extras", - "re_data_store", + "re_chunk_store", "re_data_ui", "re_entity_db", "re_log", @@ -5004,7 +5012,7 @@ dependencies = [ "egui_plot", "itertools 0.13.0", "rayon", - "re_data_store", + "re_chunk_store", "re_format", "re_log", "re_log_types", @@ -5036,8 +5044,8 @@ version = "0.17.0-alpha.9" dependencies = [ "egui", "itertools 0.13.0", + "re_chunk_store", "re_context_menu", - "re_data_store", "re_data_ui", "re_entity_db", "re_format", @@ -5220,9 +5228,10 @@ dependencies = [ "re_blueprint_tree", "re_build_info", "re_build_tools", + "re_chunk", + "re_chunk_store", "re_data_loader", "re_data_source", - "re_data_store", "re_data_ui", "re_edit_ui", "re_entity_db", @@ -5294,8 +5303,9 @@ dependencies = [ "nohash-hasher", "once_cell", "parking_lot", + "re_chunk", + "re_chunk_store", "re_data_source", - "re_data_store", "re_entity_db", "re_error", "re_format", @@ -5354,7 +5364,8 @@ dependencies = [ "nohash-hasher", "once_cell", "parking_lot", - "re_data_store", + "re_chunk", + "re_chunk_store", "re_entity_db", "re_log", "re_log_types", @@ -5482,6 +5493,7 @@ dependencies = [ "re_analytics", "re_build_info", "re_build_tools", + "re_chunk", "re_crash_handler", "re_data_source", "re_entity_db", @@ -5498,6 +5510,7 @@ dependencies = [ "re_viewer", "re_web_viewer_server", "re_ws_comms", + "similar-asserts", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 542bfa83b1c5..822b1d8c1a5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ re_build_info = { path = "crates/re_build_info", version = "=0.17.0-alpha.9", de re_build_tools = { path = "crates/re_build_tools", version = "=0.17.0-alpha.9", default-features = false } re_case = { path = "crates/re_case", version = "=0.17.0-alpha.9", default-features = false } re_chunk = { path = "crates/re_chunk", version = "=0.17.0-alpha.9", default-features = false } +re_chunk_store = { path = "crates/re_chunk_store", version = "=0.17.0-alpha.9", default-features = false } re_context_menu = { path = "crates/re_context_menu", version = "=0.17.0-alpha.9", default-features = false } re_crash_handler = { path = "crates/re_crash_handler", version = "=0.17.0-alpha.9", default-features = false } re_data_loader = { path = "crates/re_data_loader", version = "=0.17.0-alpha.9", default-features = false } diff --git a/crates/re_analytics/src/event.rs b/crates/re_analytics/src/event.rs index f6e7fe520896..ad3ee04b08f5 100644 --- a/crates/re_analytics/src/event.rs +++ b/crates/re_analytics/src/event.rs @@ -75,7 +75,7 @@ pub struct OpenRecording { pub data_source: Option<&'static str>, } -/// Basic information about a recording's data store. +/// Basic information about a recording's chunk store. pub struct StoreInfo { /// Name of the application. /// diff --git a/crates/re_chunk/Cargo.toml b/crates/re_chunk/Cargo.toml index e9f1c56ac4c9..f32fb079e859 100644 --- a/crates/re_chunk/Cargo.toml +++ b/crates/re_chunk/Cargo.toml @@ -22,6 +22,16 @@ all-features = true [features] default = [] +## Enable (de)serialization using serde. 
+serde = [ + "dep:serde", + "dep:serde_bytes", + "re_log_types/serde", + "re_string_interner/serde", + "re_tuid/serde", + "re_types_core/serde", +] + [dependencies] @@ -40,10 +50,8 @@ re_types_core.workspace = true ahash.workspace = true anyhow.workspace = true arrow2 = { workspace = true, features = [ - "io_ipc", - "io_print", - "compute_comparison", "compute_concatenate", + "compute_filter", ] } backtrace.workspace = true document-features.workspace = true @@ -55,12 +63,17 @@ smallvec.workspace = true static_assertions.workspace = true thiserror.workspace = true +# Optional dependencies: +serde = { workspace = true, optional = true, features = ["derive", "rc"] } +serde_bytes = { workspace = true, optional = true } + # Native dependencies: [target.'cfg(not(target_arch = "wasm32"))'.dependencies] crossbeam.workspace = true [dev-dependencies] +re_log = { workspace = true, features = ["setup"] } criterion.workspace = true mimalloc.workspace = true similar-asserts.workspace = true diff --git a/crates/re_chunk/examples/latest_at.rs b/crates/re_chunk/examples/latest_at.rs new file mode 100644 index 000000000000..2aed6de34767 --- /dev/null +++ b/crates/re_chunk/examples/latest_at.rs @@ -0,0 +1,73 @@ +use re_chunk::{Chunk, LatestAtQuery, RowId, Timeline}; +use re_log_types::example_components::{MyColor, MyLabel, MyPoint}; +use re_types_core::Loggable as _; + +// --- + +fn main() -> anyhow::Result<()> { + let chunk = create_chunk()?; + + eprintln!("Data:\n{chunk}"); + + let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 4); + + // Find all relevant data for a query: + let chunk = chunk.latest_at(&query, MyPoint::name()); + eprintln!("{:?} @ {query:?}:\n{chunk}", MyPoint::name()); + + // And then slice it as appropriate: + let chunk = chunk + .timeline_sliced(Timeline::log_time()) + .component_sliced(MyPoint::name()); + eprintln!("Sliced down to specific timeline and component:\n{chunk}"); + + Ok(()) +} + +fn create_chunk() -> anyhow::Result { + let mut chunk = Chunk::builder("my/entity".into()) + .with_component_batches( + RowId::new(), + [ + (Timeline::log_time(), 1000), + (Timeline::new_sequence("frame"), 1), + ], + [ + &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)] as _, // + ], + ) + .with_component_batches( + RowId::new(), + [ + (Timeline::log_time(), 1032), + (Timeline::new_sequence("frame"), 3), + ], + [ + &[MyColor::from_rgb(1, 1, 1)] as _, // + &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ] as _, // + ], + ) + .with_component_batches( + RowId::new(), + [ + (Timeline::log_time(), 1064), + (Timeline::new_sequence("frame"), 5), + ], + [ + &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ] as _, // + ], + ) + .build()?; + + chunk.sort_if_unsorted(); + + Ok(chunk) +} diff --git a/crates/re_chunk/examples/range.rs b/crates/re_chunk/examples/range.rs new file mode 100644 index 000000000000..822d1901c399 --- /dev/null +++ b/crates/re_chunk/examples/range.rs @@ -0,0 +1,79 @@ +use re_chunk::{Chunk, RangeQuery, RowId, Timeline}; +use re_log_types::{ + example_components::{MyColor, MyLabel, MyPoint}, + ResolvedTimeRange, +}; +use re_types_core::Loggable as _; + +// --- + +fn main() -> anyhow::Result<()> { + let chunk = create_chunk()?; + + eprintln!("Data:\n{chunk}"); + + let query = RangeQuery::new( + Timeline::new_sequence("frame"), + ResolvedTimeRange::EVERYTHING, + ); + + // Find all relevant data for a query: + let chunk = chunk.range(&query, MyPoint::name()); + eprintln!("{:?} @ {query:?}:\n{chunk}", 
MyPoint::name()); + + // And then slice it as appropriate: + let chunk = chunk + .timeline_sliced(Timeline::log_time()) + .component_sliced(MyPoint::name()); + eprintln!("Sliced down to specific timeline and component:\n{chunk}"); + + Ok(()) +} + +fn create_chunk() -> anyhow::Result { + let mut chunk = Chunk::builder("my/entity".into()) + .with_component_batches( + RowId::new(), + [ + (Timeline::log_time(), 1000), + (Timeline::new_sequence("frame"), 1), + ], + [ + &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)] as _, // + ], + ) + .with_component_batches( + RowId::new(), + [ + (Timeline::log_time(), 1032), + (Timeline::new_sequence("frame"), 3), + ], + [ + &[MyColor::from_rgb(1, 1, 1)] as _, // + &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ] as _, // + ], + ) + .with_component_batches( + RowId::new(), + [ + (Timeline::log_time(), 1064), + (Timeline::new_sequence("frame"), 5), + ], + [ + &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ] as _, // + ], + ) + .build()?; + + chunk.sort_if_unsorted(); + + Ok(chunk) +} diff --git a/crates/re_chunk/src/batcher.rs b/crates/re_chunk/src/batcher.rs index 284eb5800b41..182a799891a3 100644 --- a/crates/re_chunk/src/batcher.rs +++ b/crates/re_chunk/src/batcher.rs @@ -5,14 +5,14 @@ use std::{ time::{Duration, Instant}, }; -use arrow2::array::Array as ArrowArray; +use arrow2::array::{Array as ArrowArray, PrimitiveArray as ArrowPrimitiveArray}; use crossbeam::channel::{Receiver, Sender}; use nohash_hasher::IntMap; -use re_log_types::{EntityPath, RowId, TimePoint, Timeline}; +use re_log_types::{EntityPath, ResolvedTimeRange, TimeInt, TimePoint, Timeline}; use re_types_core::{ComponentName, SizeBytes as _}; -use crate::{arrays_to_list_array, chunk::ChunkResult, Chunk, ChunkId, ChunkTimeline}; +use crate::{Chunk, ChunkId, ChunkResult, ChunkTimeline, RowId}; // --- @@ -551,6 +551,10 @@ fn batching_thread(config: ChunkBatcherConfig, rx_cmd: Receiver, tx_chu re_format::format_bytes(config.flush_num_bytes as _), ); + // Set to `true` when a flush is triggered for a reason other than hitting the time threshold, + // so that the next tick will not unncessarily fire early. + let mut skip_next_tick = false; + use crossbeam::select; loop { select! { @@ -574,12 +578,15 @@ fn batching_thread(config: ChunkBatcherConfig, rx_cmd: Receiver, tx_chu if acc.pending_rows.len() as u64 >= config.flush_num_rows { do_flush_all(acc, &tx_chunk, "rows", config.max_chunk_rows_if_unsorted); + skip_next_tick = true; } else if acc.pending_num_bytes >= config.flush_num_bytes { do_flush_all(acc, &tx_chunk, "bytes", config.max_chunk_rows_if_unsorted); + skip_next_tick = true; } }, Command::Flush(oneshot) => { + skip_next_tick = true; for acc in accs.values_mut() { do_flush_all(acc, &tx_chunk, "manual", config.max_chunk_rows_if_unsorted); } @@ -591,9 +598,13 @@ fn batching_thread(config: ChunkBatcherConfig, rx_cmd: Receiver, tx_chu }, recv(rx_tick) -> _ => { - // TODO(cmc): It would probably be better to have a ticker per entity path. Maybe. At some point. - for acc in accs.values_mut() { - do_flush_all(acc, &tx_chunk, "tick", config.max_chunk_rows_if_unsorted); + if skip_next_tick { + skip_next_tick = false; + } else { + // TODO(cmc): It would probably be better to have a ticker per entity path. Maybe. At some point. 
+ for acc in accs.values_mut() { + do_flush_all(acc, &tx_chunk, "tick", config.max_chunk_rows_if_unsorted); + } } }, }; @@ -678,23 +689,26 @@ impl PendingRow { let timelines = timepoint .into_iter() - .filter_map(|(timeline, time)| { - ChunkTimeline::new(Some(true), vec![time]).map(|time_chunk| (timeline, time_chunk)) + .map(|(timeline, time)| { + let times = ArrowPrimitiveArray::::from_vec(vec![time.as_i64()]); + let time_chunk = ChunkTimeline::new(Some(true), timeline, times); + (timeline, time_chunk) }) .collect(); let components = components .into_iter() .filter_map(|(component_name, array)| { - arrays_to_list_array(&[Some(&*array as _)]).map(|array| (component_name, array)) + crate::util::arrays_to_list_array_opt(&[Some(&*array as _)]) + .map(|array| (component_name, array)) }) .collect(); - Chunk::new( + Chunk::from_native_row_ids( ChunkId::new(), entity_path, Some(true), - vec![row_id], + &[row_id], timelines, components, ) @@ -772,7 +786,7 @@ impl PendingRow { re_tracing::profile_scope!("iterate per datatype set"); let mut row_ids: Vec = Vec::with_capacity(rows.len()); - let mut timelines: BTreeMap = BTreeMap::default(); + let mut timelines: BTreeMap = BTreeMap::default(); // Create all the logical list arrays that we're going to need, accounting for the // possibility of sparse components in the data. @@ -798,22 +812,27 @@ impl PendingRow { // the pre-configured `max_chunk_rows_if_unsorted` threshold, then split _even_ // further! for (&timeline, _) in row_timepoint { - let time_chunk = timelines.entry(timeline).or_default(); + let time_chunk = timelines + .entry(timeline) + .or_insert_with(|| PendingChunkTimeline::new(timeline)); if !row_ids.is_empty() // just being extra cautious && row_ids.len() as u64 >= max_chunk_rows_if_unsorted - && !time_chunk.is_sorted() + && !time_chunk.is_sorted { - chunks.push(Chunk::new( + chunks.push(Chunk::from_native_row_ids( ChunkId::new(), entity_path.clone(), Some(true), - std::mem::take(&mut row_ids), - std::mem::take(&mut timelines), + &std::mem::take(&mut row_ids), + std::mem::take(&mut timelines) + .into_iter() + .map(|(timeline, time_chunk)| (timeline, time_chunk.finish())) + .collect(), std::mem::take(&mut components) .into_iter() .filter_map(|(component_name, arrays)| { - arrays_to_list_array(&arrays) + crate::util::arrays_to_list_array_opt(&arrays) .map(|list_array| (component_name, list_array)) }) .collect(), @@ -826,7 +845,9 @@ impl PendingRow { row_ids.push(*row_id); for (&timeline, &time) in row_timepoint { - let time_chunk = timelines.entry(timeline).or_default(); + let time_chunk = timelines + .entry(timeline) + .or_insert_with(|| PendingChunkTimeline::new(timeline)); time_chunk.push(time); } @@ -841,16 +862,19 @@ impl PendingRow { } } - chunks.push(Chunk::new( + chunks.push(Chunk::from_native_row_ids( ChunkId::new(), entity_path.clone(), Some(true), - row_ids, - timelines, + &std::mem::take(&mut row_ids), + timelines + .into_iter() + .map(|(timeline, time_chunk)| (timeline, time_chunk.finish())) + .collect(), components .into_iter() .filter_map(|(component_name, arrays)| { - arrays_to_list_array(&arrays) + crate::util::arrays_to_list_array_opt(&arrays) .map(|list_array| (component_name, list_array)) }) .collect(), @@ -862,6 +886,58 @@ impl PendingRow { } } +/// Helper class used to buffer time data. +/// +/// See [`PendingRow::many_into_chunks`] for usage. 
+struct PendingChunkTimeline { + timeline: Timeline, + times: Vec, + is_sorted: bool, + time_range: ResolvedTimeRange, +} + +impl PendingChunkTimeline { + fn new(timeline: Timeline) -> Self { + Self { + timeline, + times: Default::default(), + is_sorted: true, + time_range: ResolvedTimeRange::EMPTY, + } + } + + /// Push a single time value at the end of this chunk. + fn push(&mut self, time: TimeInt) { + let Self { + timeline: _, + times, + is_sorted, + time_range, + } = self; + + *is_sorted &= times.last().copied().unwrap_or(TimeInt::MIN.as_i64()) <= time.as_i64(); + time_range.set_min(TimeInt::min(time_range.min(), time)); + time_range.set_max(TimeInt::max(time_range.max(), time)); + times.push(time.as_i64()); + } + + fn finish(self) -> ChunkTimeline { + let Self { + timeline, + times, + is_sorted, + time_range, + } = self; + + ChunkTimeline { + timeline, + times: ArrowPrimitiveArray::::from_vec(times).to(timeline.datatype()), + is_sorted, + time_range, + } + } +} + // --- // NOTE: @@ -871,12 +947,8 @@ impl PendingRow { #[cfg(test)] mod tests { use crossbeam::channel::TryRecvError; - use itertools::Itertools as _; - use re_log_types::{ - example_components::{MyPoint, MyPoint64}, - TimeInt, - }; + use re_log_types::example_components::{MyPoint, MyPoint64}; use re_types_core::Loggable as _; use super::*; @@ -922,7 +994,7 @@ mod tests { chunks.push(chunk); } - chunks.sort_by_key(|chunk| chunk.row_id_range().0); + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); // Make the programmer's life easier if this test fails. eprintln!("Chunks:"); @@ -938,22 +1010,92 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - [42, 43, 44] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![42, 43, 44]), + ), + )]; + let expected_components = [( + MyPoint::name(), + crate::util::arrays_to_list_array_opt(&[&*points1, &*points2, &*points3].map(Some)) + .unwrap(), )]; + let expected_chunk = Chunk::from_native_row_ids( + chunks[0].id, + entity_path1.clone(), + None, + &expected_row_ids, + expected_timelines.into_iter().collect(), + expected_components.into_iter().collect(), + )?; + + eprintln!("Expected:\n{expected_chunk}"); + eprintln!("Got:\n{}", chunks[0]); + assert_eq!(expected_chunk, chunks[0]); + } + + Ok(()) + } + + /// A bunch of rows that don't fit any of the split conditions should end up together. 
+ #[test] + fn simple_static() -> anyhow::Result<()> { + let batcher = ChunkBatcher::new(ChunkBatcherConfig::NEVER)?; + + let timeless = TimePoint::default(); + + let points1 = MyPoint::to_arrow([MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)])?; + let points2 = MyPoint::to_arrow([MyPoint::new(10.0, 20.0), MyPoint::new(30.0, 40.0)])?; + let points3 = MyPoint::to_arrow([MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)])?; + + let components1 = [(MyPoint::name(), points1.clone())]; + let components2 = [(MyPoint::name(), points2.clone())]; + let components3 = [(MyPoint::name(), points3.clone())]; + + let row1 = PendingRow::new(timeless.clone(), components1.into()); + let row2 = PendingRow::new(timeless.clone(), components2.into()); + let row3 = PendingRow::new(timeless.clone(), components3.into()); + + let entity_path1: EntityPath = "a/b/c".into(); + batcher.push_row(entity_path1.clone(), row1.clone()); + batcher.push_row(entity_path1.clone(), row2.clone()); + batcher.push_row(entity_path1.clone(), row3.clone()); + + let chunks_rx = batcher.chunks(); + drop(batcher); // flush and close + + let mut chunks = Vec::new(); + loop { + let chunk = match chunks_rx.try_recv() { + Ok(chunk) => chunk, + Err(TryRecvError::Empty) => panic!("expected chunk, got none"), + Err(TryRecvError::Disconnected) => break, + }; + chunks.push(chunk); + } + + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); + + // Make the programmer's life easier if this test fails. + eprintln!("Chunks:"); + for chunk in &chunks { + eprintln!("{chunk}"); + } + + assert_eq!(1, chunks.len()); + + { + let expected_row_ids = vec![row1.row_id, row2.row_id, row3.row_id]; + let expected_timelines = []; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points1, &*points2, &*points3].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points1, &*points2, &*points3].map(Some)) + .unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[0].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1008,7 +1150,7 @@ mod tests { chunks.push(chunk); } - chunks.sort_by_key(|chunk| chunk.row_id_range().0); + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); // Make the programmer's life easier if this test fails. 
eprintln!("Chunks:"); @@ -1024,22 +1166,19 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - [42, 44] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![42, 44]), + ), )]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points1, &*points3].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points1, &*points3].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[0].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1055,19 +1194,19 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - std::iter::once(43).map(TimeInt::new_temporal).collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![43]), + ), )]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points2].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points2].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[1].id, entity_path2.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1126,7 +1265,7 @@ mod tests { chunks.push(chunk); } - chunks.sort_by_key(|chunk| chunk.row_id_range().0); + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); // Make the programmer's life easier if this test fails. eprintln!("Chunks:"); @@ -1142,19 +1281,19 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - std::iter::once(42).map(TimeInt::new_temporal).collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![42]), + ), )]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points1].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points1].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[0].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1171,34 +1310,28 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - [43, 44] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![43, 44]), + ), ), ( timeline2, ChunkTimeline::new( Some(true), - [1000, 1001] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline2, + ArrowPrimitiveArray::from_vec(vec![1000, 1001]), + ), ), ]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points2, &*points3].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points2, &*points3].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[1].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1253,7 +1386,7 @@ mod tests { chunks.push(chunk); } - chunks.sort_by_key(|chunk| chunk.row_id_range().0); + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); // Make the programmer's life easier if this test fails. 
eprintln!("Chunks:"); @@ -1269,22 +1402,19 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - [42, 44] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![42, 44]), + ), )]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points1, &*points3].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points1, &*points3].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[0].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1300,19 +1430,19 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - std::iter::once(43).map(TimeInt::new_temporal).collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![43]), + ), )]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points2].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points2].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[1].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1385,7 +1515,7 @@ mod tests { chunks.push(chunk); } - chunks.sort_by_key(|chunk| chunk.row_id_range().0); + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); // Make the programmer's life easier if this test fails. eprintln!("Chunks:"); @@ -1402,35 +1532,31 @@ mod tests { timeline1, ChunkTimeline::new( Some(false), - [45, 42, 43, 44] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![45, 42, 43, 44]), + ), ), ( timeline2, ChunkTimeline::new( Some(false), - [1003, 1000, 1001, 1002] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline2, + ArrowPrimitiveArray::from_vec(vec![1003, 1000, 1001, 1002]), + ), ), ]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points1, &*points2, &*points3, &*points4].map(Some)) - .unwrap(), + crate::util::arrays_to_list_array_opt( + &[&*points1, &*points2, &*points3, &*points4].map(Some), + ) + .unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[0].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1503,7 +1629,7 @@ mod tests { chunks.push(chunk); } - chunks.sort_by_key(|chunk| chunk.row_id_range().0); + chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0); // Make the programmer's life easier if this test fails. 
eprintln!("Chunks:"); @@ -1520,34 +1646,29 @@ mod tests { timeline1, ChunkTimeline::new( Some(false), - [45, 42, 43] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![45, 42, 43]), + ), ), ( timeline2, ChunkTimeline::new( Some(false), - [1003, 1000, 1001] - .into_iter() - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline2, + ArrowPrimitiveArray::from_vec(vec![1003, 1000, 1001]), + ), ), ]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points1, &*points2, &*points3].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points1, &*points2, &*points3].map(Some)) + .unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[0].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; @@ -1564,30 +1685,28 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - std::iter::once(44).map(TimeInt::new_temporal).collect_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::from_vec(vec![44]), + ), ), ( timeline2, ChunkTimeline::new( Some(true), - std::iter::once(1002) - .map(TimeInt::new_temporal) - .collect_vec(), - ) - .unwrap(), + timeline2, + ArrowPrimitiveArray::from_vec(vec![1002]), + ), ), ]; let expected_components = [( MyPoint::name(), - arrays_to_list_array(&[&*points4].map(Some)).unwrap(), + crate::util::arrays_to_list_array_opt(&[&*points4].map(Some)).unwrap(), )]; - let expected_chunk = Chunk::new( + let expected_chunk = Chunk::from_native_row_ids( chunks[1].id, entity_path1.clone(), None, - expected_row_ids, + &expected_row_ids, expected_timelines.into_iter().collect(), expected_components.into_iter().collect(), )?; diff --git a/crates/re_chunk/src/builder.rs b/crates/re_chunk/src/builder.rs new file mode 100644 index 000000000000..496f59cc2633 --- /dev/null +++ b/crates/re_chunk/src/builder.rs @@ -0,0 +1,345 @@ +use std::collections::BTreeMap; + +use arrow2::{ + array::{Array as ArrowArray, PrimitiveArray as ArrowPrimitiveArray}, + datatypes::DataType as ArrowDatatype, +}; +use itertools::Itertools; + +use nohash_hasher::IntMap; +use re_log_types::{EntityPath, TimeInt, TimePoint, Timeline}; +use re_types_core::{AsComponents, ComponentBatch, ComponentName}; + +use crate::{Chunk, ChunkId, ChunkResult, ChunkTimeline, RowId}; + +// --- + +/// Helper to incrementally build a [`Chunk`]. +/// +/// Can be created using [`Chunk::builder`]. +pub struct ChunkBuilder { + id: ChunkId, + entity_path: EntityPath, + + row_ids: Vec, + timelines: BTreeMap, + components: BTreeMap>>>, +} + +impl Chunk { + /// Initializes a new [`ChunkBuilder`]. + #[inline] + pub fn builder(entity_path: EntityPath) -> ChunkBuilder { + ChunkBuilder::new(ChunkId::new(), entity_path) + } + + /// Initializes a new [`ChunkBuilder`]. + /// + /// The final [`Chunk`] will have the specified `id`. + #[inline] + pub fn builder_with_id(id: ChunkId, entity_path: EntityPath) -> ChunkBuilder { + ChunkBuilder::new(id, entity_path) + } +} + +impl ChunkBuilder { + /// Initializes a new [`ChunkBuilder`]. + /// + /// See also [`Chunk::builder`]. + #[inline] + pub fn new(id: ChunkId, entity_path: EntityPath) -> Self { + Self { + id, + entity_path, + + row_ids: Vec::new(), + timelines: BTreeMap::new(), + components: BTreeMap::new(), + } + } + + /// Add a row's worth of data using the given sparse component data. 
+ pub fn with_sparse_row( + mut self, + row_id: RowId, + timepoint: impl Into, + components: impl IntoIterator>)>, + ) -> Self { + let components = components.into_iter().collect_vec(); + + // Align all columns by appending null values for rows where we don't have data. + for (component_name, _) in &components { + let arrays = self.components.entry(*component_name).or_default(); + arrays.extend( + std::iter::repeat(None).take(self.row_ids.len().saturating_sub(arrays.len())), + ); + } + + self.row_ids.push(row_id); + + for (timeline, time) in timepoint.into() { + self.timelines + .entry(timeline) + .or_insert_with(|| ChunkTimeline::builder(timeline)) + .with_row(time); + } + + for (component_name, array) in components { + self.components + .entry(component_name) + .or_default() + .push(array); + } + + // Align all columns by appending null values for rows where we don't have data. + for arrays in self.components.values_mut() { + arrays.extend( + std::iter::repeat(None).take(self.row_ids.len().saturating_sub(arrays.len())), + ); + } + + self + } + + /// Add a row's worth of data using the given component data. + #[inline] + pub fn with_row( + self, + row_id: RowId, + timepoint: impl Into, + components: impl IntoIterator)>, + ) -> Self { + self.with_sparse_row( + row_id, + timepoint, + components + .into_iter() + .map(|(component_name, array)| (component_name, Some(array))), + ) + } + + /// Add a row's worth of data by destructuring an archetype into component columns. + #[inline] + pub fn with_archetype( + self, + row_id: RowId, + timepoint: impl Into, + as_components: &dyn AsComponents, + ) -> Self { + let batches = as_components.as_component_batches(); + self.with_component_batches( + row_id, + timepoint, + batches.iter().map(|batch| batch.as_ref()), + ) + } + + /// Add a row's worth of data by serializing a single [`ComponentBatch`]. + #[inline] + pub fn with_component_batch( + self, + row_id: RowId, + timepoint: impl Into, + component_batch: &dyn ComponentBatch, + ) -> Self { + self.with_row( + row_id, + timepoint, + component_batch + .to_arrow() + .ok() + .map(|array| (component_batch.name(), array)), + ) + } + + /// Add a row's worth of data by serializing many [`ComponentBatch`]es. + #[inline] + pub fn with_component_batches<'a>( + self, + row_id: RowId, + timepoint: impl Into, + component_batches: impl IntoIterator, + ) -> Self { + self.with_row( + row_id, + timepoint, + component_batches.into_iter().filter_map(|component_batch| { + component_batch + .to_arrow() + .ok() + .map(|array| (component_batch.name(), array)) + }), + ) + } + + /// Add a row's worth of data by serializing many sparse [`ComponentBatch`]es. + #[inline] + pub fn with_sparse_component_batches<'a>( + self, + row_id: RowId, + timepoint: impl Into, + component_batches: impl IntoIterator)>, + ) -> Self { + self.with_sparse_row( + row_id, + timepoint, + component_batches + .into_iter() + .map(|(component_name, component_batch)| { + ( + component_name, + component_batch.and_then(|batch| batch.to_arrow().ok()), + ) + }), + ) + } + + /// Builds and returns the final [`Chunk`]. + /// + /// The arrow datatype of each individual column will be guessed by inspecting the data. + /// + /// If any component column turns out to be fully sparse (i.e. only null values), that column + /// will be stripped out (how could we guess its datatype without any single value to inspect)? + /// + /// This is generally the desired behavior but, if you want to make sure to keep fully sparse + /// columns (can be useful e.g. 
for testing purposes), see [`ChunkBuilder::build_with_datatypes`] + /// instead. + /// + /// This returns an error if the chunk fails to `sanity_check`. + #[inline] + pub fn build(self) -> ChunkResult { + let Self { + id, + entity_path, + row_ids, + timelines, + components, + } = self; + + Chunk::from_native_row_ids( + id, + entity_path, + None, + &row_ids, + timelines + .into_iter() + .map(|(timeline, time_chunk)| (timeline, time_chunk.build())) + .collect(), + components + .into_iter() + .filter_map(|(component_name, arrays)| { + let arrays = arrays.iter().map(|array| array.as_deref()).collect_vec(); + crate::util::arrays_to_list_array_opt(&arrays) + .map(|list_array| (component_name, list_array)) + }) + .collect(), + ) + } + + /// Builds and returns the final [`Chunk`]. + /// + /// The arrow datatype of each individual column will be guessed by inspecting the data. + /// + /// If any component column turns out to be fully sparse (i.e. only null values), `datatypes` + /// will be used as a fallback. + /// + /// If any component column turns out to be fully sparse (i.e. only null values) _and_ doesn't + /// have an explicit datatype passed in, that column will be stripped out (how could we guess + /// its datatype without any single value to inspect)? + /// + /// You should rarely want to keep fully sparse columns around outside of testing scenarios. + /// See [`Self::build`]. + /// + /// This returns an error if the chunk fails to `sanity_check`. + #[inline] + pub fn build_with_datatypes( + self, + datatypes: &IntMap, + ) -> ChunkResult { + let Self { + id, + entity_path, + row_ids, + timelines, + components, + } = self; + + Chunk::from_native_row_ids( + id, + entity_path, + None, + &row_ids, + timelines + .into_iter() + .map(|(timeline, time_chunk)| (timeline, time_chunk.build())) + .collect(), + components + .into_iter() + .filter_map(|(component_name, arrays)| { + let arrays = arrays.iter().map(|array| array.as_deref()).collect_vec(); + + // If we know the datatype in advance, we're able to keep even fully sparse + // columns around. + if let Some(datatype) = datatypes.get(&component_name) { + crate::util::arrays_to_list_array(datatype.clone(), &arrays) + .map(|list_array| (component_name, list_array)) + } else { + crate::util::arrays_to_list_array_opt(&arrays) + .map(|list_array| (component_name, list_array)) + } + }) + .collect(), + ) + } +} + +// --- + +/// Helper to incrementally build a [`ChunkTimeline`]. +/// +/// Can be created using [`ChunkTimeline::builder`]. +pub struct ChunkTimelineBuilder { + timeline: Timeline, + + times: Vec, +} + +impl ChunkTimeline { + /// Initializes a new [`ChunkTimelineBuilder`]. + #[inline] + pub fn builder(timeline: Timeline) -> ChunkTimelineBuilder { + ChunkTimelineBuilder::new(timeline) + } +} + +impl ChunkTimelineBuilder { + /// Initializes a new [`ChunkTimelineBuilder`]. + /// + /// See also [`ChunkTimeline::builder`]. + #[inline] + pub fn new(timeline: Timeline) -> Self { + Self { + timeline, + times: Vec::new(), + } + } + + /// Add a row's worth of time data using the given timestamp. + #[inline] + pub fn with_row(&mut self, time: TimeInt) -> &mut Self { + let Self { timeline: _, times } = self; + + times.push(time.as_i64()); + + self + } + + /// Builds and returns the final [`ChunkTimeline`]. 
+ #[inline] + pub fn build(self) -> ChunkTimeline { + let Self { timeline, times } = self; + + let times = ArrowPrimitiveArray::::from_vec(times).to(timeline.datatype()); + ChunkTimeline::new(None, timeline, times) + } +} diff --git a/crates/re_chunk/src/chunk.rs b/crates/re_chunk/src/chunk.rs index 4d0b84b00e5c..651ec3af60e0 100644 --- a/crates/re_chunk/src/chunk.rs +++ b/crates/re_chunk/src/chunk.rs @@ -1,9 +1,18 @@ -use std::collections::BTreeMap; +use std::{ + collections::BTreeMap, + sync::atomic::{AtomicU64, Ordering}, +}; -use arrow2::array::Array as ArrowArray; +use arrow2::array::{ + Array as ArrowArray, ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray, + StructArray as ArrowStructArray, +}; -use re_log_types::{EntityPath, ResolvedTimeRange, RowId, TimeInt, TimePoint, Timeline}; -use re_types_core::{ComponentName, SerializationError}; +use itertools::{izip, Itertools}; +use re_log_types::{EntityPath, ResolvedTimeRange, TimeInt, TimePoint, Timeline}; +use re_types_core::{ComponentName, Loggable, LoggableBatch, SerializationError, SizeBytes}; + +use crate::{ChunkId, RowId}; // --- @@ -16,18 +25,12 @@ pub enum ChunkError { #[error(transparent)] Serialization(#[from] SerializationError), - - #[error("Chunks cannot be empty")] - Empty, } pub type ChunkResult = Result; // --- -/// Unique identifier for a [`Chunk`], using a [`re_tuid::Tuid`]. -pub type ChunkId = re_tuid::Tuid; - /// Dense arrow-based storage of N rows of multi-component multi-temporal data for a specific entity. /// /// This is our core datastructure for logging, storing, querying and transporting data around. @@ -37,16 +40,23 @@ pub type ChunkId = re_tuid::Tuid; /// /// This is the in-memory representation of a chunk, optimized for efficient manipulation of the /// data within. For transport, see [`crate::TransportChunk`] instead. -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct Chunk { pub(crate) id: ChunkId, + pub(crate) entity_path: EntityPath, + /// The heap size of this chunk in bytes. + /// + /// Must be cached as it is very costly to compute, and needs to be computed repeatedly on the + /// hot path (e.g. during garbage collection). + pub(crate) heap_size_bytes: AtomicU64, + /// Is the chunk as a whole sorted by [`RowId`]? pub(crate) is_sorted: bool, /// The respective [`RowId`]s for each row of data. - pub(crate) row_ids: Vec, + pub(crate) row_ids: ArrowStructArray, /// The time columns. /// @@ -60,92 +70,238 @@ pub struct Chunk { /// Each `ListArray` must be the same length as `row_ids`. /// /// Sparse so that we can e.g. log a `Position` at one timestamp but not a `Color`. - pub(crate) components: BTreeMap>, + // + // TODO(#6576): support non-list based columns? + pub(crate) components: BTreeMap>, } -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct ChunkTimeline { - /// Every single timestamp for this timeline. - /// - /// * This might or might not be sorted, depending on how the data was logged. - /// * This is guaranteed to always be dense, because chunks are split anytime a timeline is - /// added or removed. - /// * This can never contain `TimeInt::STATIC`, since static data doesn't even have timelines. - // - // TODO(cmc): maybe this would be better as raw i64s so getting time columns in and out of - // chunks is just a blind memcpy… it's probably not worth the hassle for now though. - // We'll see how things evolve as we start putting chunks in the backend. 
- pub(crate) times: Vec, +impl PartialEq for Chunk { + #[inline] + fn eq(&self, other: &Self) -> bool { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; - /// Is [`Self::times`] sorted? - /// - /// This is completely independent of [`Chunk::is_sorted`]: a timeline doesn't necessarily - /// follow the global [`RowId`]-based order, although it does in most cases (happy path). - pub(crate) is_sorted: bool, + *id == other.id + && *entity_path == other.entity_path + && *is_sorted == other.is_sorted + && *row_ids == other.row_ids + && *timelines == other.timelines + && *components == other.components + } +} - /// The time range covered by [`Self::times`]. +impl Chunk { + /// Returns `true` is two [`Chunk`]s are similar, although not byte-for-byte equal. /// - /// Not necessarily contiguous! Just the min and max value found in [`Self::times`]. - pub(crate) time_range: ResolvedTimeRange, + /// In particular, this ignores chunks and row IDs, as well as temporal timestamps. + /// + /// Useful for tests. + pub fn are_similar(lhs: &Self, rhs: &Self) -> bool { + let Self { + id: _, + entity_path, + heap_size_bytes: _, + is_sorted: _, + row_ids: _, + timelines, + components, + } = lhs; + + *entity_path == rhs.entity_path + && timelines.keys().collect_vec() == rhs.timelines.keys().collect_vec() + && { + let timelines: BTreeMap<_, _> = timelines + .iter() + .filter(|(timeline, _time_chunk)| { + timeline.typ() != re_log_types::TimeType::Time + }) + .collect(); + let rhs_timelines: BTreeMap<_, _> = rhs + .timelines + .iter() + .filter(|(timeline, _time_chunk)| { + timeline.typ() != re_log_types::TimeType::Time + }) + .collect(); + timelines == rhs_timelines + } + && *components == rhs.components + } } -impl Default for ChunkTimeline { +impl Clone for Chunk { #[inline] - fn default() -> Self { + fn clone(&self) -> Self { Self { - times: Default::default(), - is_sorted: true, - time_range: ResolvedTimeRange::EMPTY, + id: self.id, + entity_path: self.entity_path.clone(), + heap_size_bytes: AtomicU64::new(self.heap_size_bytes.load(Ordering::Relaxed)), + is_sorted: self.is_sorted, + row_ids: self.row_ids.clone(), + timelines: self.timelines.clone(), + components: self.components.clone(), } } } -#[cfg(test)] // do not ever use this outside internal testing, it's extremely slow and hackish -impl PartialEq for Chunk { +impl Chunk { + /// Clones the chunk and assign new IDs to the resulting chunk and its rows. + /// + /// `first_row_id` will become the [`RowId`] of the first row in the duplicated chunk. + /// Each row after that will be monotonically increasing. #[inline] - fn eq(&self, rhs: &Self) -> bool { - let Self { - id: _, // we're comparing the contents - entity_path, - is_sorted, + pub fn clone_as(&self, id: ChunkId, first_row_id: RowId) -> Self { + let row_ids = std::iter::from_fn({ + let mut row_id = first_row_id; + move || { + let yielded = row_id; + row_id = row_id.next(); + Some(yielded) + } + }) + .take(self.row_ids.len()) + .collect_vec(); + + #[allow(clippy::unwrap_used)] + let row_ids = ::to_arrow(&row_ids) + // Unwrap: native RowIds cannot fail to serialize. + .unwrap() + .as_any() + .downcast_ref::() + // Unwrap: RowId schema is known in advance to be a struct array -- always. + .unwrap() + .clone(); + + Self { + id, row_ids, - timelines, - components, - } = self; + ..self.clone() + } + } - use itertools::Itertools as _; + /// Clones the chunk into a new chunk without any time data. 
+ #[inline] + pub fn into_static(mut self) -> Self { + self.timelines.clear(); + self + } - *entity_path == rhs.entity_path - && *is_sorted == rhs.is_sorted - && *row_ids == rhs.row_ids - && *timelines == rhs.timelines - && components.keys().collect_vec() == rhs.components.keys().collect_vec() - && components.iter().all(|(component_name, list_array)| { - let Some(rhs_list_array) = rhs - .components - .get(component_name) - .map(|list_array| &**list_array) - else { - return false; - }; - - // `arrow2::compute::comparison` has very limited support for the different arrow - // types, so we just do our best here. - // This is just a testing/debugging tool. - if arrow2::compute::comparison::can_eq(list_array.data_type()) { - arrow2::compute::comparison::eq(&**list_array, rhs_list_array) - .values_iter() - .all(|v| v) - } else { - list_array.data_type() == rhs_list_array.data_type() - && list_array.len() == rhs_list_array.len() - } + /// Computes the time range covered by each individual component column on each timeline. + /// + /// This is different from the time range covered by the [`Chunk`] as a whole because component + /// columns are potentially sparse. + /// + /// This is crucial for indexing and queries to work properly. + // + // TODO(cmc): This needs to be stored in chunk metadata and transported across IPC. + #[inline] + pub fn time_range_per_component( + &self, + ) -> BTreeMap> { + re_tracing::profile_function!(); + + self.timelines + .iter() + .map(|(&timeline, time_chunk)| { + ( + timeline, + time_chunk.time_range_per_component(&self.components), + ) }) + .collect() + } + + /// Computes the `RowId` range covered by each individual component column on each timeline. + /// + /// This is different from the `RowId` range covered by the [`Chunk`] as a whole because component + /// columns are potentially sparse. + /// + /// This is crucial for indexing and queries to work properly. + // + // TODO(cmc): This needs to be stored in chunk metadata and transported across IPC. + pub fn row_id_range_per_component(&self) -> BTreeMap { + re_tracing::profile_function!(); + + let row_ids = self.row_ids().collect_vec(); + + if self.is_sorted() { + self.components + .iter() + .filter_map(|(component_name, list_array)| { + let mut row_id_min = None; + let mut row_id_max = None; + + for (i, &row_id) in row_ids.iter().enumerate() { + if list_array.is_valid(i) { + row_id_min = Some(row_id); + } + } + for (i, &row_id) in row_ids.iter().enumerate().rev() { + if list_array.is_valid(i) { + row_id_max = Some(row_id); + } + } + + Some((*component_name, (row_id_min?, row_id_max?))) + }) + .collect() + } else { + self.components + .iter() + .filter_map(|(component_name, list_array)| { + let mut row_id_min = Some(RowId::MAX); + let mut row_id_max = Some(RowId::ZERO); + + for (i, &row_id) in row_ids.iter().enumerate() { + if list_array.is_valid(i) && Some(row_id) > row_id_min { + row_id_min = Some(row_id); + } + } + for (i, &row_id) in row_ids.iter().enumerate().rev() { + if list_array.is_valid(i) && Some(row_id) < row_id_max { + row_id_max = Some(row_id); + } + } + + Some((*component_name, (row_id_min?, row_id_max?))) + }) + .collect() + } } } -#[cfg(test)] // do not ever use this outside internal testing, it's extremely slow and hackish -impl Eq for Chunk {} +// --- + +#[derive(Debug, Clone, PartialEq)] +pub struct ChunkTimeline { + pub(crate) timeline: Timeline, + + /// Every single timestamp for this timeline. + /// + /// * This might or might not be sorted, depending on how the data was logged. 
+ /// * This is guaranteed to always be dense, because chunks are split anytime a timeline is + /// added or removed. + /// * This cannot ever contain `TimeInt::STATIC`, since static data doesn't even have timelines. + pub(crate) times: ArrowPrimitiveArray, + + /// Is [`Self::times`] sorted? + /// + /// This is completely independent of [`Chunk::is_sorted`]: a timeline doesn't necessarily + /// follow the global [`RowId`]-based order, although it does in most cases (happy path). + pub(crate) is_sorted: bool, + + /// The time range covered by [`Self::times`]. + /// + /// Not necessarily contiguous! Just the min and max value found in [`Self::times`]. + pub(crate) time_range: ResolvedTimeRange, +} impl Chunk { /// Creates a new [`Chunk`]. @@ -155,27 +311,23 @@ impl Chunk { /// /// Iff you know for sure whether the data is already appropriately sorted or not, specify `is_sorted`. /// When left unspecified (`None`), it will be computed in O(n) time. + /// + /// For a row-oriented constructor, see [`Self::builder`]. pub fn new( id: ChunkId, entity_path: EntityPath, is_sorted: Option, - row_ids: Vec, + row_ids: ArrowStructArray, timelines: BTreeMap, - components: BTreeMap>, + components: BTreeMap>, ) -> ChunkResult { - if row_ids.is_empty() { - return Err(ChunkError::Empty); - } - let mut chunk = Self { id, entity_path, + heap_size_bytes: AtomicU64::new(0), is_sorted: false, row_ids, - timelines: timelines - .into_iter() - .filter(|(_, time_chunk)| !time_chunk.times.is_empty()) - .collect(), + timelines, components, }; @@ -186,14 +338,50 @@ impl Chunk { Ok(chunk) } + /// Creates a new [`Chunk`]. + /// + /// This will fail if the passed in data is malformed in any way -- see [`Self::sanity_check`] + /// for details. + /// + /// Iff you know for sure whether the data is already appropriately sorted or not, specify `is_sorted`. + /// When left unspecified (`None`), it will be computed in O(n) time. + /// + /// For a row-oriented constructor, see [`Self::builder`]. + pub fn from_native_row_ids( + id: ChunkId, + entity_path: EntityPath, + is_sorted: Option, + row_ids: &[RowId], + timelines: BTreeMap, + components: BTreeMap>, + ) -> ChunkResult { + let row_ids = row_ids + .to_arrow() + // NOTE: impossible, but better safe than sorry. + .map_err(|err| ChunkError::Malformed { + reason: format!("RowIds failed to serialize: {err}"), + })? + .as_any() + .downcast_ref::() + // NOTE: impossible, but better safe than sorry. + .ok_or_else(|| ChunkError::Malformed { + reason: "RowIds failed to downcast".to_owned(), + })? + .clone(); + + Self::new(id, entity_path, is_sorted, row_ids, timelines, components) + } + /// Simple helper for [`Self::new`] for static data. + /// + /// For a row-oriented constructor, see [`Self::builder`]. #[inline] pub fn new_static( id: ChunkId, entity_path: EntityPath, is_sorted: Option, - row_ids: Vec, - components: BTreeMap>, + row_ids: ArrowStructArray, + components: BTreeMap>, ) -> ChunkResult { Self::new( id, @@ -204,58 +392,74 @@ impl Chunk { components, ) } + + #[inline] + pub fn empty(id: ChunkId, entity_path: EntityPath) -> Self { + Self { + id, + entity_path, + heap_size_bytes: Default::default(), + is_sorted: true, + row_ids: ArrowStructArray::new_empty(RowId::arrow_datatype()), + timelines: Default::default(), + components: Default::default(), + } + } } impl ChunkTimeline { /// Creates a new [`ChunkTimeline`]. /// - /// Returns `None` if `times` is empty. - /// /// Iff you know for sure whether the data is already appropriately sorted or not, specify `is_sorted`. 
/// When left unspecified (`None`), it will be computed in O(n) time. - pub fn new(is_sorted: Option, times: Vec) -> Option { + /// + /// For a row-oriented constructor, see [`Self::builder`]. + pub fn new( + is_sorted: Option, + timeline: Timeline, + times: ArrowPrimitiveArray, + ) -> Self { re_tracing::profile_function!(format!("{} times", times.len())); - if times.is_empty() { - return None; - } + let times = times.to(timeline.datatype()); + let time_slice = times.values().as_slice(); let is_sorted = - is_sorted.unwrap_or_else(|| times.windows(2).all(|times| times[0] <= times[1])); + is_sorted.unwrap_or_else(|| time_slice.windows(2).all(|times| times[0] <= times[1])); let time_range = if is_sorted { - // NOTE: The 'or' in 'unwrap_or' is never hit, but better safe than sorry. - let min_time = times.first().copied().unwrap_or(TimeInt::MIN); - let max_time = times.last().copied().unwrap_or(TimeInt::MAX); + // NOTE: The 'or' in 'map_or' is never hit, but better safe than sorry. + let min_time = time_slice + .first() + .copied() + .map_or(TimeInt::MIN, TimeInt::new_temporal); + let max_time = time_slice + .last() + .copied() + .map_or(TimeInt::MAX, TimeInt::new_temporal); ResolvedTimeRange::new(min_time, max_time) } else { // NOTE: Do the iteration multiple times in a cache-friendly way rather than the opposite. // NOTE: The 'or' in 'unwrap_or' is never hit, but better safe than sorry. - let min_time = times.iter().min().copied().unwrap_or(TimeInt::MIN); - let max_time = times.iter().max().copied().unwrap_or(TimeInt::MAX); + let min_time = time_slice + .iter() + .min() + .copied() + .map_or(TimeInt::MIN, TimeInt::new_temporal); + let max_time = time_slice + .iter() + .max() + .copied() + .map_or(TimeInt::MAX, TimeInt::new_temporal); ResolvedTimeRange::new(min_time, max_time) }; - Some(Self { - times, - is_sorted, - time_range, - }) - } - - /// Push a single time value at the end of this chunk. - #[inline] - pub fn push(&mut self, time: TimeInt) { - let Self { + Self { + timeline, times, is_sorted, time_range, - } = self; - - *is_sorted &= times.last().copied().unwrap_or(TimeInt::MIN) <= time; - time_range.set_min(TimeInt::min(time_range.min(), time)); - time_range.set_max(TimeInt::max(time_range.max(), time)); - times.push(time); + } } } @@ -278,6 +482,7 @@ impl Chunk { let Self { id: _, entity_path: _, // not an actual column + heap_size_bytes: _, is_sorted: _, row_ids: _, timelines, @@ -308,23 +513,106 @@ impl Chunk { self.row_ids.len() } - /// Returns the [`RowId`]-range in this [`Chunk`]. + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + + /// Returns the [`RowId`]s in their raw-est form: a tuple of (times, counters) arrays. + #[inline] + pub fn row_ids_raw(&self) -> (&ArrowPrimitiveArray, &ArrowPrimitiveArray) { + let [times, counters] = self.row_ids.values() else { + panic!("RowIds are corrupt -- this should be impossible (sanity checked)"); + }; + + #[allow(clippy::unwrap_used)] + let times = times + .as_any() + .downcast_ref::>() + .unwrap(); // sanity checked + + #[allow(clippy::unwrap_used)] + let counters = counters + .as_any() + .downcast_ref::>() + .unwrap(); // sanity checked + + (times, counters) + } + + #[inline] + pub fn row_ids(&self) -> impl Iterator + '_ { + let (times, counters) = self.row_ids_raw(); + izip!(times.values().as_slice(), counters.values().as_slice()) + .map(|(&time, &counter)| RowId::from_u128((time as u128) << 64 | (counter as u128))) + } + + /// Returns the [`RowId`]-range covered by this [`Chunk`]. 
+ /// + /// `None` if the chunk `is_empty`. /// /// This is O(1) if the chunk is sorted, O(n) otherwise. #[inline] - pub fn row_id_range(&self) -> (RowId, RowId) { - #[allow(clippy::unwrap_used)] // cannot create empty chunks - if self.is_sorted() { + pub fn row_id_range(&self) -> Option<(RowId, RowId)> { + if self.is_empty() { + return None; + } + + let (times, counters) = self.row_ids_raw(); + let (times, counters) = (times.values().as_slice(), counters.values().as_slice()); + + #[allow(clippy::unwrap_used)] // checked above + let (index_min, index_max) = if self.is_sorted() { ( - self.row_ids.first().copied().unwrap(), - self.row_ids.last().copied().unwrap(), + ( + times.first().copied().unwrap(), + counters.first().copied().unwrap(), + ), + ( + times.last().copied().unwrap(), + counters.last().copied().unwrap(), + ), ) } else { ( - self.row_ids.iter().min().copied().unwrap(), - self.row_ids.iter().max().copied().unwrap(), + ( + times.iter().min().copied().unwrap(), + counters.iter().min().copied().unwrap(), + ), + ( + times.iter().max().copied().unwrap(), + counters.iter().max().copied().unwrap(), + ), ) - } + }; + + let (time_min, counter_min) = index_min; + let (time_max, counter_max) = index_max; + + Some(( + RowId::from_u128((time_min as u128) << 64 | (counter_min as u128)), + RowId::from_u128((time_max as u128) << 64 | (counter_max as u128)), + )) + } + + #[inline] + pub fn is_static(&self) -> bool { + self.timelines.is_empty() + } + + #[inline] + pub fn timelines(&self) -> &BTreeMap { + &self.timelines + } + + #[inline] + pub fn component_names(&self) -> impl Iterator + '_ { + self.components.keys().copied() + } + + #[inline] + pub fn components(&self) -> &BTreeMap> { + &self.components } /// Computes the maximum value for each and every timeline present across this entire chunk, @@ -349,7 +637,137 @@ impl std::fmt::Display for Chunk { } } -// TODO(cmc): sizebytes impl + sizebytes caching + sizebytes in transport metadata +impl ChunkTimeline { + #[inline] + pub fn time_range(&self) -> ResolvedTimeRange { + self.time_range + } + + #[inline] + pub fn times(&self) -> impl DoubleEndedIterator + '_ { + self.times + .values() + .as_slice() + .iter() + .copied() + .map(TimeInt::new_temporal) + } + + #[inline] + pub fn times_raw(&self) -> &[i64] { + self.times.values().as_slice() + } + + #[inline] + pub fn num_rows(&self) -> usize { + self.times.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + + /// Computes the time range covered by each individual component column. + /// + /// This is different from the time range covered by the [`ChunkTimeline`] as a whole + /// because component columns are potentially sparse. + /// + /// This is crucial for indexing and queries to work properly. + // + // TODO(cmc): This needs to be stored in chunk metadata and transported across IPC. 
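For orientation, a small self-contained sketch of the `(time, counter)` packing that `row_ids()` and `row_id_range()` above rely on: the row-id struct array stores two `u64` columns, and a 128-bit `RowId` is rebuilt by putting the time part in the high 64 bits and the counter in the low 64 bits. The helper names below are illustrative only, not crate API.

```rust
// Illustrative helpers, not part of re_chunk: rebuild and split a 128-bit row id
// from/into its (time, counter) halves, mirroring the expression used above.
fn pack_row_id(time_ns: u64, counter: u64) -> u128 {
    ((time_ns as u128) << 64) | (counter as u128)
}

fn unpack_row_id(id: u128) -> (u64, u64) {
    ((id >> 64) as u64, id as u64) // high 64 bits = time, low 64 bits = counter
}

fn main() {
    let id = pack_row_id(1_650_000_000_000_000_000, 7);
    assert_eq!(unpack_row_id(id), (1_650_000_000_000_000_000, 7));
}
```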
+ pub fn time_range_per_component( + &self, + components: &BTreeMap>, + ) -> BTreeMap { + let times = self.times_raw(); + components + .iter() + .filter_map(|(&component_name, list_array)| { + if let Some(validity) = list_array.validity() { + // _Potentially_ sparse + + if validity.is_empty() { + return None; + } + + let is_dense = validity.unset_bits() == 0; + if is_dense { + return Some((component_name, self.time_range)); + } + + let mut time_min = TimeInt::MAX; + for (i, time) in times.iter().copied().enumerate() { + if validity.get(i).unwrap_or(false) { + time_min = TimeInt::new_temporal(time); + break; + } + } + + let mut time_max = TimeInt::MIN; + for (i, time) in times.iter().copied().enumerate().rev() { + if validity.get(i).unwrap_or(false) { + time_max = TimeInt::new_temporal(time); + break; + } + } + + Some((component_name, ResolvedTimeRange::new(time_min, time_max))) + } else { + // Dense + + Some((component_name, self.time_range)) + } + }) + .collect() + } +} + +impl re_types_core::SizeBytes for Chunk { + #[inline] + fn heap_size_bytes(&self) -> u64 { + let Self { + id, + entity_path, + heap_size_bytes, + is_sorted, + row_ids, + timelines, + components, + } = self; + + let mut size_bytes = heap_size_bytes.load(Ordering::Relaxed); + + if size_bytes == 0 { + size_bytes = id.heap_size_bytes() + + entity_path.heap_size_bytes() + + is_sorted.heap_size_bytes() + + row_ids.heap_size_bytes() + + timelines.heap_size_bytes() + + components.heap_size_bytes(); + heap_size_bytes.store(size_bytes, Ordering::Relaxed); + } + + size_bytes + } +} + +impl re_types_core::SizeBytes for ChunkTimeline { + #[inline] + fn heap_size_bytes(&self) -> u64 { + let Self { + timeline, + times, + is_sorted, + time_range, + } = self; + + timeline.heap_size_bytes() + + times.heap_size_bytes() // cheap + + is_sorted.heap_size_bytes() + + time_range.heap_size_bytes() + } +} // TODO(cmc): methods to merge chunks (compaction). 
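The `SizeBytes` implementation above memoizes the measured size in the chunk's `heap_size_bytes: AtomicU64`, so only the first call pays for the traversal. A minimal sketch of that caching pattern, with a stand-in `measure` closure in place of summing the per-field heap sizes:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Sketch of the lazy size cache: 0 means "not measured yet"; once measured, the
// value is stored and every later call is a single relaxed atomic load.
struct SizeCache {
    heap_size_bytes: AtomicU64,
}

impl SizeCache {
    fn get(&self, measure: impl FnOnce() -> u64) -> u64 {
        let mut size = self.heap_size_bytes.load(Ordering::Relaxed);
        if size == 0 {
            size = measure();
            self.heap_size_bytes.store(size, Ordering::Relaxed);
        }
        size
    }
}

fn main() {
    let cache = SizeCache { heap_size_bytes: AtomicU64::new(0) };
    assert_eq!(cache.get(|| 1234), 1234); // measured once
    assert_eq!(cache.get(|| 9999), 1234); // served from the cache afterwards
}
```

Relaxed ordering suffices here because the cached value is derived from otherwise immutable data: at worst two threads measure concurrently and store the same result.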
@@ -365,27 +783,51 @@ impl Chunk { let Self { id: _, entity_path: _, + heap_size_bytes, is_sorted, row_ids, timelines, components, } = self; - if row_ids.is_empty() || components.is_empty() { - return Err(ChunkError::Empty); + #[allow(clippy::collapsible_if)] // readability + if cfg!(debug_assertions) { + let measured = self.heap_size_bytes(); + let advertised = heap_size_bytes.load(Ordering::Relaxed); + if advertised != measured { + return Err(ChunkError::Malformed { + reason: format!( + "Chunk advertises a heap size of {} but we measure {} instead", + re_format::format_bytes(advertised as _), + re_format::format_bytes(measured as _), + ), + }); + } } // Row IDs - #[allow(clippy::collapsible_if)] // readability - if cfg!(debug_assertions) { - if *is_sorted != self.is_sorted_uncached() { + { + if *row_ids.data_type().to_logical_type() != RowId::arrow_datatype() { return Err(ChunkError::Malformed { reason: format!( - "Chunk is marked as {}sorted but isn't: {row_ids:?}", - if *is_sorted { "" } else { "un" }, + "RowId data has the wrong datatype: expected {:?} but got {:?} instead", + RowId::arrow_datatype(), + *row_ids.data_type(), ), }); } + + #[allow(clippy::collapsible_if)] // readability + if cfg!(debug_assertions) { + if *is_sorted != self.is_sorted_uncached() { + return Err(ChunkError::Malformed { + reason: format!( + "Chunk is marked as {}sorted but isn't: {row_ids:?}", + if *is_sorted { "" } else { "un" }, + ), + }); + } + } } // Timelines @@ -432,6 +874,18 @@ impl Chunk { ), }); } + + let validity_is_empty = list_array + .validity() + .map_or(false, |validity| validity.is_empty()); + if !self.is_empty() && validity_is_empty { + return Err(ChunkError::Malformed { + reason: format!( + "All component batches in a chunk must contain at least one non-null entry.\ + Found a completely empty column for {component_name}", + ), + }); + } } Ok(()) @@ -444,11 +898,25 @@ impl ChunkTimeline { /// Costly checks are only run in debug builds. 
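Both sanity checkers follow the same convention: structural invariants are always verified, while the O(n) scans (advertised sort order, tight time bounds, byte sizes) only run when `cfg!(debug_assertions)` is set. A tiny sketch of that convention, with plain slices standing in for the Arrow arrays:

```rust
// Sketch only: the O(n) comparison of the advertised flag against the measured
// one runs in debug builds; release builds skip it entirely.
fn check_sorted_flag(times: &[i64], advertised_sorted: bool) -> Result<(), String> {
    if cfg!(debug_assertions) {
        let measured = times.windows(2).all(|w| w[0] <= w[1]);
        if advertised_sorted != measured {
            return Err(format!(
                "advertised sorted={advertised_sorted} but measured sorted={measured}"
            ));
        }
    }
    Ok(())
}

fn main() {
    assert!(check_sorted_flag(&[1, 2, 3], true).is_ok());
}
```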
pub fn sanity_check(&self) -> ChunkResult<()> { let Self { + timeline, times, is_sorted, time_range, } = self; + if *times.data_type() != timeline.datatype() { + return Err(ChunkError::Malformed { + reason: format!( + "Time data for timeline {} has the wrong datatype: expected {:?} but got {:?} instead", + timeline.name(), + timeline.datatype(), + *times.data_type(), + ), + }); + } + + let times = times.values().as_slice(); + #[allow(clippy::collapsible_if)] // readability if cfg!(debug_assertions) { if *is_sorted != times.windows(2).all(|times| times[0] <= times[1]) { @@ -463,26 +931,27 @@ impl ChunkTimeline { #[allow(clippy::collapsible_if)] // readability if cfg!(debug_assertions) { - let is_tight_bound = times.iter().any(|&time| time == time_range.min()) - && times.iter().any(|&time| time == time_range.max()); - if !is_tight_bound { + let is_tight_lower_bound = times.iter().any(|&time| time == time_range.min().as_i64()); + let is_tight_upper_bound = times.iter().any(|&time| time == time_range.max().as_i64()); + let is_tight_bound = is_tight_lower_bound && is_tight_upper_bound; + + if !self.is_empty() && !is_tight_bound { return Err(ChunkError::Malformed { reason: "Chunk timeline's cached time range isn't a tight bound.".to_owned(), }); } for &time in times { - if time < time_range.min() || time > time_range.max() { + if time < time_range.min().as_i64() || time > time_range.max().as_i64() { return Err(ChunkError::Malformed { reason: format!( "Chunk timeline's cached time range is wrong.\ - Found a time value of {} while its time range is {time_range:?}", - time.as_i64(), + Found a time value of {time} while its time range is {time_range:?}", ), }); } - if time.is_static() { + if time == TimeInt::STATIC.as_i64() { return Err(ChunkError::Malformed { reason: "A chunk's timeline should never contain a static time value." .to_owned(), diff --git a/crates/re_chunk/src/id.rs b/crates/re_chunk/src/id.rs new file mode 100644 index 000000000000..c35b08f34717 --- /dev/null +++ b/crates/re_chunk/src/id.rs @@ -0,0 +1,249 @@ +/// A unique ID for a [`crate::Chunk`]. +/// +/// `Chunk`s are the atomic unit of ingestion, transport, storage, events and GC in Rerun. +/// +/// Internally, a [`crate::Chunk`] is made up of rows, which are themselves uniquely identified by +/// their [`RowId`]. +/// +/// There is no relationship whatsoever between a [`ChunkId`] and the [`RowId`]s within that chunk. +/// +/// ### Uniqueness +/// +/// [`ChunkId`] are assumed unique within a single Recording. +/// +/// The chunk store will treat two chunks with the same [`ChunkId`] as the same, and only keep one +/// of them (which one is kept is an arbitrary and unstable implementation detail). +/// +/// This makes it easy to build and maintain secondary indices around [`RowId`]s with few to no +/// extraneous state tracking. +/// +/// ### Garbage collection +/// +/// Garbage collection is handled at the chunk level by first ordering the chunks based on the minimum +/// [`RowId`] present in each of them. +/// Garbage collection therefore happens (roughly) in the logger's wall-clock order. +/// +/// This has very important implications when inserting data far into the past or into the future: +/// think carefully about your `RowId`s in these cases. 
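Because collection ranks whole chunks by the smallest [`RowId`] they contain, a chunk holding very old row ids becomes an early eviction candidate regardless of when it was actually ingested. A toy sketch of that ordering, with plain integers standing in for row ids:

```rust
// Toy sketch, not crate API: order chunks for garbage collection by the minimum
// row id found in each of them, i.e. (roughly) by logger wall-clock order.
fn gc_order(mut chunks: Vec<(&'static str, u128)>) -> Vec<&'static str> {
    // Each entry is (chunk label, minimum row id contained in that chunk).
    chunks.sort_by_key(|&(_, min_row_id)| min_row_id);
    chunks.into_iter().map(|(label, _)| label).collect()
}

fn main() {
    let order = gc_order(vec![("recent", 300), ("oldest", 100), ("middle", 200)]);
    assert_eq!(order, vec!["oldest", "middle", "recent"]); // evicted first to last
}
```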
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +pub struct ChunkId(pub(crate) re_tuid::Tuid); + +impl std::fmt::Display for ChunkId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl ChunkId { + pub const ZERO: Self = Self(re_tuid::Tuid::ZERO); + pub const MAX: Self = Self(re_tuid::Tuid::MAX); + + /// Create a new unique [`ChunkId`] based on the current time. + #[allow(clippy::new_without_default)] + #[inline] + pub fn new() -> Self { + Self(re_tuid::Tuid::new()) + } + + /// Returns the next logical [`ChunkId`]. + /// + /// Beware: wrong usage can easily lead to conflicts. + /// Prefer [`ChunkId::new`] when unsure. + #[must_use] + #[inline] + pub fn next(&self) -> Self { + Self(self.0.next()) + } + + /// Returns the `n`-next logical [`ChunkId`]. + /// + /// This is equivalent to calling [`ChunkId::next`] `n` times. + /// Wraps the monotonically increasing back to zero on overflow. + /// + /// Beware: wrong usage can easily lead to conflicts. + /// Prefer [`ChunkId::new`] when unsure. + #[must_use] + #[inline] + pub fn incremented_by(&self, n: u64) -> Self { + Self(self.0.incremented_by(n)) + } + + /// When the `ChunkId` was created, in nanoseconds since unix epoch. + #[inline] + pub fn nanoseconds_since_epoch(&self) -> u64 { + self.0.nanoseconds_since_epoch() + } + + #[inline] + pub fn from_u128(id: u128) -> Self { + Self(re_tuid::Tuid::from_u128(id)) + } + + #[inline] + pub fn as_u128(&self) -> u128 { + self.0.as_u128() + } +} + +impl re_types_core::SizeBytes for ChunkId { + #[inline] + fn heap_size_bytes(&self) -> u64 { + 0 + } + + #[inline] + fn is_pod() -> bool { + true + } +} + +impl std::ops::Deref for ChunkId { + type Target = re_tuid::Tuid; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::ops::DerefMut for ChunkId { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +re_types_core::delegate_arrow_tuid!(ChunkId as "rerun.controls.ChunkId"); + +// --- + +/// A unique ID for a row's worth of data within a chunk. +/// +/// There is no relationship whatsoever between a [`ChunkId`] and the [`RowId`]s within that chunk. +/// +/// ### Uniqueness +/// +/// Duplicated [`RowId`]s within a single recording is considered undefined behavior. +/// +/// While it is benign in most cases, care has to be taken when manually crafting [`RowId`]s. +/// Ideally: don't do so and stick to [`RowId::new`] instead to avoid bad surprises. +/// +/// This makes it easy to build and maintain secondary indices around [`RowId`]s with few to no +/// extraneous state tracking. +/// +/// ### Query +/// +/// Queries (both latest-at & range semantics) will defer to `RowId` order as a tie-breaker when +/// looking at several rows worth of data that rest at the exact same timestamp. +/// +/// In pseudo-code: +/// ```text +/// rr.set_time_sequence("frame", 10) +/// +/// rr.log("my_entity", point1, row_id=#1) +/// rr.log("my_entity", point2, row_id=#0) +/// +/// rr.query("my_entity", at=("frame", 10)) # returns `point1` +/// ``` +/// +/// Think carefully about your `RowId`s when logging a lot of data at the same timestamp. +/// +/// ### Garbage collection +/// +/// Garbage collection is handled at the chunk level by first ordering the chunks based on the minimum +/// [`RowId`] present in each of them. +/// Garbage collection therefore happens (roughly) in the logger's wall-clock order. 
+/// +/// This has very important implications when inserting data far into the past or into the future: +/// think carefully about your `RowId`s in these cases. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +pub struct RowId(pub(crate) re_tuid::Tuid); + +impl std::fmt::Display for RowId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl RowId { + pub const ZERO: Self = Self(re_tuid::Tuid::ZERO); + pub const MAX: Self = Self(re_tuid::Tuid::MAX); + + /// Create a new unique [`RowId`] based on the current time. + #[allow(clippy::new_without_default)] + #[inline] + pub fn new() -> Self { + Self(re_tuid::Tuid::new()) + } + + /// Returns the next logical [`RowId`]. + /// + /// Beware: wrong usage can easily lead to conflicts. + /// Prefer [`RowId::new`] when unsure. + #[must_use] + #[inline] + pub fn next(&self) -> Self { + Self(self.0.next()) + } + + /// Returns the `n`-next logical [`RowId`]. + /// + /// This is equivalent to calling [`RowId::next`] `n` times. + /// Wraps the monotonically increasing back to zero on overflow. + /// + /// Beware: wrong usage can easily lead to conflicts. + /// Prefer [`RowId::new`] when unsure. + #[must_use] + #[inline] + pub fn incremented_by(&self, n: u64) -> Self { + Self(self.0.incremented_by(n)) + } + + /// When the `RowId` was created, in nanoseconds since unix epoch. + #[inline] + pub fn nanoseconds_since_epoch(&self) -> u64 { + self.0.nanoseconds_since_epoch() + } + + #[inline] + pub fn from_u128(id: u128) -> Self { + Self(re_tuid::Tuid::from_u128(id)) + } + + #[inline] + pub fn as_u128(&self) -> u128 { + self.0.as_u128() + } +} + +impl re_types_core::SizeBytes for RowId { + #[inline] + fn heap_size_bytes(&self) -> u64 { + 0 + } + + #[inline] + fn is_pod() -> bool { + true + } +} + +impl std::ops::Deref for RowId { + type Target = re_tuid::Tuid; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::ops::DerefMut for RowId { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +re_types_core::delegate_arrow_tuid!(RowId as "rerun.controls.RowId"); diff --git a/crates/re_chunk/src/iter.rs b/crates/re_chunk/src/iter.rs new file mode 100644 index 000000000000..6c0345fec79a --- /dev/null +++ b/crates/re_chunk/src/iter.rs @@ -0,0 +1,55 @@ +use arrow2::array::Array as ArrowArray; + +use re_log_types::{TimeInt, Timeline}; +use re_types_core::ComponentName; + +use crate::{Chunk, RowId}; + +// --- + +impl Chunk { + /// Returns an iterator over the rows of the [`Chunk`]. + /// + /// Each yielded item is a component batch with its associated index ([`RowId`] + data time). + /// + /// Iterating a [`Chunk`] on a row basis is very wasteful, performance-wise. + /// Prefer columnar access when possible. + // + // TODO(cmc): a row-based iterator is obviously not what we want -- one of the benefits of + // chunks is to amortize the cost of downcasting & "deserialization". + // But at the moment we still need to run with the native deserialization cache, which expects + // row-based data. + // As soon as we remove the native cache and start exposing `Chunk`s directly to downstream + // systems, we will look into ergonomic ways to do columnar access. 
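The iterator below pairs every row with a data time, and a static chunk has no time column at all, so the per-row time falls back to `TimeInt::STATIC` by chaining an endless repeat behind the (possibly empty) real times. A simplified sketch of that fallback, using a plain `i64` marker in place of `TimeInt::STATIC`:

```rust
// Simplified sketch: real times first, then an endless repeat of the static
// marker, truncated to the number of rows. For a static chunk `times` is empty,
// so every row ends up with the marker.
fn data_times(times: &[i64], num_rows: usize, static_marker: i64) -> Vec<i64> {
    times
        .iter()
        .copied()
        .chain(std::iter::repeat(static_marker))
        .take(num_rows)
        .collect()
}

fn main() {
    assert_eq!(data_times(&[10, 20, 30], 3, i64::MIN), vec![10, 20, 30]);
    assert_eq!(data_times(&[], 2, i64::MIN), vec![i64::MIN, i64::MIN]); // static chunk
}
```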
+ pub fn iter_rows( + &self, + timeline: &Timeline, + component_name: &ComponentName, + ) -> impl Iterator>)> + '_ { + let Self { + id: _, + entity_path: _, + heap_size_bytes: _, + is_sorted: _, + row_ids: _, + timelines, + components, + } = self; + + let row_ids = self.row_ids(); + + let data_times = timelines + .get(timeline) + .into_iter() + .flat_map(|time_chunk| time_chunk.times().collect::>()) + // If there's no time data, then the associate data time must be `TimeInt::STATIC`. + .chain(std::iter::repeat(TimeInt::STATIC)); + + let arrays = components + .get(component_name) + .into_iter() + .flat_map(|list_array| list_array.into_iter()); + + itertools::izip!(data_times, row_ids, arrays) + } +} diff --git a/crates/re_chunk/src/latest_at.rs b/crates/re_chunk/src/latest_at.rs new file mode 100644 index 000000000000..3940131bb5c8 --- /dev/null +++ b/crates/re_chunk/src/latest_at.rs @@ -0,0 +1,170 @@ +use arrow2::array::Array as ArrowArray; + +use re_log_types::{TimeInt, Timeline}; +use re_types_core::ComponentName; + +use crate::{Chunk, RowId}; + +// --- + +/// A query at a given time, for a given timeline. +/// +/// Get the latest version of the data available at this time. +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct LatestAtQuery { + timeline: Timeline, + at: TimeInt, +} + +impl std::fmt::Debug for LatestAtQuery { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!( + "", + self.timeline.typ().format_utc(self.at), + self.timeline.name(), + )) + } +} + +impl LatestAtQuery { + /// The returned query is guaranteed to never include [`TimeInt::STATIC`]. + #[inline] + pub fn new(timeline: Timeline, at: impl TryInto) -> Self { + let at = at.try_into().unwrap_or(TimeInt::MIN); + Self { timeline, at } + } + + #[inline] + pub const fn latest(timeline: Timeline) -> Self { + Self { + timeline, + at: TimeInt::MAX, + } + } + + #[inline] + pub fn timeline(&self) -> Timeline { + self.timeline + } + + #[inline] + pub fn at(&self) -> TimeInt { + self.at + } +} + +// --- + +impl Chunk { + /// Runs a [`LatestAtQuery`] filter on a [`Chunk`]. + /// + /// This behaves as a row-based filter: the result is a new [`Chunk`] that is vertically + /// sliced to only contain the row relevant for the specified `query`. + /// + /// The resulting [`Chunk`] is guaranteed to contain all the same columns has the queried + /// chunk: there is no horizontal slicing going on. + /// + /// An empty [`Chunk`] (i.e. 0 rows, but N columns) is returned if the `query` yields nothing. + /// + /// Because the resulting chunk doesn't discard any column information, you can find extra relevant + /// information by inspecting the data, for examples timestamps on other timelines. + /// See [`Self::timeline_sliced`] and [`Self::component_sliced`] if you do want to filter this + /// extra data. 
+ pub fn latest_at(&self, query: &LatestAtQuery, component_name: ComponentName) -> Self { + if self.is_empty() { + return self.clone(); + } + + re_tracing::profile_function!(format!("{query:?}")); + + let Some(component_list_array) = self.components.get(&component_name) else { + return self.emptied(); + }; + + let mut index = None; + + let is_static = self.is_static(); + let is_sorted_by_row_id = self.is_sorted(); + + if is_static { + if is_sorted_by_row_id { + // Static, row-sorted chunk + + for i in (0..self.num_rows()).rev() { + if !component_list_array.is_valid(i) { + continue; + } + + index = Some(i); + break; + } + } else { + // Static, row-unsorted chunk + + let mut closest_row_id = RowId::ZERO; + + for (i, row_id) in self.row_ids().enumerate() { + if !component_list_array.is_valid(i) { + continue; + } + + let is_closer_row_id = row_id > closest_row_id; + + if is_closer_row_id { + closest_row_id = row_id; + index = Some(i); + } + } + } + } else { + let Some(time_chunk) = self.timelines.get(&query.timeline()) else { + return self.emptied(); + }; + + let is_sorted_by_time = time_chunk.is_sorted(); + let times = time_chunk.times_raw(); + + if is_sorted_by_time { + // Temporal, row-sorted, time-sorted chunk + + let i = times + .partition_point(|&time| time <= query.at().as_i64()) + .saturating_sub(1); + + for i in (0..=i).rev() { + if !component_list_array.is_valid(i) { + continue; + } + + index = Some(i); + break; + } + } else { + // Temporal, unsorted chunk + + let mut closest_data_time = TimeInt::MIN; + let mut closest_row_id = RowId::ZERO; + + for (i, row_id) in self.row_ids().enumerate() { + if !component_list_array.is_valid(i) { + continue; + } + + let data_time = TimeInt::new_temporal(times[i]); + + let is_closer_time = data_time > closest_data_time && data_time <= query.at(); + let is_same_time_but_closer_row_id = + data_time == closest_data_time && row_id > closest_row_id; + + if is_closer_time || is_same_time_but_closer_row_id { + closest_data_time = data_time; + closest_row_id = row_id; + index = Some(i); + } + } + } + } + + index.map_or_else(|| self.emptied(), |i| self.row_sliced(i, 1)) + } +} diff --git a/crates/re_chunk/src/lib.rs b/crates/re_chunk/src/lib.rs index 85893096d22e..2f0719956c11 100644 --- a/crates/re_chunk/src/lib.rs +++ b/crates/re_chunk/src/lib.rs @@ -4,23 +4,46 @@ #![doc = document_features::document_features!()] //! 
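The sorted fast path of `Chunk::latest_at` above boils down to one binary search for the last row at or before the query time, followed by a backwards scan until a row that actually holds data for the component. A simplified, self-contained sketch of that idea (not the exact crate code; validity is reduced to a boolean slice):

```rust
// Simplified sketch of the time-sorted latest-at path: `has_data[i]` stands in
// for the component list-array's validity bitmap.
fn latest_at_sorted(times: &[i64], has_data: &[bool], query_at: i64) -> Option<usize> {
    // Last index whose time is <= the query time; None if everything is later.
    let last = times.partition_point(|&t| t <= query_at).checked_sub(1)?;
    // Walk backwards until a row that actually has a value for the component.
    (0..=last).rev().find(|&i| has_data[i])
}

fn main() {
    let times = [10, 20, 30, 40];
    let has_data = [true, false, true, false];
    assert_eq!(latest_at_sorted(&times, &has_data, 35), Some(2));
    assert_eq!(latest_at_sorted(&times, &has_data, 25), Some(0));
    assert_eq!(latest_at_sorted(&times, &has_data, 5), None);
}
```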
+mod builder; mod chunk; +mod id; +mod iter; +mod latest_at; +mod range; mod shuffle; +mod slice; mod transport; -mod util; +pub mod util; #[cfg(not(target_arch = "wasm32"))] mod batcher; -pub use self::chunk::{Chunk, ChunkError, ChunkId, ChunkResult, ChunkTimeline}; +pub use self::builder::{ChunkBuilder, ChunkTimelineBuilder}; +pub use self::chunk::{Chunk, ChunkError, ChunkResult, ChunkTimeline}; +pub use self::id::{ChunkId, RowId}; +pub use self::latest_at::LatestAtQuery; +pub use self::range::RangeQuery; pub use self::transport::TransportChunk; -pub use self::util::arrays_to_list_array; #[cfg(not(target_arch = "wasm32"))] pub use self::batcher::{ ChunkBatcher, ChunkBatcherConfig, ChunkBatcherError, ChunkBatcherResult, PendingRow, }; +// Re-exports + +#[doc(no_inline)] +pub use arrow2::array::Array as ArrowArray; +#[doc(no_inline)] +pub use re_log_types::{EntityPath, TimeInt, TimePoint, Timeline, TimelineName}; +#[doc(no_inline)] +pub use re_types_core::ComponentName; + pub mod external { pub use arrow2; + + pub use re_log_types; + + #[cfg(not(target_arch = "wasm32"))] + pub use crossbeam; } diff --git a/crates/re_chunk/src/range.rs b/crates/re_chunk/src/range.rs new file mode 100644 index 000000000000..abe485e15178 --- /dev/null +++ b/crates/re_chunk/src/range.rs @@ -0,0 +1,128 @@ +use re_log_types::{ResolvedTimeRange, TimeInt, Timeline}; +use re_types_core::ComponentName; + +use crate::Chunk; + +// --- Range --- + +/// A query over a time range, for a given timeline. +/// +/// Get all the data within this time interval, plus the latest one before the start of the +/// interval. +/// +/// Motivation: all data is considered alive until the next logging to the same component path. +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct RangeQuery { + pub timeline: Timeline, + pub range: ResolvedTimeRange, +} + +impl std::fmt::Debug for RangeQuery { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!( + " Self { + Self { timeline, range } + } + + #[inline] + pub const fn everything(timeline: Timeline) -> Self { + Self { + timeline, + range: ResolvedTimeRange::EVERYTHING, + } + } + + #[inline] + pub fn timeline(&self) -> Timeline { + self.timeline + } + + #[inline] + pub fn range(&self) -> ResolvedTimeRange { + self.range + } +} + +// --- + +impl Chunk { + /// Runs a [`RangeQuery`] filter on a [`Chunk`]. + /// + /// This behaves as a row-based filter: the result is a new [`Chunk`] that is vertically + /// sliced, sorted and filtered in order to only contain the row(s) relevant for the + /// specified `query`. + /// + /// The resulting [`Chunk`] is guaranteed to contain all the same columns has the queried + /// chunk: there is no horizontal slicing going on. + /// + /// An empty [`Chunk`] (i.e. 0 rows, but N columns) is returned if the `query` yields nothing. + /// + /// Because the resulting chunk doesn't discard any column information, you can find extra relevant + /// information by inspecting the data, for examples timestamps on other timelines. + /// See [`Self::timeline_sliced`] and [`Self::component_sliced`] if you do want to filter this + /// extra data. + // + // TODO(#3741): Since we don't have access to arrow's ListView yet, we must actually clone the + // data if the chunk requires sorting. 
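Once the chunk has been densified on the component and sorted on the queried timeline, the temporal branch of `range` below reduces to two binary searches that bound the row window handed to `row_sliced`. A standalone sketch of that computation:

```rust
// Sketch of the range bounds: `start` is the first row with time >= min, `end`
// is the first row with time > max, and the result is expressed as the
// (index, len) pair that row_sliced expects.
fn range_rows(times: &[i64], min: i64, max: i64) -> (usize, usize) {
    let start = times.partition_point(|&t| t < min);
    let end = times.partition_point(|&t| t <= max);
    (start, end.saturating_sub(start))
}

fn main() {
    let times = [10, 20, 30, 40, 50];
    assert_eq!(range_rows(&times, 20, 40), (1, 3)); // rows at 20, 30, 40
    assert_eq!(range_rows(&times, 60, 70), (5, 0)); // nothing in range
}
```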
+ pub fn range(&self, query: &RangeQuery, component_name: ComponentName) -> Self { + if self.is_empty() { + return self.clone(); + } + + re_tracing::profile_function!(format!("{query:?}")); + + let is_static = self.is_static(); + + if is_static { + // NOTE: A given component for a given entity can only have one static entry associated + // with it, and this entry overrides everything else, which means it is functionally + // equivalent to just running a latest-at query. + self.latest_at( + &crate::LatestAtQuery::new(query.timeline(), TimeInt::MAX), + component_name, + ) + } else { + let Some(is_sorted_by_time) = self + .timelines + .get(&query.timeline()) + .map(|time_chunk| time_chunk.is_sorted()) + else { + return self.emptied(); + }; + + let chunk = self.densified(component_name); + + let chunk = if is_sorted_by_time { + // Temporal, row-sorted, time-sorted chunk + chunk + } else { + // Temporal, unsorted chunk + chunk.sorted_by_timeline_if_unsorted(&query.timeline()) + }; + + let Some(times) = chunk + .timelines + .get(&query.timeline()) + .map(|time_chunk| time_chunk.times_raw()) + else { + return chunk.emptied(); + }; + + let start_index = times.partition_point(|&time| time < query.range().min().as_i64()); + let end_index = times.partition_point(|&time| time <= query.range().max().as_i64()); + + chunk.row_sliced(start_index, end_index.saturating_sub(start_index)) + } + } +} diff --git a/crates/re_chunk/src/shuffle.rs b/crates/re_chunk/src/shuffle.rs index 7683fabf398a..636861b44d91 100644 --- a/crates/re_chunk/src/shuffle.rs +++ b/crates/re_chunk/src/shuffle.rs @@ -1,15 +1,19 @@ use arrow2::{ - array::{Array as ArrowArray, ListArray as ArrowListArray}, + array::{ + Array as ArrowArray, ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray, + StructArray, + }, offset::Offsets as ArrowOffsets, }; use itertools::Itertools as _; +use re_log_types::Timeline; use crate::{Chunk, ChunkTimeline}; // --- impl Chunk { - /// Is the chunk currently ascendingly sorted by [`re_log_types::RowId`]? + /// Is the chunk currently ascendingly sorted by [`crate::RowId`]? /// /// This is O(1) (cached). /// @@ -29,9 +33,9 @@ impl Chunk { pub fn is_sorted_uncached(&self) -> bool { re_tracing::profile_function!(); - self.row_ids - .windows(2) - .all(|row_ids| row_ids[0] <= row_ids[1]) + self.row_ids() + .tuple_windows::<(_, _)>() + .all(|row_ids| row_ids.0 <= row_ids.1) } /// Sort the chunk, if needed. @@ -45,17 +49,20 @@ impl Chunk { re_tracing::profile_function!(); + #[cfg(not(target_arch = "wasm32"))] let now = std::time::Instant::now(); let swaps = { re_tracing::profile_scope!("swaps"); - let mut swaps = (0..self.row_ids.len()).collect::>(); - swaps.sort_by_key(|&i| self.row_ids[i]); + let row_ids = self.row_ids().collect_vec(); + let mut swaps = (0..row_ids.len()).collect::>(); + swaps.sort_by_key(|&i| row_ids[i]); swaps }; self.shuffle_with(&swaps); + #[cfg(not(target_arch = "wasm32"))] re_log::trace!( entity_path = %self.entity_path, num_rows = self.row_ids.len(), @@ -68,6 +75,49 @@ impl Chunk { self.sanity_check().unwrap(); } + /// Returns a new [`Chunk`] that is sorted by `(, RowId)`. + /// + /// The underlying arrow data will be copied and shuffled in memory in order to make it contiguous. + /// + /// This is a no-op if the underlying timeline is already sorted appropriately (happy path). 
+ pub fn sorted_by_timeline_if_unsorted(&self, timeline: &Timeline) -> Self { + re_tracing::profile_function!(); + + let mut chunk = self.clone(); + + let Some(time_chunk) = chunk.timelines.get(timeline) else { + return chunk; + }; + + #[cfg(not(target_arch = "wasm32"))] + let now = std::time::Instant::now(); + + let swaps = { + re_tracing::profile_scope!("swaps"); + let row_ids = chunk.row_ids().collect_vec(); + let times = time_chunk.times_raw().to_vec(); + let mut swaps = (0..times.len()).collect::>(); + swaps.sort_by_key(|&i| (times[i], row_ids[i])); + swaps + }; + + chunk.shuffle_with(&swaps); + + #[cfg(not(target_arch = "wasm32"))] + re_log::trace!( + entity_path = %chunk.entity_path, + num_rows = chunk.row_ids.len(), + elapsed = ?now.elapsed(), + "chunk sorted", + ); + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // dev only + chunk.sanity_check().unwrap(); + + chunk + } + /// Randomly shuffles the chunk using the given `seed`. /// /// The underlying arrow data will be copied and shuffled in memory in order to make it contiguous. @@ -75,6 +125,7 @@ impl Chunk { pub fn shuffle_random(&mut self, seed: u64) { re_tracing::profile_function!(); + #[cfg(not(target_arch = "wasm32"))] let now = std::time::Instant::now(); use rand::{seq::SliceRandom as _, SeedableRng as _}; @@ -89,6 +140,7 @@ impl Chunk { self.shuffle_with(&swaps); + #[cfg(not(target_arch = "wasm32"))] re_log::trace!( entity_path = %self.entity_path, num_rows = self.row_ids.len(), @@ -108,42 +160,59 @@ impl Chunk { pub(crate) fn shuffle_with(&mut self, swaps: &[usize]) { re_tracing::profile_function!(); - let Self { - id: _, - entity_path: _, - is_sorted: _, - row_ids, - timelines, - components, - } = self; - // Row IDs { re_tracing::profile_scope!("row ids"); - let original = row_ids.clone(); + let (times, counters) = self.row_ids_raw(); + let (times, counters) = (times.values(), counters.values()); + + let mut sorted_times = times.to_vec(); + let mut sorted_counters = counters.to_vec(); for (to, from) in swaps.iter().copied().enumerate() { - row_ids[to] = original[from]; + sorted_times[to] = times[from]; + sorted_counters[to] = counters[from]; } + + let times = ArrowPrimitiveArray::::from_vec(sorted_times).boxed(); + let counters = ArrowPrimitiveArray::::from_vec(sorted_counters).boxed(); + + self.row_ids = StructArray::new( + self.row_ids.data_type().clone(), + vec![times, counters], + None, + ); } + let Self { + id: _, + entity_path: _, + heap_size_bytes: _, + is_sorted: _, + row_ids: _, + timelines, + components, + } = self; + // Timelines { re_tracing::profile_scope!("timelines"); for info in timelines.values_mut() { let ChunkTimeline { + timeline, times, is_sorted, time_range: _, } = info; - let original = times.clone(); + let mut sorted = times.values().to_vec(); for (to, from) in swaps.iter().copied().enumerate() { - times[to] = original[from]; + sorted[to] = times.values()[from]; } - *is_sorted = times.windows(2).all(|times| times[0] <= times[1]); + *is_sorted = sorted.windows(2).all(|times| times[0] <= times[1]); + *times = ArrowPrimitiveArray::::from_vec(sorted).to(timeline.datatype()); } } @@ -153,16 +222,10 @@ impl Chunk { re_tracing::profile_scope!("components (offsets & data)"); { for original in components.values_mut() { - #[allow(clippy::unwrap_used)] // a chunk's column is always a list array - let original_list = original - .as_any() - .downcast_ref::>() - .unwrap(); - let sorted_arrays = swaps .iter() .copied() - .map(|from| original_list.value(from)) + .map(|from| original.value(from)) 
.collect_vec(); let sorted_arrays = sorted_arrays .iter() @@ -176,12 +239,11 @@ impl Chunk { .unwrap(); #[allow(clippy::unwrap_used)] // these are slices of the same outer array let values = arrow2::compute::concatenate::concatenate(&sorted_arrays).unwrap(); - let validity = original_list + let validity = original .validity() .map(|validity| swaps.iter().map(|&from| validity.get_bit(from)).collect()); - *original = - ArrowListArray::::new(datatype, offsets.into(), values, validity).boxed(); + *original = ArrowListArray::::new(datatype, offsets.into(), values, validity); } } @@ -209,7 +271,9 @@ impl ChunkTimeline { #[inline] pub fn is_sorted_uncached(&self) -> bool { re_tracing::profile_function!(); - self.times.windows(2).all(|times| times[0] <= times[1]) + self.times_raw() + .windows(2) + .all(|times| times[0] <= times[1]) } } @@ -217,11 +281,11 @@ impl ChunkTimeline { mod tests { use re_log_types::{ example_components::{MyColor, MyPoint}, - EntityPath, RowId, TimeInt, Timeline, + EntityPath, Timeline, }; use re_types_core::Loggable as _; - use crate::{arrays_to_list_array, ChunkId}; + use crate::{ChunkId, RowId}; use super::*; @@ -230,72 +294,62 @@ mod tests { let entity_path: EntityPath = "a/b/c".into(); let timeline1 = Timeline::new_temporal("log_time"); - let timeline2 = Timeline::new_temporal("frame_nr"); + let timeline2 = Timeline::new_sequence("frame_nr"); - let points1 = MyPoint::to_arrow([ + let points1 = vec![ MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0), MyPoint::new(5.0, 6.0), - ])?; - let points2 = None; - let points3 = MyPoint::to_arrow([MyPoint::new(10.0, 20.0)])?; - let points4 = MyPoint::to_arrow([MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)])?; + ]; + let points3 = vec![MyPoint::new(10.0, 20.0)]; + let points4 = vec![MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)]; - let colors1 = MyColor::to_arrow([ + let colors1 = vec![ MyColor::from_rgb(1, 2, 3), MyColor::from_rgb(4, 5, 6), MyColor::from_rgb(7, 8, 9), - ])?; - let colors2 = MyColor::to_arrow([MyColor::from_rgb(10, 20, 30)])?; - let colors3 = None; - let colors4 = MyColor::to_arrow([ + ]; + let colors2 = vec![MyColor::from_rgb(10, 20, 30)]; + let colors4 = vec![ MyColor::from_rgb(101, 102, 103), MyColor::from_rgb(104, 105, 106), - ])?; - - let timelines = [ - ( - timeline1, - ChunkTimeline::new( - Some(true), - [1000, 1001, 1002, 1003].map(TimeInt::new_temporal).to_vec(), - ) - .unwrap(), - ), - ( - timeline2, - ChunkTimeline::new( - Some(true), - [42, 43, 44, 45].map(TimeInt::new_temporal).to_vec(), - ) - .unwrap(), - ), ]; - let components = [ - ( - MyPoint::name(), - arrays_to_list_array(&[Some(&*points1), points2, Some(&*points3), Some(&*points4)]) - .unwrap(), - ), - ( - MyPoint::name(), - arrays_to_list_array(&[Some(&*colors1), Some(&*colors2), colors3, Some(&*colors4)]) - .unwrap(), - ), - ]; - - let row_ids = vec![RowId::new(), RowId::new(), RowId::new(), RowId::new()]; - { - let chunk_sorted = Chunk::new( - ChunkId::new(), - entity_path.clone(), - Some(true), - row_ids.clone(), - timelines.clone().into_iter().collect(), - components.clone().into_iter().collect(), - )?; + let chunk_sorted = Chunk::builder(entity_path.clone()) + .with_sparse_component_batches( + RowId::new(), + [(timeline1, 1000), (timeline2, 42)], + [ + (MyPoint::name(), Some(&points1 as _)), + (MyColor::name(), Some(&colors1 as _)), + ], + ) + .with_sparse_component_batches( + RowId::new(), + [(timeline1, 1001), (timeline2, 43)], + [ + (MyPoint::name(), None), + (MyColor::name(), Some(&colors2 as _)), + ], + ) + 
.with_sparse_component_batches( + RowId::new(), + [(timeline1, 1002), (timeline2, 44)], + [ + (MyPoint::name(), Some(&points3 as _)), + (MyColor::name(), None), + ], + ) + .with_sparse_component_batches( + RowId::new(), + [(timeline1, 1003), (timeline2, 45)], + [ + (MyPoint::name(), Some(&points4 as _)), + (MyColor::name(), Some(&colors4 as _)), + ], + ) + .build()?; eprintln!("{chunk_sorted}"); @@ -329,4 +383,183 @@ mod tests { Ok(()) } + + #[test] + fn sort_time() -> anyhow::Result<()> { + let entity_path: EntityPath = "a/b/c".into(); + + let timeline1 = Timeline::new_temporal("log_time"); + let timeline2 = Timeline::new_sequence("frame_nr"); + + let chunk_id = ChunkId::new(); + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + let row_id4 = RowId::new(); + + let points1 = vec![ + MyPoint::new(1.0, 2.0), + MyPoint::new(3.0, 4.0), + MyPoint::new(5.0, 6.0), + ]; + let points3 = vec![MyPoint::new(10.0, 20.0)]; + let points4 = vec![MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)]; + + let colors1 = vec![ + MyColor::from_rgb(1, 2, 3), + MyColor::from_rgb(4, 5, 6), + MyColor::from_rgb(7, 8, 9), + ]; + let colors2 = vec![MyColor::from_rgb(10, 20, 30)]; + let colors4 = vec![ + MyColor::from_rgb(101, 102, 103), + MyColor::from_rgb(104, 105, 106), + ]; + + { + let chunk_unsorted_timeline2 = Chunk::builder_with_id(chunk_id, entity_path.clone()) + .with_sparse_component_batches( + row_id1, + [(timeline1, 1000), (timeline2, 45)], + [ + (MyPoint::name(), Some(&points1 as _)), + (MyColor::name(), Some(&colors1 as _)), + ], + ) + .with_sparse_component_batches( + row_id2, + [(timeline1, 1001), (timeline2, 44)], + [ + (MyPoint::name(), None), + (MyColor::name(), Some(&colors2 as _)), + ], + ) + .with_sparse_component_batches( + row_id3, + [(timeline1, 1002), (timeline2, 43)], + [ + (MyPoint::name(), Some(&points3 as _)), + (MyColor::name(), None), + ], + ) + .with_sparse_component_batches( + row_id4, + [(timeline1, 1003), (timeline2, 42)], + [ + (MyPoint::name(), Some(&points4 as _)), + (MyColor::name(), Some(&colors4 as _)), + ], + ) + .build()?; + + eprintln!("unsorted:\n{chunk_unsorted_timeline2}"); + + assert!(chunk_unsorted_timeline2.is_sorted()); + assert!(chunk_unsorted_timeline2.is_sorted_uncached()); + + assert!(chunk_unsorted_timeline2 + .timelines() + .get(&timeline1) + .unwrap() + .is_sorted()); + assert!(chunk_unsorted_timeline2 + .timelines() + .get(&timeline1) + .unwrap() + .is_sorted_uncached()); + + assert!(!chunk_unsorted_timeline2 + .timelines() + .get(&timeline2) + .unwrap() + .is_sorted()); + assert!(!chunk_unsorted_timeline2 + .timelines() + .get(&timeline2) + .unwrap() + .is_sorted_uncached()); + + let chunk_sorted_timeline2 = + chunk_unsorted_timeline2.sorted_by_timeline_if_unsorted(&timeline2); + + eprintln!("sorted:\n{chunk_sorted_timeline2}"); + + assert!(!chunk_sorted_timeline2.is_sorted()); + assert!(!chunk_sorted_timeline2.is_sorted_uncached()); + + assert!(!chunk_sorted_timeline2 + .timelines() + .get(&timeline1) + .unwrap() + .is_sorted()); + assert!(!chunk_sorted_timeline2 + .timelines() + .get(&timeline1) + .unwrap() + .is_sorted_uncached()); + + assert!(chunk_sorted_timeline2 + .timelines() + .get(&timeline2) + .unwrap() + .is_sorted()); + assert!(chunk_sorted_timeline2 + .timelines() + .get(&timeline2) + .unwrap() + .is_sorted_uncached()); + + let chunk_sorted_timeline2_expected = + Chunk::builder_with_id(chunk_id, entity_path.clone()) + .with_sparse_component_batches( + row_id4, + [(timeline1, 1003), (timeline2, 42)], + [ 
+ (MyPoint::name(), Some(&points4 as _)), + (MyColor::name(), Some(&colors4 as _)), + ], + ) + .with_sparse_component_batches( + row_id3, + [(timeline1, 1002), (timeline2, 43)], + [ + (MyPoint::name(), Some(&points3 as _)), + (MyColor::name(), None), + ], + ) + .with_sparse_component_batches( + row_id2, + [(timeline1, 1001), (timeline2, 44)], + [ + (MyPoint::name(), None), + (MyColor::name(), Some(&colors2 as _)), + ], + ) + .with_sparse_component_batches( + row_id1, + [(timeline1, 1000), (timeline2, 45)], + [ + (MyPoint::name(), Some(&points1 as _)), + (MyColor::name(), Some(&colors1 as _)), + ], + ) + .build()?; + + eprintln!("expected:\n{chunk_sorted_timeline2}"); + + assert_eq!( + chunk_sorted_timeline2_expected, + chunk_sorted_timeline2, + "{}", + similar_asserts::SimpleDiff::from_str( + &format!("{chunk_sorted_timeline2_expected}"), + &format!("{chunk_sorted_timeline2}"), + "got", + "expected", + ), + ); + } + + Ok(()) + } } diff --git a/crates/re_chunk/src/slice.rs b/crates/re_chunk/src/slice.rs new file mode 100644 index 000000000000..345b195efa41 --- /dev/null +++ b/crates/re_chunk/src/slice.rs @@ -0,0 +1,495 @@ +use arrow2::array::{ + Array as ArrowArray, BooleanArray as ArrowBooleanArray, ListArray, + PrimitiveArray as ArrowPrimitiveArray, StructArray, +}; + +use itertools::Itertools; +use nohash_hasher::IntSet; +use re_log_types::Timeline; +use re_types_core::ComponentName; + +use crate::{Chunk, ChunkTimeline}; + +// --- + +// NOTE: Not worth writing tests for all of these, until some subtle bug comes up. +// Most of them are indirectly stressed by our higher-level query tests anyhow. + +impl Chunk { + /// Slices the [`Chunk`] vertically. + /// + /// The result is a new [`Chunk`] with the same columns and (potentially) less rows. + /// + /// This cannot fail nor panic: `index` and `len` will be capped so that they cannot + /// run out of bounds. + /// This can result in an empty [`Chunk`] being returned if the slice is completely OOB. + #[inline] + pub fn row_sliced(&self, index: usize, len: usize) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; + + // NOTE: Bound checking costs are completely dwarfed by everything else, and preventing the + // viewer from crashing is more important than anything else in any case. + + if index >= self.num_rows() { + return self.emptied(); + } + + let end_offset = usize::min(index.saturating_add(len), self.num_rows()); + let len = end_offset.saturating_sub(index); + + if len == 0 { + return self.emptied(); + } + + let is_sorted = *is_sorted || (len < 2); + + let mut chunk = Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted, + row_ids: row_ids.clone().sliced(index, len), + timelines: timelines + .iter() + .map(|(timeline, time_chunk)| (*timeline, time_chunk.row_sliced(index, len))) + .collect(), + components: components + .iter() + .map(|(component_name, list_array)| { + (*component_name, list_array.clone().sliced(index, len)) + }) + .collect(), + }; + + // We can know for sure whether the resulting chunk is already sorted (see conditional + // above), but the reverse is not true. + // + // Consider e.g. 
slicing the following chunk on `(1..=3)`: + // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐ + // │ frame ┆ example.MyColor ┆ example.MyPoint │ + // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡ + // │ 3 ┆ [4278255873] ┆ - │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 2 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 3 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │ + // └──────────────┴───────────────────┴────────────────────────────────────────────┘ + // + // The original chunk is unsorted, but the new sliced one actually ends up being sorted. + chunk.is_sorted = is_sorted || chunk.is_sorted_uncached(); + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // debug-only + chunk.sanity_check().unwrap(); + + chunk + } + + /// Slices the [`Chunk`] horizontally by keeping only the selected `timeline`. + /// + /// The result is a new [`Chunk`] with the same rows and (at-most) one timeline column. + /// All non-timeline columns will be kept as-is. + /// + /// If `timeline` is not found within the [`Chunk`], the end result will be the same as the + /// current chunk but without any timeline column. + #[inline] + pub fn timeline_sliced(&self, timeline: Timeline) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; + + let chunk = Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted: *is_sorted, + row_ids: row_ids.clone(), + timelines: timelines + .get_key_value(&timeline) + .map(|(timeline, time_chunk)| (*timeline, time_chunk.clone())) + .into_iter() + .collect(), + components: components.clone(), + }; + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // debug-only + chunk.sanity_check().unwrap(); + + chunk + } + + /// Slices the [`Chunk`] horizontally by keeping only the selected `component_name`. + /// + /// The result is a new [`Chunk`] with the same rows and (at-most) one component column. + /// All non-component columns will be kept as-is. + /// + /// If `component_name` is not found within the [`Chunk`], the end result will be the same as the + /// current chunk but without any component column. + #[inline] + pub fn component_sliced(&self, component_name: ComponentName) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; + + let chunk = Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted: *is_sorted, + row_ids: row_ids.clone(), + timelines: timelines.clone(), + components: components + .get_key_value(&component_name) + .map(|(component_name, list_array)| (*component_name, list_array.clone())) + .into_iter() + .collect(), + }; + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // debug-only + chunk.sanity_check().unwrap(); + + chunk + } + + /// Slices the [`Chunk`] horizontally by keeping only the selected timelines. + /// + /// The result is a new [`Chunk`] with the same rows and (at-most) the selected timeline columns. 
+ /// All non-timeline columns will be kept as-is. + /// + /// If none of the selected timelines exist in the [`Chunk`], the end result will be the same as the + /// current chunk but without any timeline column. + #[inline] + pub fn timelines_sliced(&self, timelines_to_keep: &IntSet) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; + + let chunk = Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted: *is_sorted, + row_ids: row_ids.clone(), + timelines: timelines + .iter() + .filter(|(timeline, _)| timelines_to_keep.contains(timeline)) + .map(|(timeline, time_chunk)| (*timeline, time_chunk.clone())) + .collect(), + components: components.clone(), + }; + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // debug-only + chunk.sanity_check().unwrap(); + + chunk + } + + /// Slices the [`Chunk`] horizontally by keeping only the selected `component_names`. + /// + /// The result is a new [`Chunk`] with the same rows and (at-most) the selected component columns. + /// All non-component columns will be kept as-is. + /// + /// If none of the `component_names` exist in the [`Chunk`], the end result will be the same as the + /// current chunk but without any component column. + #[inline] + pub fn components_sliced(&self, component_names: &IntSet) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; + + let chunk = Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted: *is_sorted, + row_ids: row_ids.clone(), + timelines: timelines.clone(), + components: components + .iter() + .filter(|(component_name, _)| component_names.contains(component_name)) + .map(|(component_name, list_array)| (*component_name, list_array.clone())) + .collect(), + }; + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // debug-only + chunk.sanity_check().unwrap(); + + chunk + } + + /// Densifies the [`Chunk`] vertically based on the `component_name` column. + /// + /// Densifying here means dropping all rows where the associated value in the `component_name` + /// column is null. + /// + /// The result is a new [`Chunk`] where the `component_name` column is guaranteed to be dense. + /// + /// If `component_name` doesn't exist in this [`Chunk`], or if it is already dense, this method + /// is a no-op. 
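As a mental model, densifying on a component means dropping every row where that component's list column is null; `densified` below does this with an Arrow boolean filter built from the column's validity bitmap. A toy sketch of the semantics, using `Option`s in place of Arrow validity:

```rust
// Toy sketch: keep only the rows where the chosen component column has a value.
// The real implementation applies the same mask to row ids, every timeline and
// every component column so that all columns stay aligned.
fn densify<T: Clone>(component_column: &[Option<T>]) -> Vec<T> {
    component_column.iter().filter_map(|cell| cell.clone()).collect()
}

fn main() {
    let column = [Some("a"), None, Some("c"), None];
    assert_eq!(densify(&column), vec!["a", "c"]);
}
```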
+ #[inline] + pub fn densified(&self, component_name: ComponentName) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted, + row_ids, + timelines, + components, + } = self; + + if self.is_empty() { + return self.clone(); + } + + let Some(component_list_array) = components.get(&component_name) else { + return self.clone(); + }; + + let Some(validity) = component_list_array.validity() else { + return self.clone(); + }; + + let mask = validity.iter().collect_vec(); + let is_sorted = *is_sorted || (mask.iter().filter(|&&b| b).count() < 2); + let validity_filter = ArrowBooleanArray::from_slice(mask); + + let mut chunk = Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted, + row_ids: crate::util::filter_array(row_ids, &validity_filter), + timelines: timelines + .iter() + .map(|(&timeline, time_chunk)| (timeline, time_chunk.filtered(&validity_filter))) + .collect(), + components: components + .iter() + .map(|(&component_name, list_array)| { + ( + component_name, + crate::util::filter_array(list_array, &validity_filter), + ) + }) + .collect(), + }; + + // We can know for sure whether the resulting chunk is already sorted (see conditional + // above), but the reverse is not true. + // + // Consider e.g. densifying the following chunk on `example.MyPoint`: + // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐ + // │ frame ┆ example.MyColor ┆ example.MyPoint │ + // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡ + // │ 3 ┆ [4278255873] ┆ - │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │ + // └──────────────┴───────────────────┴────────────────────────────────────────────┘ + // + // The original chunk is unsorted, but the new filtered one actually ends up being sorted. + chunk.is_sorted = is_sorted || chunk.is_sorted_uncached(); + + #[cfg(debug_assertions)] + #[allow(clippy::unwrap_used)] // debug-only + chunk.sanity_check().unwrap(); + + chunk + } + + /// Empties the [`Chunk`] vertically. + /// + /// The result is a new [`Chunk`] with the same columns but zero rows. + #[inline] + pub fn emptied(&self) -> Self { + let Self { + id, + entity_path, + heap_size_bytes: _, + is_sorted: _, + row_ids, + timelines, + components, + } = self; + + Self { + id: *id, + entity_path: entity_path.clone(), + heap_size_bytes: Default::default(), + is_sorted: true, + row_ids: StructArray::new_empty(row_ids.data_type().clone()), + timelines: timelines + .iter() + .map(|(&timeline, time_chunk)| (timeline, time_chunk.emptied())) + .collect(), + components: components + .iter() + .map(|(&component_name, list_array)| { + ( + component_name, + ListArray::new_empty(list_array.data_type().clone()), + ) + }) + .collect(), + } + } +} + +impl ChunkTimeline { + /// Slices the [`ChunkTimeline`] vertically. + /// + /// The result is a new [`ChunkTimeline`] with the same timelines and (potentially) less rows. + /// + /// This cannot fail nor panic: `index` and `len` will be capped so that they cannot + /// run out of bounds. + /// This can result in an empty [`ChunkTimeline`] being returned if the slice is completely OOB. 
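Both `row_sliced` implementations clamp the requested window instead of panicking, which is what lets fully out-of-bounds requests degrade to empty chunks. A small sketch of that clamping:

```rust
// Sketch of the bound capping: the (index, len) request is clamped to the number
// of rows, and a window that starts past the end becomes an empty slice.
fn clamp_slice(num_rows: usize, index: usize, len: usize) -> (usize, usize) {
    if index >= num_rows {
        return (0, 0); // completely out of bounds: empty
    }
    let end = usize::min(index.saturating_add(len), num_rows);
    (index, end - index)
}

fn main() {
    assert_eq!(clamp_slice(10, 8, 5), (8, 2)); // capped at the end
    assert_eq!(clamp_slice(10, 12, 3), (0, 0)); // out of bounds
}
```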
+ #[inline] + pub fn row_sliced(&self, index: usize, len: usize) -> Self { + let Self { + timeline, + times, + is_sorted, + time_range: _, + } = self; + + // NOTE: Bound checking costs are completely dwarfed by everything else, and preventing the + // viewer from crashing is more important than anything else in any case. + + if index >= self.num_rows() { + return self.emptied(); + } + + let end_offset = usize::min(index.saturating_add(len), self.num_rows()); + let len = end_offset.saturating_sub(index); + + if len == 0 { + return self.emptied(); + } + + let is_sorted = *is_sorted || (len < 2); + + // We can know for sure whether the resulting chunk is already sorted (see conditional + // above), but the reverse is not true. + // + // Consider e.g. slicing the following chunk on `(1..=3)`: + // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐ + // │ frame ┆ example.MyColor ┆ example.MyPoint │ + // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡ + // │ 3 ┆ [4278255873] ┆ - │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 2 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 3 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │ + // └──────────────┴───────────────────┴────────────────────────────────────────────┘ + // + // The original chunk is unsorted, but the new sliced one actually ends up being sorted. + let is_sorted_opt = is_sorted.then_some(is_sorted); + + Self::new( + is_sorted_opt, + *timeline, + ArrowPrimitiveArray::sliced(times.clone(), index, len), + ) + } + + /// Empties the [`ChunkTimeline`] vertically. + /// + /// The result is a new [`ChunkTimeline`] with the same columns but zero rows. + #[inline] + pub fn emptied(&self) -> Self { + let Self { + timeline, + times, + is_sorted: _, + time_range: _, + } = self; + + Self::new( + Some(true), + *timeline, + ArrowPrimitiveArray::new_empty(times.data_type().clone()), + ) + } + + /// Runs a filter compute kernel on the time data with the specified `mask`. + #[inline] + pub(crate) fn filtered(&self, filter: &ArrowBooleanArray) -> Self { + let Self { + timeline, + times, + is_sorted, + time_range: _, + } = self; + + let is_sorted = *is_sorted || filter.values_iter().filter(|&b| b).count() < 2; + + // We can know for sure whether the resulting chunk is already sorted (see conditional + // above), but the reverse is not true. + // + // Consider e.g. 
densifying the following chunk on `example.MyPoint`: + // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐ + // │ frame ┆ example.MyColor ┆ example.MyPoint │ + // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡ + // │ 3 ┆ [4278255873] ┆ - │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │ + // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │ + // └──────────────┴───────────────────┴────────────────────────────────────────────┘ + // + // The original chunk is unsorted, but the new filtered one actually ends up being sorted. + let is_sorted_opt = is_sorted.then_some(is_sorted); + + Self::new( + is_sorted_opt, + *timeline, + crate::util::filter_array(times, filter), + ) + } +} diff --git a/crates/re_chunk/src/transport.rs b/crates/re_chunk/src/transport.rs index 977076a4081b..ca343bf04217 100644 --- a/crates/re_chunk/src/transport.rs +++ b/crates/re_chunk/src/transport.rs @@ -1,7 +1,10 @@ use std::collections::BTreeMap; use arrow2::{ - array::{Array as ArrowArray, PrimitiveArray as ArrowPrimitiveArray}, + array::{ + Array as ArrowArray, ListArray, PrimitiveArray as ArrowPrimitiveArray, + StructArray as ArrowStructArray, + }, chunk::Chunk as ArrowChunk, datatypes::{ DataType as ArrowDatatype, Field as ArrowField, Metadata as ArrowMetadata, @@ -9,10 +12,10 @@ use arrow2::{ }, }; -use re_log_types::{EntityPath, RowId, TimeInt, Timeline}; -use re_types_core::Loggable as _; +use re_log_types::{EntityPath, Timeline}; +use re_types_core::{Loggable as _, SizeBytes}; -use crate::{Chunk, ChunkError, ChunkId, ChunkResult, ChunkTimeline}; +use crate::{Chunk, ChunkError, ChunkId, ChunkResult, ChunkTimeline, RowId}; // --- @@ -53,6 +56,8 @@ impl std::fmt::Display for TransportChunk { } } +// TODO(#6572): Relying on Arrow's native schema metadata feature is bound to fail, we need to +// switch to something more powerful asap. impl TransportChunk { /// The key used to identify a Rerun [`ChunkId`] in chunk-level [`ArrowSchema`] metadata. pub const CHUNK_METADATA_KEY_ID: &'static str = "rerun.id"; @@ -60,6 +65,10 @@ impl TransportChunk { /// The key used to identify a Rerun [`EntityPath`] in chunk-level [`ArrowSchema`] metadata. pub const CHUNK_METADATA_KEY_ENTITY_PATH: &'static str = "rerun.entity_path"; + /// The key used to identify the size in bytes of the data, once loaded in memory, in chunk-level + /// [`ArrowSchema`] metadata. + pub const CHUNK_METADATA_KEY_HEAP_SIZE_BYTES: &'static str = "rerun.heap_size_bytes"; + /// The marker used to identify whether a chunk is sorted in chunk-level [`ArrowSchema`] metadata. /// /// The associated value is irrelevant -- if this marker is present, then it is true. @@ -103,6 +112,18 @@ impl TransportChunk { .into() } + /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for the in-memory size in bytes. + #[inline] + pub fn chunk_metadata_heap_size_bytes(heap_size_bytes: u64) -> ArrowMetadata { + [ + ( + Self::CHUNK_METADATA_KEY_HEAP_SIZE_BYTES.to_owned(), + heap_size_bytes.to_string(), + ), // + ] + .into() + } + /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for a Rerun [`EntityPath`]. 
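Chunk-level facts such as the id, entity path, sortedness marker and heap size travel as plain string key/value pairs in the Arrow schema metadata, as the constants above show. A minimal sketch of that round trip for the heap size, assuming a plain `BTreeMap<String, String>` in place of arrow2's `Metadata`:

```rust
use std::collections::BTreeMap;

// Sketch only: serialize the in-memory size as a decimal string under the
// "rerun.heap_size_bytes" key, and parse it back leniently on the way in.
fn write_heap_size(metadata: &mut BTreeMap<String, String>, heap_size_bytes: u64) {
    metadata.insert("rerun.heap_size_bytes".to_owned(), heap_size_bytes.to_string());
}

fn read_heap_size(metadata: &BTreeMap<String, String>) -> Option<u64> {
    metadata.get("rerun.heap_size_bytes").and_then(|s| s.parse::<u64>().ok())
}

fn main() {
    let mut metadata = BTreeMap::new();
    write_heap_size(&mut metadata, 1024);
    assert_eq!(read_heap_size(&metadata), Some(1024));
}
```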
#[inline] pub fn chunk_metadata_entity_path(entity_path: &EntityPath) -> ArrowMetadata { @@ -179,19 +200,18 @@ impl TransportChunk { impl TransportChunk { #[inline] pub fn id(&self) -> ChunkResult { - match self.schema.metadata.get(Self::CHUNK_METADATA_KEY_ID) { - Some(id) => { - let id = u128::from_str_radix(id, 16).map_err(|err| ChunkError::Malformed { - reason: format!("cannot deserialize chunk id: {err}"), - })?; - Ok(ChunkId::from_u128(id)) - } - None => Err(crate::ChunkError::Malformed { + if let Some(id) = self.schema.metadata.get(Self::CHUNK_METADATA_KEY_ID) { + let id = u128::from_str_radix(id, 16).map_err(|err| ChunkError::Malformed { + reason: format!("cannot deserialize chunk id: {err}"), + })?; + Ok(ChunkId::from_u128(id)) + } else { + Err(crate::ChunkError::Malformed { reason: format!( "chunk id missing from metadata ({:?})", self.schema.metadata ), - }), + }) } } @@ -212,6 +232,14 @@ impl TransportChunk { } } + #[inline] + pub fn heap_size_bytes(&self) -> Option { + self.schema + .metadata + .get(Self::CHUNK_METADATA_KEY_HEAP_SIZE_BYTES) + .and_then(|s| s.parse::().ok()) + } + /// Looks in the chunk metadata for the `IS_SORTED` marker. /// /// It is possible that a chunk is sorted but didn't set that marker. @@ -308,6 +336,7 @@ impl Chunk { let Self { id, entity_path, + heap_size_bytes: _, // use the method instead because of lazy initialization is_sorted, row_ids, timelines, @@ -329,6 +358,12 @@ impl Chunk { .metadata .extend(TransportChunk::chunk_metadata_entity_path(entity_path)); + schema + .metadata + .extend(TransportChunk::chunk_metadata_heap_size_bytes( + self.heap_size_bytes(), + )); + if *is_sorted { schema .metadata @@ -340,7 +375,6 @@ impl Chunk { { re_tracing::profile_scope!("row ids"); - let row_ids = RowId::to_arrow(row_ids)?; schema.fields.push( ArrowField::new( RowId::name().to_string(), @@ -349,7 +383,7 @@ impl Chunk { ) .with_metadata(TransportChunk::field_metadata_control_column()), ); - columns.push(row_ids); + columns.push(row_ids.clone().boxed()); } // Timelines @@ -358,21 +392,12 @@ impl Chunk { for (timeline, info) in timelines { let ChunkTimeline { + timeline: _, times, is_sorted, time_range: _, } = info; - let times = { - let values = times.iter().map(|time| time.as_i64()).collect(); - ArrowPrimitiveArray::new( - arrow2::types::PrimitiveType::Int64.into(), - values, - None, - ) - .to(timeline.datatype()) - }; - let field = ArrowField::new( timeline.name().to_string(), times.data_type().clone(), @@ -387,7 +412,7 @@ impl Chunk { }); schema.fields.push(field); - columns.push(Box::new(times)); + columns.push(times.clone().boxed() /* cheap */); } } @@ -400,7 +425,7 @@ impl Chunk { ArrowField::new(component_name.to_string(), data.data_type().clone(), true) .with_metadata(TransportChunk::field_metadata_data_column()), ); - columns.push(data.clone() /* refcounted (dyn Clone) */); + columns.push(data.clone().boxed()); } } @@ -410,34 +435,46 @@ impl Chunk { }) } - pub fn from_transport(chunk: &TransportChunk) -> ChunkResult { + pub fn from_transport(transport: &TransportChunk) -> ChunkResult { re_tracing::profile_function!(format!( "num_columns={} num_rows={}", - chunk.num_columns(), - chunk.num_rows() + transport.num_columns(), + transport.num_rows() )); // Metadata let (id, entity_path, is_sorted) = { re_tracing::profile_scope!("metadata"); - (chunk.id()?, chunk.entity_path()?, chunk.is_sorted()) + ( + transport.id()?, + transport.entity_path()?, + transport.is_sorted(), + ) }; // Row IDs let row_ids = { re_tracing::profile_scope!("row ids"); - let 
Some(column) = chunk.controls().find_map(|(field, column)| { + let Some(row_ids) = transport.controls().find_map(|(field, column)| { (field.name == RowId::name().as_str()).then_some(column) }) else { return Err(ChunkError::Malformed { - reason: format!("missing row_id column ({:?})", chunk.schema), + reason: format!("missing row_id column ({:?})", transport.schema), }); }; - RowId::from_arrow(&**column).map_err(|err| ChunkError::Malformed { - reason: format!("row_id column is not deserializable: {err}"), - })? + row_ids + .as_any() + .downcast_ref::() + .ok_or_else(|| ChunkError::Malformed { + reason: format!( + "RowId data has the wrong datatype: expected {:?} but got {:?} instead", + RowId::arrow_datatype(), + *row_ids.data_type(), + ), + })? + .clone() }; // Timelines @@ -446,7 +483,7 @@ impl Chunk { let mut timelines = BTreeMap::default(); - for (field, column) in chunk.timelines() { + for (field, column) in transport.timelines() { // See also [`Timeline::datatype`] let timeline = match column.data_type().to_logical_type() { ArrowDatatype::Int64 => Timeline::new_sequence(field.name.as_str()), @@ -492,22 +529,16 @@ impl Chunk { let time_chunk = ChunkTimeline::new( is_sorted.then_some(true), - times - .values_iter() - .copied() - .map(TimeInt::new_temporal) - .collect(), + timeline, + times.clone(), /* cheap */ ); - - if let Some(time_chunk) = time_chunk { - if timelines.insert(timeline, time_chunk).is_some() { - return Err(ChunkError::Malformed { - reason: format!( - "time column '{}' was specified more than once", - field.name, - ), - }); - } + if timelines.insert(timeline, time_chunk).is_some() { + return Err(ChunkError::Malformed { + reason: format!( + "time column '{}' was specified more than once", + field.name, + ), + }); } } @@ -518,16 +549,16 @@ impl Chunk { let components = { let mut components = BTreeMap::default(); - for (field, column) in chunk.components() { - if !matches!(column.data_type(), ArrowDatatype::List(_)) { - return Err(ChunkError::Malformed { + for (field, column) in transport.components() { + let column = column + .as_any() + .downcast_ref::>() + .ok_or_else(|| ChunkError::Malformed { reason: format!( - "component column '{}' is not deserializable ({:?})", - field.name, - column.data_type() + "The outer array in a chunked component batch must be a sparse list, got {:?}", + column.data_type(), ), - }); - } + })?; if components .insert( @@ -548,14 +579,52 @@ impl Chunk { components }; - Self::new( + let mut res = Self::new( id, entity_path, is_sorted.then_some(true), row_ids, timelines, components, - ) + )?; + + if let Some(heap_size_bytes) = transport.heap_size_bytes() { + res.heap_size_bytes = heap_size_bytes.into(); + } + + Ok(res) + } +} + +impl Chunk { + #[inline] + pub fn from_arrow_msg(msg: &re_log_types::ArrowMsg) -> ChunkResult { + let re_log_types::ArrowMsg { + chunk_id: _, + timepoint_max: _, + schema, + chunk, + on_release: _, + } = msg; + + Self::from_transport(&TransportChunk { + schema: schema.clone(), + data: chunk.clone(), + }) + } + + #[inline] + pub fn to_arrow_msg(&self) -> ChunkResult { + self.sanity_check()?; + + let transport = self.to_transport()?; + Ok(re_log_types::ArrowMsg { + chunk_id: re_tuid::Tuid::from_u128(self.id().as_u128()), + timepoint_max: self.timepoint_max(), + schema: transport.schema, + chunk: transport.data, + on_release: None, + }) } } @@ -563,11 +632,9 @@ impl Chunk { mod tests { use re_log_types::{ example_components::{MyColor, MyPoint}, - TimeInt, Timeline, + Timeline, }; - use crate::arrays_to_list_array; - use 
super::*; #[test] @@ -579,9 +646,9 @@ mod tests { timeline1, ChunkTimeline::new( Some(true), - [42, 43, 44, 45].map(TimeInt::new_temporal).to_vec(), - ) - .unwrap(), + timeline1, + ArrowPrimitiveArray::::from_vec(vec![42, 43, 44, 45]), + ), )) .collect(); @@ -607,7 +674,7 @@ mod tests { let components = [ (MyPoint::name(), { - let list_array = arrays_to_list_array(&[ + let list_array = crate::util::arrays_to_list_array_opt(&[ Some(&*points1), points2, Some(&*points3), @@ -618,9 +685,13 @@ mod tests { list_array }), (MyPoint::name(), { - let list_array = - arrays_to_list_array(&[Some(&*colors1), Some(&*colors2), colors3, colors4]) - .unwrap(); + let list_array = crate::util::arrays_to_list_array_opt(&[ + Some(&*colors1), + Some(&*colors2), + colors3, + colors4, + ]) + .unwrap(); assert_eq!(4, list_array.len()); list_array }), @@ -629,11 +700,11 @@ mod tests { let row_ids = vec![RowId::new(), RowId::new(), RowId::new(), RowId::new()]; for timelines in [timelines1, timelines2] { - let chunk_original = Chunk::new( + let chunk_original = Chunk::from_native_row_ids( ChunkId::new(), entity_path.clone(), None, - row_ids.clone(), + &row_ids, timelines.clone(), components.clone().into_iter().collect(), )?; @@ -651,6 +722,10 @@ mod tests { chunk_in_transport.entity_path()?, *chunk_after.entity_path() ); + assert_eq!( + chunk_in_transport.heap_size_bytes(), + Some(chunk_after.heap_size_bytes()), + ); assert_eq!( chunk_in_transport.num_columns(), chunk_original.num_columns() @@ -684,9 +759,9 @@ mod tests { chunk_after.num_components() ); - // eprintln!("{chunk_before}"); + eprintln!("{chunk_before}"); eprintln!("{chunk_in_transport}"); - // eprintln!("{chunk_after}"); + eprintln!("{chunk_after}"); assert_eq!(chunk_before, chunk_after); diff --git a/crates/re_chunk/src/util.rs b/crates/re_chunk/src/util.rs index 1fe1356ffbd2..092d3ff8e943 100644 --- a/crates/re_chunk/src/util.rs +++ b/crates/re_chunk/src/util.rs @@ -1,6 +1,7 @@ use arrow2::{ - array::{Array as ArrowArray, ListArray as ArrowListArray}, + array::{Array as ArrowArray, BooleanArray as ArrowBooleanArray, ListArray as ArrowListArray}, bitmap::Bitmap as ArrowBitmap, + datatypes::DataType as ArrowDataType, offset::Offsets as ArrowOffsets, }; use itertools::Itertools as _; @@ -12,29 +13,41 @@ use itertools::Itertools as _; /// All arrays must have the same datatype. /// /// Returns `None` if `arrays` is empty. -pub fn arrays_to_list_array(arrays: &[Option<&dyn ArrowArray>]) -> Option> { +#[inline] +pub fn arrays_to_list_array_opt(arrays: &[Option<&dyn ArrowArray>]) -> Option> { + let datatype = arrays + .iter() + .flatten() + .map(|array| array.data_type().clone()) + .next()?; + arrays_to_list_array(datatype, arrays) +} + +/// Create a sparse list-array out of an array of arrays. +/// +/// Returns `None` if any of the specified `arrays` doesn't match the given `array_datatype`. +/// +/// Returns an empty list if `arrays` is empty. +pub fn arrays_to_list_array( + array_datatype: ArrowDataType, + arrays: &[Option<&dyn ArrowArray>], +) -> Option> { let arrays_dense = arrays.iter().flatten().copied().collect_vec(); - if arrays_dense.is_empty() { - return None; - } + let data = if arrays_dense.is_empty() { + arrow2::array::new_empty_array(array_datatype.clone()) + } else { + arrow2::compute::concatenate::concatenate(&arrays_dense) + .map_err(|err| { + re_log::warn_once!("failed to concatenate arrays: {err}"); + err + }) + .ok()? 
+    };
 
-    let data = arrow2::compute::concatenate::concatenate(&arrays_dense)
-        .map_err(|err| {
-            re_log::warn_once!("failed to concatenate arrays: {err}");
-            err
-        })
-        .ok()?;
-
-    let datatype = arrays_dense
-        .first()
-        .map(|array| array.data_type().clone())?;
-    debug_assert!(arrays_dense
-        .iter()
-        .all(|array| *array.data_type() == datatype));
-    let datatype = ArrowListArray::<i32>::default_datatype(datatype);
+    let datatype = ArrowListArray::<i32>::default_datatype(array_datatype);
 
-    #[allow(clippy::unwrap_used)] // yes, there are indeed lengths
+    #[allow(clippy::unwrap_used)] // yes, these are indeed lengths
     let offsets = ArrowOffsets::try_from_lengths(
         arrays
             .iter()
@@ -45,5 +58,57 @@ pub fn arrays_to_list_array(arrays: &[Option<&dyn ArrowArray>]) -> Option<Box<dyn ArrowArray>> {
-    Some(ArrowListArray::<i32>::new(datatype, offsets.into(), data, validity.into()).boxed())
+    Some(ArrowListArray::<i32>::new(
+        datatype,
+        offsets.into(),
+        data,
+        validity.into(),
+    ))
+}
+
+/// Given a sparse `ArrowListArray` (i.e. an array with a validity bitmap that contains at least
+/// one falsy value), returns a dense `ArrowListArray` that only contains the non-null values from
+/// the original list.
+///
+/// This is a no-op if the original array is already dense.
+pub fn sparse_list_array_to_dense_list_array(
+    list_array: &ArrowListArray<i32>,
+) -> ArrowListArray<i32> {
+    if list_array.is_empty() {
+        return list_array.clone();
+    }
+
+    let is_empty = list_array
+        .validity()
+        .map_or(false, |validity| validity.is_empty());
+    if is_empty {
+        return list_array.clone();
+    }
+
+    #[allow(clippy::unwrap_used)] // yes, these are indeed lengths
+    let offsets =
+        ArrowOffsets::try_from_lengths(list_array.iter().flatten().map(|array| array.len()))
+            .unwrap();
+
+    ArrowListArray::<i32>::new(
+        list_array.data_type().clone(),
+        offsets.into(),
+        list_array.values().clone(),
+        None,
+    )
+}
+
+/// Applies a filter kernel to the given `array`.
+///
+/// Takes care of up- and down-casting the data back and forth on behalf of the caller.
+pub fn filter_array<A: ArrowArray + Clone>(array: &A, filter: &ArrowBooleanArray) -> A {
+    #[allow(clippy::unwrap_used)]
+    arrow2::compute::filter::filter(array, filter)
+        // Unwrap: this literally cannot fail.
+        .unwrap()
+        .as_any()
+        .downcast_ref::<A>()
+        // Unwrap: that's the initial type we got.
+ .unwrap() + .clone() } diff --git a/crates/re_chunk/tests/latest_at.rs b/crates/re_chunk/tests/latest_at.rs new file mode 100644 index 000000000000..596dc21e3a5e --- /dev/null +++ b/crates/re_chunk/tests/latest_at.rs @@ -0,0 +1,501 @@ +use arrow2::datatypes::DataType as ArrowDatatype; +use nohash_hasher::IntMap; + +use re_chunk::{Chunk, ComponentName, LatestAtQuery, RowId, TimePoint, Timeline}; +use re_log_types::example_components::{MyColor, MyLabel, MyPoint}; +use re_types_core::Loggable; + +// --- + +const ENTITY_PATH: &str = "my/entity"; + +fn datatypes() -> IntMap { + [ + (MyPoint::name(), MyPoint::arrow_datatype()), + (MyColor::name(), MyColor::arrow_datatype()), + (MyLabel::name(), MyLabel::arrow_datatype()), + ] + .into_iter() + .collect() +} + +#[test] +fn temporal_sorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint1 = [ + (Timeline::log_time(), 1000), + (Timeline::new_sequence("frame"), 1), + ]; + let timepoint2 = [ + (Timeline::log_time(), 1032), + (Timeline::new_sequence("frame"), 3), + ]; + let timepoint3 = [ + (Timeline::log_time(), 1064), + (Timeline::new_sequence("frame"), 5), + ]; + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id1, timepoint1, [points1 as _]) + .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _]) + .with_component_batches(row_id3, timepoint3, [points3 as _]) + .build()?; + + { + let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 2); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id1, + timepoint1, + [ + (MyPoint::name(), Some(points1 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = chunk.emptied(); + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = chunk.emptied(); + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + { + let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 4); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id1, + timepoint1, + [ + (MyPoint::name(), Some(points1 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), 
&chunk, &expected); + } + { + let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 6); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id3, + timepoint3, + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +#[test] +fn temporal_unsorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint1 = [ + (Timeline::log_time(), 1000), + (Timeline::new_sequence("frame"), 1), + ]; + let timepoint2 = [ + (Timeline::log_time(), 1032), + (Timeline::new_sequence("frame"), 3), + ]; + let timepoint3 = [ + (Timeline::log_time(), 1064), + (Timeline::new_sequence("frame"), 5), + ]; + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _]) + .with_component_batches(row_id1, timepoint1, [points1 as _]) + .with_component_batches(row_id3, timepoint3, [points3 as _]) + .build()?; + + { + let query = LatestAtQuery::new(Timeline::log_time(), 1000); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id1, + timepoint1, + [ + (MyPoint::name(), Some(points1 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = chunk.emptied(); + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = chunk.emptied(); + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + { + let query = LatestAtQuery::new(Timeline::log_time(), 1050); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id1, + timepoint1, + [ + (MyPoint::name(), Some(points1 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + 
query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + { + let query = LatestAtQuery::new(Timeline::log_time(), 1100); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id3, + timepoint3, + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +#[test] +fn static_sorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint = TimePoint::default(); + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id1, timepoint.clone(), [points1 as _]) + .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _]) + .with_component_batches(row_id3, timepoint.clone(), [points3 as _]) + .build()?; + + for frame_nr in [2, 4, 6] { + let query = LatestAtQuery::new(Timeline::new_sequence("frame"), frame_nr); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id3, + timepoint.clone(), + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), 
&query), &chunk, &expected); + } + + Ok(()) +} + +#[test] +fn static_unsorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint = TimePoint::default(); + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id3, timepoint.clone(), [points3 as _]) + .with_component_batches(row_id1, timepoint.clone(), [points1 as _]) + .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _]) + .build()?; + + for log_time in [1000, 1050, 1100] { + let query = LatestAtQuery::new(Timeline::log_time(), log_time); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id3, + timepoint.clone(), + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +// --- + +fn query_and_compare( + (component_name, query): (ComponentName, &LatestAtQuery), + chunk: &Chunk, + expected: &Chunk, +) { + re_log::setup_logging(); + + let results = chunk.latest_at(query, component_name); + + eprintln!("Query: {component_name} @ {query:?}"); + eprintln!("Data:\n{chunk}"); + eprintln!("Expected:\n{expected}"); + eprintln!("Results:\n{results}"); + + assert_eq!( + *expected, + results, + "{}", + similar_asserts::SimpleDiff::from_str( + &format!("{results}"), + &format!("{expected}"), + // &format!("{results:#?}"), + // &format!("{expected:#?}"), + "got", + "expected", + ), + ); +} diff --git a/crates/re_chunk/tests/range.rs b/crates/re_chunk/tests/range.rs new file mode 100644 index 000000000000..8432b0b7c161 --- /dev/null +++ b/crates/re_chunk/tests/range.rs @@ -0,0 +1,475 @@ +use arrow2::datatypes::DataType as ArrowDatatype; +use nohash_hasher::IntMap; + +use re_chunk::{Chunk, ComponentName, RangeQuery, RowId, TimePoint, Timeline}; +use re_log_types::{ + example_components::{MyColor, MyLabel, MyPoint}, + ResolvedTimeRange, +}; +use re_types_core::Loggable as _; + +// --- + +const ENTITY_PATH: &str = "my/entity"; + +fn datatypes() -> IntMap { + [ + (MyPoint::name(), MyPoint::arrow_datatype()), + (MyColor::name(), MyColor::arrow_datatype()), + (MyLabel::name(), MyLabel::arrow_datatype()), + ] + .into_iter() + .collect() +} + +#[test] +fn temporal_sorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + 
let row_id3 = RowId::new(); + + let timepoint1 = [ + (Timeline::log_time(), 1000), + (Timeline::new_sequence("frame"), 1), + ]; + let timepoint2 = [ + (Timeline::log_time(), 1032), + (Timeline::new_sequence("frame"), 3), + ]; + let timepoint3 = [ + (Timeline::log_time(), 1064), + (Timeline::new_sequence("frame"), 5), + ]; + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id1, timepoint1, [points1 as _]) + .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _]) + .with_component_batches(row_id3, timepoint3, [points3 as _]) + .build()?; + + { + let query = RangeQuery::new( + Timeline::new_sequence("frame"), + ResolvedTimeRange::EVERYTHING, + ); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id1, + timepoint1, + [ + (MyPoint::name(), Some(points1 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .with_sparse_component_batches( + row_id3, + timepoint3, + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + { + let query = RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050)); + + let expected = chunk.emptied(); + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +#[test] +fn temporal_unsorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint1 = [ + (Timeline::log_time(), 1000), + (Timeline::new_sequence("frame"), 1), + ]; + let timepoint2 = [ + (Timeline::log_time(), 1032), + (Timeline::new_sequence("frame"), 3), + ]; + let 
timepoint3 = [ + (Timeline::log_time(), 1064), + (Timeline::new_sequence("frame"), 5), + ]; + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _]) + .with_component_batches(row_id1, timepoint1, [points1 as _]) + .with_component_batches(row_id3, timepoint3, [points3 as _]) + .build()?; + + { + let query = RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::EVERYTHING); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id1, + timepoint1, + [ + (MyPoint::name(), Some(points1 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .with_sparse_component_batches( + row_id3, + timepoint3, + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + { + let query = RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050)); + + let expected = chunk.emptied(); + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint2, + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +#[test] +fn static_sorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint = TimePoint::default(); + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id1, timepoint.clone(), 
[points1 as _]) + .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _]) + .with_component_batches(row_id3, timepoint.clone(), [points3 as _]) + .build()?; + + let queries = [ + RangeQuery::new( + Timeline::new_sequence("frame"), + ResolvedTimeRange::EVERYTHING, + ), + RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050)), + ]; + + for query in queries { + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id3, + timepoint.clone(), + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +#[test] +fn static_unsorted() -> anyhow::Result<()> { + let row_id1 = RowId::new(); + let row_id2 = RowId::new(); + let row_id3 = RowId::new(); + + let timepoint = TimePoint::default(); + + let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)]; + let points3 = &[ + MyPoint::new(3.0, 3.0), + MyPoint::new(4.0, 4.0), + MyPoint::new(5.0, 5.0), + ]; + + let colors2 = &[MyColor::from_rgb(1, 1, 1)]; + + let labels2 = &[ + MyLabel("a".into()), + MyLabel("b".into()), + MyLabel("c".into()), + ]; + + let chunk = Chunk::builder(ENTITY_PATH.into()) + .with_component_batches(row_id3, timepoint.clone(), [points3 as _]) + .with_component_batches(row_id1, timepoint.clone(), [points1 as _]) + .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _]) + .build()?; + + let queries = [ + RangeQuery::new( + Timeline::new_sequence("frame"), + ResolvedTimeRange::EVERYTHING, + ), + RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050)), + ]; + + for query in queries { + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id3, + timepoint.clone(), + [ + (MyPoint::name(), Some(points3 as _)), + (MyColor::name(), None), + (MyLabel::name(), None), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyPoint::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + query_and_compare((MyColor::name(), &query), &chunk, &expected); + + let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into()) + .with_sparse_component_batches( + row_id2, + timepoint.clone(), + [ + (MyPoint::name(), None), + (MyColor::name(), Some(colors2 as _)), + (MyLabel::name(), Some(labels2 as _)), + ], + ) + .build_with_datatypes(&datatypes())?; + 
query_and_compare((MyLabel::name(), &query), &chunk, &expected); + } + + Ok(()) +} + +// --- + +fn query_and_compare( + (component_name, query): (ComponentName, &RangeQuery), + chunk: &Chunk, + expected: &Chunk, +) { + re_log::setup_logging(); + + let results = chunk.range(query, component_name); + + eprintln!("Query: {component_name} @ {query:?}"); + eprintln!("Data:\n{chunk}"); + eprintln!("Expected:\n{expected}"); + eprintln!("Results:\n{results}"); + + assert_eq!( + *expected, + results, + "{}", + similar_asserts::SimpleDiff::from_str( + &format!("{results}"), + &format!("{expected}"), + // &format!("{results:#?}"), + // &format!("{expected:#?}"), + "got", + "expected", + ), + ); +} diff --git a/crates/re_data_store/Cargo.toml b/crates/re_chunk_store/Cargo.toml similarity index 81% rename from crates/re_data_store/Cargo.toml rename to crates/re_chunk_store/Cargo.toml index ef5937d27e79..43fb1a162d75 100644 --- a/crates/re_data_store/Cargo.toml +++ b/crates/re_chunk_store/Cargo.toml @@ -1,7 +1,7 @@ [package] -name = "re_data_store" +name = "re_chunk_store" authors.workspace = true -description = "An in-memory time series database for Rerun log data, based on Apache Arrow" +description = "A storage engine for Rerun's Chunks" edition.workspace = true homepage.workspace = true include.workspace = true @@ -28,6 +28,7 @@ deadlock_detection = ["parking_lot/deadlock_detection"] [dependencies] # Rerun dependencies: +re_chunk.workspace = true re_format.workspace = true re_format_arrow.workspace = true re_log = { workspace = true, features = ["setup"] } @@ -60,23 +61,3 @@ mimalloc.workspace = true rand = { workspace = true, features = ["std", "std_rng"] } similar-asserts.workspace = true tinyvec.workspace = true - -[lib] -bench = false - - -[[bench]] -name = "arrow2" -harness = false - -[[bench]] -name = "data_store" -harness = false - -[[bench]] -name = "gc" -harness = false - -[[bench]] -name = "vectors" -harness = false diff --git a/crates/re_chunk_store/README.md b/crates/re_chunk_store/README.md new file mode 100644 index 000000000000..35800660488a --- /dev/null +++ b/crates/re_chunk_store/README.md @@ -0,0 +1,12 @@ +# Rerun chunk store + +Part of the [`rerun`](https://github.com/rerun-io/rerun) family of crates. + +[![Latest version](https://img.shields.io/crates/v/re_chunk_store.svg)](https://crates.io/crates/re_chunk_store?speculative-link) +[![Documentation](https://docs.rs/re_chunk_store/badge.svg)](https://docs.rs/re_chunk_store?speculative-link) +![MIT](https://img.shields.io/badge/license-MIT-blue.svg) +![Apache](https://img.shields.io/badge/license-Apache-blue.svg) + +[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data. + +The `re_chunk_store` crate is an in-memory time series database for Rerun log data. It is indexed by Entity path, component, timeline, and time. It supports out-of-order insertions, and fast `O(log(N))` queries. 
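Not part of the diff itself: below is a minimal usage sketch of the new chunk store, pieced together from the APIs exercised by this changeset's tests and the `gc` module (`Chunk::builder`, `ChunkStore::insert_chunk`, `ChunkStore::gc`). The entity path, timeline values, and the `MyPoint` test component are placeholders borrowed from the test fixtures, and it assumes the types are re-exported at the crate roots the way the tests suggest.

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId, Timeline};
use re_chunk_store::{ChunkStore, GarbageCollectionOptions};
use re_log_types::{example_components::MyPoint, StoreId, StoreKind};

fn main() -> anyhow::Result<()> {
    // A fresh store for a recording; the second argument is the store configuration,
    // left at its default here.
    let mut store = ChunkStore::new(StoreId::random(StoreKind::Recording), Default::default());

    // A chunk holding one row of points, stamped on both the `log_time` and `frame` timelines.
    let points = vec![MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
    let chunk = Chunk::builder("my/entity".into())
        .with_component_batches(
            RowId::new(),
            [(Timeline::log_time(), 1000), (Timeline::new_sequence("frame"), 1)],
            [&points as _],
        )
        .build()?;

    // Inserting a chunk yields the corresponding `ChunkStoreEvent` (if any),
    // which is what store subscribers react to.
    let _event = store.insert_chunk(&Arc::new(chunk))?;

    // Garbage collection drops whole chunks and reports the deletions as events,
    // together with the difference in store stats.
    let (_deletion_events, _stats_diff) = store.gc(&GarbageCollectionOptions::gc_everything());

    Ok(())
}
```

Returning `ChunkStoreEvent`s from both insertion and GC is what lets downstream subscribers (see `ChunkStoreSubscriber` in the events module below) keep any derived indices in sync with the store.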
diff --git a/crates/re_data_store/src/store_event.rs b/crates/re_chunk_store/src/events.rs similarity index 59% rename from crates/re_data_store/src/store_event.rs rename to crates/re_chunk_store/src/events.rs index 5373d4a9d375..c021444f1569 100644 --- a/crates/re_data_store/src/store_event.rs +++ b/crates/re_chunk_store/src/events.rs @@ -1,52 +1,52 @@ -use nohash_hasher::IntMap; +use std::sync::Arc; -use re_log_types::{DataCell, EntityPath, RowId, StoreId, TimeInt, TimePoint, Timeline}; -use re_types_core::ComponentName; +use re_chunk::Chunk; +use re_log_types::StoreId; -use crate::StoreGeneration; +use crate::ChunkStoreGeneration; // Used all over in docstrings. #[allow(unused_imports)] -use crate::{DataStore, StoreSubscriber}; +use crate::{ChunkId, ChunkStore, ChunkStoreSubscriber, RowId}; // --- -/// The atomic unit of change in the Rerun [`DataStore`]. +/// The atomic unit of change in the Rerun [`ChunkStore`]. /// -/// A [`StoreEvent`] describes the changes caused by the addition or deletion of a -/// [`re_log_types::DataRow`] in the store. +/// A [`ChunkStoreEvent`] describes the changes caused by the addition or deletion of a +/// [`Chunk`] in the store. /// -/// Methods that mutate the [`DataStore`], such as [`DataStore::insert_row`] and [`DataStore::gc`], -/// return [`StoreEvent`]s that describe the changes. -/// You can also register your own [`StoreSubscriber`] in order to be notified of changes as soon as they +/// Methods that mutate the [`ChunkStore`], such as [`ChunkStore::insert_chunk`] and [`ChunkStore::gc`], +/// return [`ChunkStoreEvent`]s that describe the changes. +/// You can also register your own [`ChunkStoreSubscriber`] in order to be notified of changes as soon as they /// happen. /// -/// Refer to field-level documentation for more details and check out [`StoreDiff`] for a precise +/// Refer to field-level documentation for more details and check out [`ChunkStoreDiff`] for a precise /// definition of what an event involves. -#[derive(Debug, Clone, PartialEq)] -pub struct StoreEvent { - /// Which [`DataStore`] sent this event? +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ChunkStoreEvent { + /// Which [`ChunkStore`] sent this event? pub store_id: StoreId, /// What was the store's generation when it sent that event? - pub store_generation: StoreGeneration, + pub store_generation: ChunkStoreGeneration, /// Monotonically increasing ID of the event. /// /// This is on a per-store basis. /// - /// When handling a [`StoreEvent`], if this is the first time you process this [`StoreId`] and + /// When handling a [`ChunkStoreEvent`], if this is the first time you process this [`StoreId`] and /// the associated `event_id` is not `1`, it means you registered late and missed some updates. pub event_id: u64, /// What actually changed? /// - /// Refer to [`StoreDiff`] for more information. - pub diff: StoreDiff, + /// Refer to [`ChunkStoreDiff`] for more information. + pub diff: ChunkStoreDiff, } -impl std::ops::Deref for StoreEvent { - type Target = StoreDiff; +impl std::ops::Deref for ChunkStoreEvent { + type Target = ChunkStoreDiff; #[inline] fn deref(&self) -> &Self::Target { @@ -58,19 +58,19 @@ impl std::ops::Deref for StoreEvent { /// /// Reminder: ⚠ Do not confuse _a deletion_ and _a clear_ ⚠. /// -/// A deletion is the result of a row being completely removed from the store as part of the +/// A deletion is the result of a chunk being completely removed from the store as part of the /// garbage collection process. 
/// /// A clear, on the other hand, is the act of logging an empty [`re_types_core::ComponentBatch`], /// either directly using the logging APIs, or indirectly through the use of a /// [`re_types_core::archetypes::Clear`] archetype. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum StoreDiffKind { +pub enum ChunkStoreDiffKind { Addition, Deletion, } -impl StoreDiffKind { +impl ChunkStoreDiffKind { #[inline] pub fn delta(&self) -> i64 { match self { @@ -80,118 +80,69 @@ impl StoreDiffKind { } } -/// Describes an atomic change in the Rerun [`DataStore`]: a row has been added or deleted. +/// Describes an atomic change in the Rerun [`ChunkStore`]: a chunk has been added or deleted. /// -/// From a query model standpoint, the [`DataStore`] _always_ operates one row at a time: -/// - The contents of a row (i.e. its columns) are immutable past insertion, by virtue of -/// [`RowId`]s being unique and non-reusable. -/// - Similarly, garbage collection always removes _all the data_ associated with a row in one go: -/// there cannot be orphaned columns. When a row is gone, all data associated with it is gone too. +/// From a query model standpoint, the [`ChunkStore`] _always_ operates one chunk at a time: +/// - The contents of a chunk (i.e. its columns) are immutable past insertion, by virtue of +/// [`ChunkId`]s being unique and non-reusable. +/// - Similarly, garbage collection always removes _all the data_ associated with a chunk in one go: +/// there cannot be orphaned columns. When a chunk is gone, all data associated with it is gone too. /// /// Refer to field-level documentation for more information. -#[derive(Debug, Clone, PartialEq)] -pub struct StoreDiff { +#[derive(Debug, Clone)] +pub struct ChunkStoreDiff { /// Addition or deletion? /// /// The store's internals are opaque and don't necessarily reflect the query model (e.g. there /// might be data in the store that cannot by reached by any query). /// - /// A [`StoreDiff`] answers a logical question: "does there exist a query path which can return - /// data from that row?". + /// A [`ChunkStoreDiff`] answers a logical question: "does there exist a query path which can return + /// data from that chunk?". /// - /// An event of kind deletion only tells you that, from this point on, no query can return data from that row. + /// An event of kind deletion only tells you that, from this point on, no query can return data from that chunk. /// That doesn't necessarily mean that the data is actually gone, i.e. don't make assumptions of e.g. the size /// in bytes of the store based on these events. /// They are in "query-model space" and are not an accurate representation of what happens in storage space. - pub kind: StoreDiffKind, + pub kind: ChunkStoreDiffKind, - /// What's the row's [`RowId`]? - /// - /// [`RowId`]s are guaranteed to be unique within a single [`DataStore`]. - /// - /// Put another way, the same [`RowId`] can only appear twice in a [`StoreDiff`] event: - /// one addition and (optionally) one deletion (in that order!). - pub row_id: RowId, - - /// The time data associated with that row. - /// - /// Since insertions and deletions both work on a row-level basis, this is guaranteed to be the - /// same value for both the insertion and deletion events (if any). - /// - /// This is not a [`TimePoint`] for performance reasons. + /// The chunk that was added or removed. // - // NOTE: Empirical testing shows that a SmallVec isn't any better in the best case, and can be a - // significant performant drop at worst. 
- // pub times: SmallVec<[(Timeline, TimeInt); 5]>, // "5 timelines ought to be enough for anyone" - pub times: Vec<(Timeline, TimeInt)>, - - /// The [`EntityPath`] associated with that row. - /// - /// Since insertions and deletions both work on a row-level basis, this is guaranteed to be the - /// same value for both the insertion and deletion events (if any). - pub entity_path: EntityPath, - - /// All the [`DataCell`]s associated with that row. - /// - /// Since insertions and deletions both work on a row-level basis, this is guaranteed to be the - /// same set of values for both the insertion and deletion events (if any). - pub cells: IntMap, + // NOTE: We purposefully use an `Arc` instead of a `ChunkId` here because we want to make sure that all + // downstream subscribers get a chance to inspect the data in the chunk before it gets permanently + // deallocated. + pub chunk: Arc, } -impl StoreDiff { +impl PartialEq for ChunkStoreDiff { #[inline] - pub fn addition(row_id: impl Into, entity_path: impl Into) -> Self { - Self { - kind: StoreDiffKind::Addition, - row_id: row_id.into(), - times: Default::default(), - entity_path: entity_path.into(), - cells: Default::default(), - } + fn eq(&self, rhs: &Self) -> bool { + let Self { kind, chunk } = self; + *kind == rhs.kind && chunk.id() == rhs.chunk.id() } +} +impl Eq for ChunkStoreDiff {} + +impl ChunkStoreDiff { #[inline] - pub fn deletion(row_id: impl Into, entity_path: impl Into) -> Self { + pub fn addition(chunk: Arc) -> Self { Self { - kind: StoreDiffKind::Deletion, - row_id: row_id.into(), - times: Default::default(), - entity_path: entity_path.into(), - cells: Default::default(), + kind: ChunkStoreDiffKind::Addition, + chunk, } } #[inline] - pub fn at_timepoint(&mut self, timepoint: impl Into) -> &mut Self { - self.times.extend(timepoint.into()); - self - } - - #[inline] - pub fn at_timestamp( - &mut self, - timeline: impl Into, - time: impl Into, - ) -> &mut Self { - self.times.push((timeline.into(), time.into())); - self - } - - #[inline] - pub fn with_cells(&mut self, cells: impl IntoIterator) -> &mut Self { - self.cells - .extend(cells.into_iter().map(|cell| (cell.component_name(), cell))); - self - } - - #[inline] - pub fn timepoint(&self) -> TimePoint { - self.times.clone().into_iter().collect() + pub fn deletion(chunk: Arc) -> Self { + Self { + kind: ChunkStoreDiffKind::Deletion, + chunk, + } } #[inline] pub fn is_static(&self) -> bool { - self.times.is_empty() + self.chunk.is_static() } /// `-1` for deletions, `+1` for additions. 
@@ -202,21 +153,24 @@ impl StoreDiff { #[inline] pub fn num_components(&self) -> usize { - self.cells.len() + self.chunk.num_components() } } +// --- + #[cfg(test)] mod tests { use std::collections::BTreeMap; + use re_chunk::RowId; use re_log_types::{ example_components::{MyColor, MyIndex, MyPoint}, - DataRow, RowId, TimePoint, Timeline, + EntityPath, TimeInt, TimePoint, Timeline, }; - use re_types_core::Loggable as _; + use re_types_core::{ComponentName, Loggable as _}; - use crate::{DataStore, GarbageCollectionOptions}; + use crate::{ChunkStore, GarbageCollectionOptions}; use super::*; @@ -255,26 +209,33 @@ mod tests { } impl GlobalCounts { - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { for event in events { - let delta = event.delta(); + let delta_chunks = event.delta(); + let delta_rows = delta_chunks * event.chunk.num_rows() as i64; - *self.row_ids.entry(event.row_id).or_default() += delta; + for row_id in event.chunk.row_ids() { + *self.row_ids.entry(row_id).or_default() += delta_chunks; + } *self .entity_paths - .entry(event.entity_path.clone()) - .or_default() += delta; + .entry(event.chunk.entity_path().clone()) + .or_default() += delta_chunks; - for component_name in event.cells.keys() { + for (component_name, list_array) in event.chunk.components() { + let delta = + event.delta() * list_array.iter().filter(Option::is_some).count() as i64; *self.component_names.entry(*component_name).or_default() += delta; } if event.is_static() { - self.num_static += delta; + self.num_static += delta_rows; } else { - for &(timeline, time) in &event.times { - *self.timelines.entry(timeline).or_default() += delta; - *self.times.entry(time).or_default() += delta; + for (&timeline, time_chunk) in event.chunk.timelines() { + *self.timelines.entry(timeline).or_default() += delta_rows; + for time in time_chunk.times() { + *self.times.entry(time).or_default() += delta_chunks; + } } } } @@ -283,7 +244,7 @@ mod tests { #[test] fn store_events() -> anyhow::Result<()> { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -301,14 +262,11 @@ mod tests { (timeline_yet_another, 1), // ]); let entity_path1: EntityPath = "entity_a".into(); - let row1 = DataRow::from_component_batches( - row_id1, - timepoint1.clone(), - entity_path1.clone(), - [&MyIndex::from_iter(0..10) as _], - )?; + let chunk1 = Chunk::builder(entity_path1.clone()) + .with_component_batch(row_id1, timepoint1.clone(), &MyIndex::from_iter(0..10)) + .build()?; - view.on_events(&[store.insert_row(&row1)?]); + view.on_events(&[store.insert_chunk(&Arc::new(chunk1))?.unwrap()]); similar_asserts::assert_eq!( GlobalCounts::new( @@ -342,21 +300,18 @@ mod tests { (timeline_yet_another, 1), // ]); let entity_path2: EntityPath = "entity_b".into(); - let row2 = { + let chunk2 = { let num_instances = 3; let points: Vec<_> = (0..num_instances) .map(|i| MyPoint::new(0.0, i as f32)) .collect(); let colors = vec![MyColor::from(0xFF0000FF)]; - DataRow::from_component_batches( - row_id2, - timepoint2.clone(), - entity_path2.clone(), - [&points as _, &colors as _], - )? + Chunk::builder(entity_path2.clone()) + .with_component_batches(row_id2, timepoint2.clone(), [&points as _, &colors as _]) + .build()? 
}; - view.on_events(&[store.insert_row(&row2)?]); + view.on_events(&[store.insert_chunk(&Arc::new(chunk2))?.unwrap()]); similar_asserts::assert_eq!( GlobalCounts::new( @@ -390,21 +345,22 @@ mod tests { let row_id3 = RowId::new(); let timepoint3 = TimePoint::default(); - let row3 = { + let chunk3 = { let num_instances = 6; let colors = vec![MyColor::from(0x00DD00FF); num_instances]; - DataRow::from_component_batches( - row_id3, - timepoint3.clone(), - entity_path2.clone(), - [ - &MyIndex::from_iter(0..num_instances as _) as _, - &colors as _, - ], - )? + Chunk::builder(entity_path2.clone()) + .with_component_batches( + row_id3, + timepoint3.clone(), + [ + &MyIndex::from_iter(0..num_instances as _) as _, + &colors as _, + ], + ) + .build()? }; - view.on_events(&[store.insert_row(&row3)?]); + view.on_events(&[store.insert_chunk(&Arc::new(chunk3))?.unwrap()]); similar_asserts::assert_eq!( GlobalCounts::new( diff --git a/crates/re_chunk_store/src/gc.rs b/crates/re_chunk_store/src/gc.rs new file mode 100644 index 000000000000..0a54b1448aa1 --- /dev/null +++ b/crates/re_chunk_store/src/gc.rs @@ -0,0 +1,534 @@ +use std::{ + collections::{btree_map::Entry as BTreeMapEntry, BTreeSet}, + time::Duration, +}; + +use ahash::{HashMap, HashSet}; +use nohash_hasher::{IntMap, IntSet}; +use re_chunk::{Chunk, ChunkId}; +use web_time::Instant; + +use re_log_types::{EntityPath, TimeInt, Timeline}; +use re_types_core::{ComponentName, SizeBytes}; + +use crate::{ + store::ChunkIdSetPerTime, ChunkStore, ChunkStoreChunkStats, ChunkStoreDiff, ChunkStoreDiffKind, + ChunkStoreEvent, ChunkStoreStats, +}; + +// Used all over in docstrings. +#[allow(unused_imports)] +use crate::RowId; + +// --- + +#[derive(Debug, Clone, Copy)] +pub enum GarbageCollectionTarget { + /// Try to drop _at least_ the given fraction. + /// + /// The fraction must be a float in the range [0.0 : 1.0]. + DropAtLeastFraction(f64), + + /// GC Everything that isn't protected. + Everything, +} + +#[derive(Debug, Clone)] +pub struct GarbageCollectionOptions { + /// What target threshold should the GC try to meet. + pub target: GarbageCollectionTarget, + + /// How long the garbage collection in allowed to run for. + /// + /// Trades off latency for throughput: + /// - A smaller `time_budget` will clear less data in a shorter amount of time, allowing for a + /// more responsive UI at the cost of more GC overhead and more frequent runs. + /// - A larger `time_budget` will clear more data in a longer amount of time, increasing the + /// chance of UI freeze frames but decreasing GC overhead and running less often. + /// + /// The default is an unbounded time budget (i.e. throughput only). + pub time_budget: Duration, + + /// How many component revisions to preserve on each timeline. + pub protect_latest: usize, + + /// Components which should not be protected from GC when using + /// [`GarbageCollectionOptions::protect_latest`]. + // + // TODO(#6552): this should be removed in favor of a dedicated `remove_entity_path` API. + pub dont_protect_components: IntSet, + + /// Timelines which should not be protected from GC when using `protect_latest` + /// [`GarbageCollectionOptions::protect_latest`]. + // + // TODO(#6552): this should be removed in favor of a dedicated `remove_entity_path` API. 
+    pub dont_protect_timelines: IntSet<Timeline>,
+}
+
+impl GarbageCollectionOptions {
+    pub fn gc_everything() -> Self {
+        Self {
+            target: GarbageCollectionTarget::Everything,
+            time_budget: std::time::Duration::MAX,
+            protect_latest: 0,
+            dont_protect_components: Default::default(),
+            dont_protect_timelines: Default::default(),
+        }
+    }
+}
+
+impl std::fmt::Display for GarbageCollectionTarget {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::DropAtLeastFraction(p) => {
+                write!(f, "DropAtLeast({:.3}%)", *p * 100.0)
+            }
+            Self::Everything => write!(f, "Everything"),
+        }
+    }
+}
+
+impl ChunkStore {
+    /// Triggers a garbage collection according to the desired `target`.
+    ///
+    /// Returns the list of `Chunk`s that were purged from the store in the form of [`ChunkStoreEvent`]s.
+    ///
+    /// ## Semantics
+    ///
+    /// Garbage collection works on a chunk-level basis and is driven by [`RowId`] order
+    /// (specifically, the smallest `RowId` of each respective Chunk), i.e. the order defined
+    /// by the clients' wall-clocks, allowing it to drop data across the different timelines in
+    /// a fair, deterministic manner.
+    /// Similarly, out-of-order data is supported out of the box.
+    ///
+    /// The garbage collector doesn't deallocate data in and of itself: all it does is drop the
+    /// store's internal references to that data (the `Chunk`s), which will be deallocated once
+    /// their reference count reaches 0.
+    ///
+    /// ## Limitations
+    ///
+    /// The garbage collector has limited support for latest-at semantics. The configuration option:
+    /// [`GarbageCollectionOptions::protect_latest`] will protect the N latest values of each
+    /// component on each timeline. The only practical guarantee this gives is that a latest-at query
+    /// with a value of max-int will be unchanged. However, latest-at queries from other arbitrary
+    /// points in time may provide different results pre- and post- GC.
+ pub fn gc( + &mut self, + options: &GarbageCollectionOptions, + ) -> (Vec, ChunkStoreStats) { + re_tracing::profile_function!(); + + self.gc_id += 1; + + let stats_before = self.stats(); + + let total_size_bytes_before = stats_before.total().total_size_bytes as f64; + let total_num_chunks_before = stats_before.total().num_chunks; + let total_num_rows_before = stats_before.total().total_num_rows; + + let protected_chunk_ids = self.find_all_protected_chunk_ids( + options.protect_latest, + &options.dont_protect_components, + &options.dont_protect_timelines, + ); + + let diffs = match options.target { + GarbageCollectionTarget::DropAtLeastFraction(p) => { + assert!((0.0..=1.0).contains(&p)); + + let num_bytes_to_drop = total_size_bytes_before * p; + let target_size_bytes = total_size_bytes_before - num_bytes_to_drop; + + re_log::trace!( + kind = "gc", + id = self.gc_id, + %options.target, + total_num_chunks_before = re_format::format_uint(total_num_chunks_before), + total_num_rows_before = re_format::format_uint(total_num_rows_before), + total_size_bytes_before = re_format::format_bytes(total_size_bytes_before), + target_size_bytes = re_format::format_bytes(target_size_bytes), + drop_at_least_num_bytes = re_format::format_bytes(num_bytes_to_drop), + "starting GC" + ); + + self.gc_drop_at_least_num_bytes(options, num_bytes_to_drop, &protected_chunk_ids) + } + GarbageCollectionTarget::Everything => { + re_log::trace!( + kind = "gc", + id = self.gc_id, + %options.target, + total_num_rows_before = re_format::format_uint(total_num_rows_before), + total_size_bytes_before = re_format::format_bytes(total_size_bytes_before), + "starting GC" + ); + + self.gc_drop_at_least_num_bytes(options, f64::INFINITY, &protected_chunk_ids) + } + }; + + let stats_after = self.stats(); + let total_size_bytes_after = stats_after.total().total_size_bytes as f64; + let total_num_chunks_after = stats_after.total().num_chunks; + let total_num_rows_after = stats_after.total().total_num_rows; + + re_log::trace!( + kind = "gc", + id = self.gc_id, + %options.target, + total_num_chunks_before = re_format::format_uint(total_num_chunks_before), + total_num_rows_before = re_format::format_uint(total_num_rows_before), + total_size_bytes_before = re_format::format_bytes(total_size_bytes_before), + total_num_chunks_after = re_format::format_uint(total_num_chunks_after), + total_num_rows_after = re_format::format_uint(total_num_rows_after), + total_size_bytes_after = re_format::format_bytes(total_size_bytes_after), + "GC done" + ); + + let events: Vec<_> = diffs + .into_iter() + .map(|diff| ChunkStoreEvent { + store_id: self.id.clone(), + store_generation: self.generation(), + event_id: self + .event_id + .fetch_add(1, std::sync::atomic::Ordering::Relaxed), + diff, + }) + .collect(); + + { + if cfg!(debug_assertions) { + let any_event_other_than_deletion = events + .iter() + .any(|e| e.kind != ChunkStoreDiffKind::Deletion); + assert!(!any_event_other_than_deletion); + } + + Self::on_events(&events); + } + + (events, stats_before - stats_after) + } + + /// For each `EntityPath`, `Timeline`, `Component` find the N latest [`ChunkId`]s. + // + // TODO(jleibs): More complex functionality might required expanding this to also + // *ignore* specific entities, components, timelines, etc. for this protection. 
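As a concrete reading of the `protect_latest` guarantee described above: even after an `Everything` collection, the newest chunk per component stays reachable for a latest-at query at the maximum time. A hedged sketch (the helper name is illustrative; the APIs are those introduced in this diff):

```rust
use std::sync::Arc;

use re_chunk::{Chunk, LatestAtQuery, RowId, TimeInt};
use re_chunk_store::{ChunkStore, GarbageCollectionOptions};
use re_log_types::{build_frame_nr, example_components::MyPoint, EntityPath, Timeline};
use re_types_core::Loggable as _;

fn protected_latest_survives_gc(store: &mut ChunkStore) -> anyhow::Result<()> {
    let entity_path: EntityPath = "points".into();
    for frame_nr in 0..10i64 {
        let chunk = Chunk::builder(entity_path.clone())
            .with_component_batch(RowId::new(), [build_frame_nr(frame_nr)], &[MyPoint::new(0.0, 0.0)])
            .build()?;
        store.insert_chunk(&Arc::new(chunk))?;
    }

    // Drop everything that isn't protected, but keep the latest revision of each component around.
    let mut options = GarbageCollectionOptions::gc_everything();
    options.protect_latest = 1;
    store.gc(&options);

    // A latest-at query at the maximum time still finds a relevant chunk.
    let query = LatestAtQuery::new(Timeline::new_sequence("frame_nr"), TimeInt::MAX);
    let chunks = store.latest_at_relevant_chunks(&query, &entity_path, MyPoint::name());
    assert!(!chunks.is_empty());

    Ok(())
}
```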
+ fn find_all_protected_chunk_ids( + &self, + target_count: usize, + dont_protect_components: &IntSet, + dont_protect_timelines: &IntSet, + ) -> BTreeSet { + re_tracing::profile_function!(); + + if target_count == 0 { + return Default::default(); + } + + self.temporal_chunk_ids_per_entity + .values() + .flat_map(|temporal_chunk_ids_per_timeline| { + temporal_chunk_ids_per_timeline + .iter() + .filter_map(|(timeline, temporal_chunk_ids_per_component)| { + (!dont_protect_timelines.contains(timeline)) + .then_some(temporal_chunk_ids_per_component) + }) + .flat_map(|temporal_chunk_ids_per_component| { + temporal_chunk_ids_per_component + .iter() + .filter(|(component_name, _)| { + !dont_protect_components.contains(component_name) + }) + .flat_map(|(_, temporal_chunk_ids_per_time)| { + temporal_chunk_ids_per_time + .per_start_time + .last_key_value() + .map(|(_, chunk_ids)| chunk_ids.iter().copied()) + .into_iter() + .flatten() + .chain( + temporal_chunk_ids_per_time + .per_end_time + .last_key_value() + .map(|(_, chunk_ids)| chunk_ids.iter().copied()) + .into_iter() + .flatten(), + ) + .collect::>() + .into_iter() + .rev() + .take(target_count) + }) + }) + }) + .collect() + } + + fn gc_drop_at_least_num_bytes( + &mut self, + options: &GarbageCollectionOptions, + mut num_bytes_to_drop: f64, + protected_chunk_ids: &BTreeSet, + ) -> Vec { + re_tracing::profile_function!(re_format::format_bytes(num_bytes_to_drop)); + + type RemovableChunkIdPerTimePerComponentPerTimelinePerEntity = IntMap< + EntityPath, + IntMap>>>, + >; + + let mut chunk_ids_to_be_removed = + RemovableChunkIdPerTimePerComponentPerTimelinePerEntity::default(); + let mut chunk_ids_dangling = HashSet::default(); + + let start_time = Instant::now(); + + { + re_tracing::profile_scope!("mark"); + + for chunk_id in self + .chunk_ids_per_min_row_id + .values() + .flatten() + .filter(|chunk_id| !protected_chunk_ids.contains(chunk_id)) + { + if let Some(chunk) = self.chunks_per_chunk_id.get(chunk_id) { + // NOTE: Do _NOT_ use `chunk.total_size_bytes` as it is sitting behind an Arc + // and would count as amortized (i.e. 0 bytes). + num_bytes_to_drop -= ::total_size_bytes(chunk) as f64; + + // NOTE: We cannot blindly `retain` across all temporal tables, it's way too costly + // and slow. Rather we need to surgically remove the superfluous chunks. + let entity_path = chunk.entity_path(); + let per_timeline = chunk_ids_to_be_removed + .entry(entity_path.clone()) + .or_default(); + for (&timeline, time_chunk) in chunk.timelines() { + let per_component = per_timeline.entry(timeline).or_default(); + for component_name in chunk.component_names() { + let per_time = per_component.entry(component_name).or_default(); + + // NOTE: As usual, these are vectors of `ChunkId`s, as it is legal to + // have perfectly overlapping chunks. + let time_range = time_chunk.time_range(); + per_time + .entry(time_range.min()) + .or_default() + .push(chunk.id()); + if time_range.min() != time_range.max() { + per_time + .entry(time_range.max()) + .or_default() + .push(chunk.id()); + } + } + } + } else { + chunk_ids_dangling.insert(*chunk_id); + } + + // NOTE: There is no point in spending more than a fourth of the time budget on the + // mark phase or there is no way the sweep phase will have any time to do anything + // with the results anyhow. 
+ if start_time.elapsed() >= options.time_budget / 4 || num_bytes_to_drop <= 0.0 { + break; + } + } + } + + { + re_tracing::profile_scope!("sweep"); + + let Self { + id: _, + config: _, + type_registry: _, + chunks_per_chunk_id, + chunk_ids_per_min_row_id: chunk_id_per_min_row_id, + temporal_chunk_ids_per_entity, + temporal_chunks_stats, + static_chunk_ids_per_entity: _, // we don't GC static data + static_chunks_stats: _, // we don't GC static data + insert_id: _, + query_id: _, + gc_id: _, + event_id: _, + } = self; + + let mut diffs = Vec::new(); + + // NOTE: Dangling chunks should never happen: it is the job of the GC to ensure that. + // + // In release builds, we still want to do the nice thing and clean them up as best as we + // can in order to prevent OOMs. + // + // We should really never be in there, so don't bother accounting that in the time + // budget. + debug_assert!( + chunk_ids_dangling.is_empty(), + "detected dangling chunks -- there's a GC bug" + ); + if !chunk_ids_dangling.is_empty() { + re_tracing::profile_scope!("dangling"); + + chunk_id_per_min_row_id.retain(|_row_id, chunk_ids| { + chunk_ids.retain(|chunk_id| !chunk_ids_dangling.contains(chunk_id)); + !chunk_ids.is_empty() + }); + + for temporal_chunk_ids_per_component in temporal_chunk_ids_per_entity.values_mut() { + for temporal_chunk_ids_per_timeline in + temporal_chunk_ids_per_component.values_mut() + { + for temporal_chunk_ids_per_time in + temporal_chunk_ids_per_timeline.values_mut() + { + let ChunkIdSetPerTime { + max_interval_length: _, + per_start_time, + per_end_time, + } = temporal_chunk_ids_per_time; + + // TODO(cmc): Technically, the optimal thing to do would be to + // recompute `max_interval_length` per time here. + // In practice, this adds a lot of complexity for likely very little + // performance benefit, since we expect the chunks to have similar + // interval lengths on the happy path. + + for chunk_ids in per_start_time.values_mut() { + chunk_ids.retain(|chunk_id| !chunk_ids_dangling.contains(chunk_id)); + } + for chunk_ids in per_end_time.values_mut() { + chunk_ids.retain(|chunk_id| !chunk_ids_dangling.contains(chunk_id)); + } + } + } + } + + diffs.extend( + chunk_ids_dangling + .into_iter() + .filter_map(|chunk_id| chunks_per_chunk_id.remove(&chunk_id)) + .map(ChunkStoreDiff::deletion), + ); + } + + if !chunk_ids_to_be_removed.is_empty() { + re_tracing::profile_scope!("standard"); + + // NOTE: We cannot blindly `retain` across all temporal tables, it's way too costly + // and slow. Rather we need to surgically remove the superfluous chunks. 
+ + let mut chunk_ids_removed = HashSet::default(); + + for (entity_path, chunk_ids_to_be_removed) in chunk_ids_to_be_removed { + let BTreeMapEntry::Occupied(mut temporal_chunk_ids_per_timeline) = + temporal_chunk_ids_per_entity.entry(entity_path) + else { + continue; + }; + + for (timeline, chunk_ids_to_be_removed) in chunk_ids_to_be_removed { + let BTreeMapEntry::Occupied(mut temporal_chunk_ids_per_component) = + temporal_chunk_ids_per_timeline.get_mut().entry(timeline) + else { + continue; + }; + + for (component_name, chunk_ids_to_be_removed) in chunk_ids_to_be_removed { + let BTreeMapEntry::Occupied(mut temporal_chunk_ids_per_time) = + temporal_chunk_ids_per_component + .get_mut() + .entry(component_name) + else { + continue; + }; + + let ChunkIdSetPerTime { + max_interval_length: _, + per_start_time, + per_end_time, + } = temporal_chunk_ids_per_time.get_mut(); + + // TODO(cmc): Technically, the optimal thing to do would be to + // recompute `max_interval_length` per time here. + // In practice, this adds a lot of complexity for likely very little + // performance benefit, since we expect the chunks to have similar + // interval lengths on the happy path. + + for (time, chunk_ids) in chunk_ids_to_be_removed { + if let BTreeMapEntry::Occupied(mut chunk_id_set) = + per_start_time.entry(time) + { + for chunk_id in &chunk_ids { + chunk_id_set.get_mut().remove(chunk_id); + } + if chunk_id_set.get().is_empty() { + chunk_id_set.remove_entry(); + } + } + + if let BTreeMapEntry::Occupied(mut chunk_id_set) = + per_end_time.entry(time) + { + for chunk_id in &chunk_ids { + chunk_id_set.get_mut().remove(chunk_id); + } + if chunk_id_set.get().is_empty() { + chunk_id_set.remove_entry(); + } + } + + chunk_ids_removed.extend(chunk_ids); + } + + if per_start_time.is_empty() && per_end_time.is_empty() { + temporal_chunk_ids_per_time.remove_entry(); + } + + if start_time.elapsed() >= options.time_budget { + break; + } + } + + if temporal_chunk_ids_per_component.get().is_empty() { + temporal_chunk_ids_per_component.remove_entry(); + } + + if start_time.elapsed() >= options.time_budget { + break; + } + } + + if temporal_chunk_ids_per_timeline.get().is_empty() { + temporal_chunk_ids_per_timeline.remove_entry(); + } + + if start_time.elapsed() >= options.time_budget { + break; + } + } + + chunk_id_per_min_row_id.retain(|_row_id, chunk_ids| { + chunk_ids.retain(|chunk_id| !chunk_ids_removed.contains(chunk_id)); + !chunk_ids.is_empty() + }); + + diffs.extend( + chunk_ids_removed + .into_iter() + .filter_map(|chunk_id| chunks_per_chunk_id.remove(&chunk_id)) + .inspect(|chunk| { + *temporal_chunks_stats -= ChunkStoreChunkStats::from_chunk(chunk); + }) + .map(ChunkStoreDiff::deletion), + ); + } + + diffs + } + } +} diff --git a/crates/re_chunk_store/src/lib.rs b/crates/re_chunk_store/src/lib.rs new file mode 100644 index 000000000000..d7d72afabd04 --- /dev/null +++ b/crates/re_chunk_store/src/lib.rs @@ -0,0 +1,49 @@ +//! The Rerun chunk store, implemented on top of [Apache Arrow](https://arrow.apache.org/) +//! using the [`arrow2`] crate. +//! +//! This crate is an in-memory time series database for Rerun log data. +//! It is indexed by Entity path, component, timeline, and time. +//! It supports out-of-order insertions, and fast `O(log(N))` queries. +//! +//! * See [`ChunkStore`] for an overview of the core data structures. +//! * See [`ChunkStore::latest_at_relevant_chunks`] and [`ChunkStore::range_relevant_chunks`] +//! for the documentation of the public read APIs. +//! 
* See [`ChunkStore::insert_chunk`] for the documentation of the public write APIs. +//! +//! ## Feature flags +#![doc = document_features::document_features!()] +//! + +mod events; +mod gc; +mod query; +mod stats; +mod store; +mod subscribers; +mod writes; + +pub use self::events::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent}; +pub use self::gc::{GarbageCollectionOptions, GarbageCollectionTarget}; +pub use self::stats::{ChunkStoreChunkStats, ChunkStoreStats}; +pub use self::store::{ChunkStore, ChunkStoreConfig, ChunkStoreGeneration}; +pub use self::subscribers::{ChunkStoreSubscriber, ChunkStoreSubscriberHandle}; + +// Re-exports +#[doc(no_inline)] +pub use re_chunk::{Chunk, ChunkId, LatestAtQuery, RangeQuery, RowId}; +#[doc(no_inline)] +pub use re_log_types::{ResolvedTimeRange, TimeInt, TimeType, Timeline}; + +pub mod external { + pub use re_chunk; +} + +// --- + +#[derive(thiserror::Error, Debug)] +pub enum ChunkStoreError { + #[error("Chunks must be sorted before insertion in the chunk store")] + UnsortedChunk, +} + +pub type ChunkStoreResult = ::std::result::Result; diff --git a/crates/re_chunk_store/src/query.rs b/crates/re_chunk_store/src/query.rs new file mode 100644 index 000000000000..3b6cde149f95 --- /dev/null +++ b/crates/re_chunk_store/src/query.rs @@ -0,0 +1,270 @@ +use std::{ + collections::BTreeSet, + sync::{atomic::Ordering, Arc}, +}; + +use re_chunk::{Chunk, LatestAtQuery, RangeQuery}; +use re_log_types::{EntityPath, TimeInt, Timeline}; +use re_types_core::{ComponentName, ComponentNameSet}; + +use crate::ChunkStore; + +// Used all over in docstrings. +#[allow(unused_imports)] +use crate::RowId; + +// --- + +impl ChunkStore { + /// Retrieve all the [`ComponentName`]s that have been written to for a given [`EntityPath`] on + /// the specified [`Timeline`]. + /// + /// Static components are always included in the results. + /// + /// Returns `None` if the entity doesn't exist at all on this `timeline`. + pub fn all_components( + &self, + timeline: &Timeline, + entity_path: &EntityPath, + ) -> Option { + re_tracing::profile_function!(); + + self.query_id.fetch_add(1, Ordering::Relaxed); + + let static_components: Option = self + .static_chunk_ids_per_entity + .get(entity_path) + .map(|static_chunks_per_component| { + static_chunks_per_component.keys().copied().collect() + }); + + let temporal_components: Option = self + .temporal_chunk_ids_per_entity + .get(entity_path) + .map(|temporal_chunk_ids_per_timeline| { + temporal_chunk_ids_per_timeline + .iter() + .filter(|(cur_timeline, _)| *cur_timeline == timeline) + .flat_map(|(_, temporal_chunk_ids_per_component)| { + temporal_chunk_ids_per_component.keys().copied() + }) + .collect() + }); + + match (static_components, temporal_components) { + (None, None) => None, + (None, comps @ Some(_)) | (comps @ Some(_), None) => comps, + (Some(static_comps), Some(temporal_comps)) => { + Some(static_comps.into_iter().chain(temporal_comps).collect()) + } + } + } + + /// Check whether a given entity has a specific [`ComponentName`] either on the specified + /// timeline, or in its static data. + #[inline] + pub fn entity_has_component( + &self, + timeline: &Timeline, + entity_path: &EntityPath, + component_name: &ComponentName, + ) -> bool { + re_tracing::profile_function!(); + self.all_components(timeline, entity_path) + .map_or(false, |components| components.contains(component_name)) + } + + /// Find the earliest time at which something was logged for a given entity on the specified + /// timeline. + /// + /// Ignores static data. 
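The metadata helpers in this module (`all_components`, `entity_has_component`, and `entity_min_time` just below) are typically used to probe what an entity contains before running real queries. An illustrative sketch, not part of the patch (the function name is made up; everything else comes from this diff):

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId, TimeInt};
use re_chunk_store::ChunkStore;
use re_log_types::{
    build_frame_nr,
    example_components::{MyColor, MyPoint},
    EntityPath, TimeType, Timeline,
};
use re_types_core::Loggable as _;

fn inspect_entity(store: &mut ChunkStore) -> anyhow::Result<()> {
    let entity_path: EntityPath = "this/that".into();

    let points = vec![MyPoint::new(1.0, 2.0)];
    let colors = vec![MyColor::from(0xFF0000FF)];
    let chunk = Chunk::builder(entity_path.clone())
        .with_component_batches(RowId::new(), [build_frame_nr(42)], [&points as _, &colors as _])
        .build()?;
    store.insert_chunk(&Arc::new(chunk))?;

    let frame_nr = Timeline::new("frame_nr", TimeType::Sequence);

    // Which components have ever been written for this entity on this timeline?
    assert!(store
        .all_components(&frame_nr, &entity_path)
        .map_or(false, |components| components.contains(&MyPoint::name())));

    // Point-wise variants of the same information.
    assert!(store.entity_has_component(&frame_nr, &entity_path, &MyColor::name()));
    assert_eq!(
        store.entity_min_time(&frame_nr, &entity_path),
        Some(TimeInt::new_temporal(42)),
    );

    Ok(())
}
```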
+ #[inline] + pub fn entity_min_time( + &self, + timeline: &Timeline, + entity_path: &EntityPath, + ) -> Option { + let temporal_chunk_ids_per_timeline = + self.temporal_chunk_ids_per_entity.get(entity_path)?; + let temporal_chunk_ids_per_component = temporal_chunk_ids_per_timeline.get(timeline)?; + + let mut time_min = TimeInt::MAX; + for temporal_chunk_ids_per_time in temporal_chunk_ids_per_component.values() { + let Some(time) = temporal_chunk_ids_per_time + .per_start_time + .first_key_value() + .map(|(time, _)| *time) + else { + continue; + }; + time_min = TimeInt::min(time_min, time); + } + + (time_min != TimeInt::MAX).then_some(time_min) + } + + /// Returns the most-relevant chunk(s) for the given [`LatestAtQuery`]. + /// + /// The [`ChunkStore`] always work at the [`Chunk`] level (as opposed to the row level): it is + /// oblivious to the data therein. + /// For that reason, and because [`Chunk`]s are allowed to temporally overlap, it is possible + /// that a query has more than one relevant chunk. + /// + /// The caller should filter the returned chunks further (see [`Chunk::latest_at`]) in order to + /// determine what exact row contains the final result. + /// + /// If the entity has static component data associated with it, it will unconditionally + /// override any temporal component data. + pub fn latest_at_relevant_chunks( + &self, + query: &LatestAtQuery, + entity_path: &EntityPath, + component_name: ComponentName, + ) -> Vec> { + re_tracing::profile_function!(format!("{query:?}")); + + self.query_id.fetch_add(1, Ordering::Relaxed); + + // Reminder: if a chunk has been indexed for a given component, then it must contain at + // least one non-null value for that column. + + if let Some(static_chunk) = self + .static_chunk_ids_per_entity + .get(entity_path) + .and_then(|static_chunks_per_component| { + static_chunks_per_component.get(&component_name) + }) + .and_then(|chunk_id| self.chunks_per_chunk_id.get(chunk_id)) + { + return vec![Arc::clone(static_chunk)]; + } + + if let Some(temporal_chunk_ids) = self + .temporal_chunk_ids_per_entity + .get(entity_path) + .and_then(|temporal_chunk_ids_per_timeline| { + temporal_chunk_ids_per_timeline.get(&query.timeline()) + }) + .and_then(|temporal_chunk_ids_per_component| { + temporal_chunk_ids_per_component.get(&component_name) + }) + .and_then(|temporal_chunk_ids_per_time| { + let upper_bound = temporal_chunk_ids_per_time + .per_start_time + .range(..=query.at()) + .next_back() + .map(|(time, _)| *time)?; + + // Overlapped chunks + // ================= + // + // To deal with potentially overlapping chunks, we keep track of the longest + // interval in the entire map, which gives us an upper bound on how much we + // would need to walk backwards in order to find all potential overlaps. + // + // This is a fairly simple solution that scales much better than interval-tree + // based alternatives, both in terms of complexity and performance, in the normal + // case where most chunks in a collection have similar lengths. + // + // The most degenerate case -- a single chunk overlaps everything else -- results + // in `O(n)` performance, which gets amortized by the query cache. + // If that turns out to be a problem in practice, we can experiment with more + // complex solutions then. 
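Because of the overlap handling described above, a latest-at read is a two-step affair: ask the store for the relevant chunks, then let `Chunk::latest_at` plus `(data_time, row_id)` ordering pick the winning row. This mirrors the `query_latest_component` helper used by the tests later in this diff; the free function and the use of `MyPoint` below are illustrative:

```rust
use itertools::Itertools as _;
use re_chunk::{LatestAtQuery, TimeInt};
use re_chunk_store::ChunkStore;
use re_log_types::{example_components::MyPoint, EntityPath, TimeType, Timeline};
use re_types_core::Loggable as _;

/// Reduces the relevant chunks of a latest-at query down to the single winning value.
fn latest_point(store: &ChunkStore, entity_path: &EntityPath, at: TimeInt) -> Option<MyPoint> {
    let query = LatestAtQuery::new(Timeline::new("frame_nr", TimeType::Sequence), at);

    let (_data_time, _row_id, array) = store
        .latest_at_relevant_chunks(&query, entity_path, MyPoint::name())
        .into_iter()
        // Narrow each candidate chunk down to its own latest-at result…
        .flat_map(|chunk| {
            chunk
                .latest_at(&query, MyPoint::name())
                .iter_rows(&query.timeline(), &MyPoint::name())
                .collect_vec()
        })
        // …then let `(data_time, row_id)` ordering pick the overall winner.
        .max_by_key(|(data_time, row_id, _)| (*data_time, *row_id))
        .and_then(|(data_time, row_id, array)| array.map(|array| (data_time, row_id, array)))?;

    MyPoint::from_arrow(&*array).ok()?.first().cloned()
}
```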
+ let lower_bound = upper_bound + .as_i64() + .saturating_sub(temporal_chunk_ids_per_time.max_interval_length as _); + + Some( + temporal_chunk_ids_per_time + .per_start_time + .range(..=query.at()) + .rev() + .take_while(|(time, _)| time.as_i64() >= lower_bound) + .flat_map(|(_time, chunk_ids)| chunk_ids.iter()) + .copied() + .collect::>(), + ) + }) + { + return temporal_chunk_ids + .iter() + .filter_map(|chunk_id| self.chunks_per_chunk_id.get(chunk_id).cloned()) + .collect(); + } + + Vec::new() + } + + /// Returns the most-relevant chunk(s) for the given [`RangeQuery`]. + /// + /// The criteria for returning a chunk is only that it may contain data that overlaps with + /// the queried range. + /// + /// The caller should filter the returned chunks further (see [`Chunk::range`]) in order to + /// determine how exactly each row of data fit with the rest. + /// + /// If the entity has static component data associated with it, it will unconditionally + /// override any temporal component data. + pub fn range_relevant_chunks( + &self, + query: &RangeQuery, + entity_path: &EntityPath, + component_name: ComponentName, + ) -> Vec> { + re_tracing::profile_function!(format!("{query:?}")); + + self.query_id.fetch_add(1, Ordering::Relaxed); + + if let Some(static_chunk) = self + .static_chunk_ids_per_entity + .get(entity_path) + .and_then(|static_chunks_per_component| { + static_chunks_per_component.get(&component_name) + }) + .and_then(|chunk_id| self.chunks_per_chunk_id.get(chunk_id)) + { + return vec![Arc::clone(static_chunk)]; + } + + self.temporal_chunk_ids_per_entity + .get(entity_path) + .and_then(|temporal_chunk_ids_per_timeline| { + temporal_chunk_ids_per_timeline.get(&query.timeline()) + }) + .and_then(|temporal_chunk_ids_per_component| { + temporal_chunk_ids_per_component.get(&component_name) + }) + .into_iter() + .map(|temporal_chunk_ids_per_time| { + let start_time = temporal_chunk_ids_per_time + .per_start_time + .range(..=query.range.min()) + .next_back() + .map_or(TimeInt::MIN, |(&time, _)| time); + + let end_time = temporal_chunk_ids_per_time + .per_start_time + .range(..=query.range.max()) + .next_back() + .map_or(start_time, |(&time, _)| time); + + // NOTE: Just being extra cautious because, even though this shouldnt possibly ever happen, + // indexing a std map with a backwards range is an instant crash. 
+ let end_time = TimeInt::max(start_time, end_time); + + (start_time, end_time, temporal_chunk_ids_per_time) + }) + .flat_map(|(start_time, end_time, temporal_chunk_ids_per_time)| { + temporal_chunk_ids_per_time + .per_start_time + .range(start_time..=end_time) + .map(|(_time, chunk_ids)| chunk_ids) + }) + .flat_map(|temporal_chunk_ids| { + temporal_chunk_ids + .iter() + .filter_map(|chunk_id| self.chunks_per_chunk_id.get(chunk_id).cloned()) + }) + .collect() + } +} diff --git a/crates/re_chunk_store/src/stats.rs b/crates/re_chunk_store/src/stats.rs new file mode 100644 index 000000000000..fd713afe2c0e --- /dev/null +++ b/crates/re_chunk_store/src/stats.rs @@ -0,0 +1,166 @@ +use std::sync::Arc; + +use re_chunk::Chunk; +use re_types_core::SizeBytes; + +use crate::ChunkStore; + +// --- + +#[derive(Default, Debug, Clone, Copy)] +pub struct ChunkStoreStats { + pub static_chunks: ChunkStoreChunkStats, + pub temporal_chunks: ChunkStoreChunkStats, +} + +impl ChunkStoreStats { + #[inline] + pub fn total(&self) -> ChunkStoreChunkStats { + let Self { + static_chunks, + temporal_chunks, + } = *self; + static_chunks + temporal_chunks + } +} + +impl std::ops::Add for ChunkStoreStats { + type Output = Self; + + #[inline] + fn add(self, rhs: Self) -> Self::Output { + let Self { + static_chunks, + temporal_chunks, + } = self; + + let static_chunks = static_chunks + rhs.static_chunks; + let temporal_chunks = temporal_chunks + rhs.temporal_chunks; + + Self { + static_chunks, + temporal_chunks, + } + } +} + +impl std::ops::Sub for ChunkStoreStats { + type Output = Self; + + #[inline] + fn sub(self, rhs: Self) -> Self::Output { + let Self { + static_chunks, + temporal_chunks, + } = self; + + let static_chunks = static_chunks - rhs.static_chunks; + let temporal_chunks = temporal_chunks - rhs.temporal_chunks; + + Self { + static_chunks, + temporal_chunks, + } + } +} + +impl ChunkStore { + #[inline] + pub fn stats(&self) -> ChunkStoreStats { + ChunkStoreStats { + static_chunks: self.static_chunks_stats, + temporal_chunks: self.temporal_chunks_stats, + } + } +} + +// --- + +#[derive(Default, Debug, Clone, Copy)] +pub struct ChunkStoreChunkStats { + pub num_chunks: u64, + pub total_size_bytes: u64, + pub total_num_rows: u64, +} + +impl std::fmt::Display for ChunkStoreChunkStats { + #[inline] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + num_chunks, + total_size_bytes, + total_num_rows, + } = *self; + + f.write_fmt(format_args!( + "num_chunks: {}\n", + re_format::format_uint(num_chunks) + ))?; + f.write_fmt(format_args!( + "total_size_bytes: {}\n", + re_format::format_bytes(total_size_bytes as _) + ))?; + f.write_fmt(format_args!( + "total_num_rows: {}\n", + re_format::format_uint(total_num_rows) + ))?; + + Ok(()) + } +} + +impl std::ops::Add for ChunkStoreChunkStats { + type Output = Self; + + #[inline] + fn add(self, rhs: Self) -> Self::Output { + Self { + num_chunks: self.num_chunks + rhs.num_chunks, + total_size_bytes: self.total_size_bytes + rhs.total_size_bytes, + total_num_rows: self.total_num_rows + rhs.total_num_rows, + } + } +} + +impl std::ops::AddAssign for ChunkStoreChunkStats { + #[inline] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs; + } +} + +impl std::ops::Sub for ChunkStoreChunkStats { + type Output = Self; + + #[inline] + fn sub(self, rhs: Self) -> Self::Output { + Self { + num_chunks: self.num_chunks - rhs.num_chunks, + total_size_bytes: self.total_size_bytes - rhs.total_size_bytes, + total_num_rows: self.total_num_rows - 
rhs.total_num_rows, + } + } +} + +impl std::ops::SubAssign for ChunkStoreChunkStats { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + *self = *self - rhs; + } +} + +impl ChunkStoreChunkStats { + #[inline] + pub fn from_chunk(chunk: &Arc) -> Self { + // NOTE: Do _NOT_ use `chunk.total_size_bytes` as it is sitting behind an Arc + // and would count as amortized (i.e. 0 bytes). + let size_bytes = ::total_size_bytes(&**chunk); + let num_rows = chunk.num_rows() as u64; + + Self { + num_chunks: 1, + total_size_bytes: size_bytes, + total_num_rows: num_rows, + } + } +} diff --git a/crates/re_chunk_store/src/store.rs b/crates/re_chunk_store/src/store.rs new file mode 100644 index 000000000000..d248bf29eb15 --- /dev/null +++ b/crates/re_chunk_store/src/store.rs @@ -0,0 +1,261 @@ +use std::collections::{BTreeMap, BTreeSet}; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; + +use arrow2::datatypes::DataType as ArrowDataType; +use nohash_hasher::IntMap; + +use re_chunk::{Chunk, ChunkId, RowId}; +use re_log_types::{EntityPath, StoreId, TimeInt, Timeline}; +use re_types_core::ComponentName; + +use crate::ChunkStoreChunkStats; + +// --- + +// TODO(cmc): empty for now but soon will contain compaction settings, so preemptively +// avoid breaking changes everywhere. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ChunkStoreConfig {} + +impl Default for ChunkStoreConfig { + #[inline] + fn default() -> Self { + Self::DEFAULT + } +} + +impl ChunkStoreConfig { + pub const DEFAULT: Self = Self {}; +} + +// --- + +pub type ChunkIdSet = BTreeSet; + +#[derive(Default, Debug, Clone)] +pub struct ChunkIdSetPerTime { + /// Keeps track of the longest interval being currently stored in the two maps below. + /// + /// This is used to bound the backwards linear walk when looking for overlapping chunks in + /// latest-at queries. + /// + /// See [`ChunkStore::latest_at_relevant_chunks`] implementation comments for more details. + pub(crate) max_interval_length: u64, + + pub(crate) per_start_time: BTreeMap, + pub(crate) per_end_time: BTreeMap, +} + +pub type ChunkIdSetPerTimePerComponent = BTreeMap; + +pub type ChunkIdSetPerTimePerComponentPerTimeline = + BTreeMap; + +pub type ChunkIdSetPerTimePerComponentPerTimelinePerEntity = + BTreeMap; + +pub type ChunkIdPerComponent = BTreeMap; + +pub type ChunkIdPerComponentPerEntity = BTreeMap; + +// --- + +/// Incremented on each edit. +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ChunkStoreGeneration { + insert_id: u64, + gc_id: u64, +} + +/// A complete chunk store: covers all timelines, all entities, everything. +/// +/// The chunk store _always_ works at the chunk level, whether it is for write & read queries or +/// garbage collection. It is completely oblivious to individual rows. +/// +/// Use the `Display` implementation for a detailed view of the internals. +#[derive(Debug)] +pub struct ChunkStore { + pub(crate) id: StoreId, + + /// The configuration of the chunk store (e.g. compaction settings). + pub(crate) config: ChunkStoreConfig, + + /// Keeps track of the _latest_ datatype information for all component types that have been written + /// to the store so far. + /// + /// See also [`Self::lookup_datatype`]. + // + // TODO(#1809): replace this with a centralized Arrow registry. + // TODO(cmc): this would become fairly problematic in a world where each chunk can use a + // different datatype for a given component. 
+    pub(crate) type_registry: IntMap<ComponentName, ArrowDataType>,
+
+    pub(crate) chunks_per_chunk_id: BTreeMap<ChunkId, Arc<Chunk>>,
+
+    /// All [`ChunkId`]s currently in the store, indexed by the smallest [`RowId`] in each of them.
+    ///
+    /// This is effectively all chunks in global data order. Used for garbage collection.
+    ///
+    /// This is a map of vecs instead of individual [`ChunkId`] in order to better support
+    /// duplicated [`RowId`]s.
+    pub(crate) chunk_ids_per_min_row_id: BTreeMap<RowId, Vec<ChunkId>>,
+
+    /// All temporal [`ChunkId`]s for all entities on all timelines.
+    ///
+    /// See also [`Self::static_chunk_ids_per_entity`].
+    pub(crate) temporal_chunk_ids_per_entity: ChunkIdSetPerTimePerComponentPerTimelinePerEntity,
+
+    /// Accumulated size statistics for all temporal [`Chunk`]s currently present in the store.
+    ///
+    /// This is too costly to be computed from scratch every frame, and is required by e.g. the GC.
+    pub(crate) temporal_chunks_stats: ChunkStoreChunkStats,
+
+    /// Static data. Never garbage collected.
+    ///
+    /// Static data unconditionally shadows temporal data at query time.
+    ///
+    /// Existing temporal data will not be removed. Events won't be fired.
+    pub(crate) static_chunk_ids_per_entity: ChunkIdPerComponentPerEntity,
+
+    /// Accumulated size statistics for all static [`Chunk`]s currently present in the store.
+    ///
+    /// This is too costly to be computed from scratch every frame, and is required by e.g. the GC.
+    pub(crate) static_chunks_stats: ChunkStoreChunkStats,
+
+    // pub(crate) static_tables: BTreeMap,
+    /// Monotonically increasing ID for insertions.
+    pub(crate) insert_id: u64,
+
+    /// Monotonically increasing ID for queries.
+    pub(crate) query_id: AtomicU64,
+
+    /// Monotonically increasing ID for GCs.
+    pub(crate) gc_id: u64,
+
+    /// Monotonically increasing ID for store events.
+ pub(crate) event_id: AtomicU64, +} + +impl Clone for ChunkStore { + #[inline] + fn clone(&self) -> Self { + Self { + id: self.id.clone(), + config: self.config.clone(), + type_registry: self.type_registry.clone(), + chunks_per_chunk_id: self.chunks_per_chunk_id.clone(), + chunk_ids_per_min_row_id: self.chunk_ids_per_min_row_id.clone(), + temporal_chunk_ids_per_entity: self.temporal_chunk_ids_per_entity.clone(), + temporal_chunks_stats: self.temporal_chunks_stats, + static_chunk_ids_per_entity: self.static_chunk_ids_per_entity.clone(), + static_chunks_stats: self.static_chunks_stats, + insert_id: Default::default(), + query_id: Default::default(), + gc_id: Default::default(), + event_id: Default::default(), + } + } +} + +impl std::fmt::Display for ChunkStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + id, + config, + type_registry: _, + chunks_per_chunk_id, + chunk_ids_per_min_row_id: chunk_id_per_min_row_id, + temporal_chunk_ids_per_entity: _, + temporal_chunks_stats, + static_chunk_ids_per_entity: _, + static_chunks_stats, + insert_id: _, + query_id: _, + gc_id: _, + event_id: _, + } = self; + + f.write_str("ChunkStore {\n")?; + + f.write_str(&indent::indent_all_by(4, format!("id: {id}\n")))?; + f.write_str(&indent::indent_all_by(4, format!("config: {config:?}\n")))?; + + f.write_str(&indent::indent_all_by(4, "stats: {\n"))?; + f.write_str(&indent::indent_all_by( + 8, + format!("{}", *static_chunks_stats + *temporal_chunks_stats), + ))?; + f.write_str(&indent::indent_all_by(4, "}\n"))?; + + f.write_str(&indent::indent_all_by(4, "chunks: [\n"))?; + for chunk_id in chunk_id_per_min_row_id.values().flatten() { + if let Some(chunk) = chunks_per_chunk_id.get(chunk_id) { + f.write_str(&indent::indent_all_by(8, format!("{chunk}\n")))?; + } else { + f.write_str(&indent::indent_all_by(8, "\n"))?; + } + } + f.write_str(&indent::indent_all_by(4, "]\n"))?; + + f.write_str("}")?; + + Ok(()) + } +} + +// --- + +impl ChunkStore { + #[inline] + pub fn new(id: StoreId, config: ChunkStoreConfig) -> Self { + Self { + id, + config, + type_registry: Default::default(), + chunk_ids_per_min_row_id: Default::default(), + chunks_per_chunk_id: Default::default(), + temporal_chunk_ids_per_entity: Default::default(), + temporal_chunks_stats: Default::default(), + static_chunk_ids_per_entity: Default::default(), + static_chunks_stats: Default::default(), + insert_id: 0, + query_id: AtomicU64::new(0), + gc_id: 0, + event_id: AtomicU64::new(0), + } + } + + #[inline] + pub fn id(&self) -> &StoreId { + &self.id + } + + /// Return the current [`ChunkStoreGeneration`]. This can be used to determine whether the + /// database has been modified since the last time it was queried. + #[inline] + pub fn generation(&self) -> ChunkStoreGeneration { + ChunkStoreGeneration { + insert_id: self.insert_id, + gc_id: self.gc_id, + } + } + + /// See [`ChunkStoreConfig`] for more information about configuration. + #[inline] + pub fn config(&self) -> &ChunkStoreConfig { + &self.config + } + + /// Iterate over all chunks in the store, in ascending [`ChunkId`] order. + #[inline] + pub fn iter_chunks(&self) -> impl Iterator> + '_ { + self.chunks_per_chunk_id.values() + } + + /// Lookup the _latest_ arrow [`ArrowDataType`] used by a specific [`re_types_core::Component`]. 
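Since `insert_chunk` bumps `insert_id` and `gc` bumps `gc_id`, comparing two `ChunkStoreGeneration`s is a cheap way to detect that the store changed, and the type registry can then be consulted for the latest Arrow datatype of a component. A small sketch (function name illustrative, APIs from this diff):

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId};
use re_chunk_store::ChunkStore;
use re_log_types::{build_frame_nr, example_components::MyPoint, EntityPath};
use re_types_core::Loggable as _;

fn did_store_change(store: &mut ChunkStore) -> anyhow::Result<()> {
    let generation_before = store.generation();

    let entity_path: EntityPath = "points".into();
    let chunk = Chunk::builder(entity_path)
        .with_component_batch(RowId::new(), [build_frame_nr(1)], &[MyPoint::new(0.0, 0.0)])
        .build()?;
    store.insert_chunk(&Arc::new(chunk))?;

    // Insertions bump `insert_id`, so caches keyed on the generation know to refresh.
    assert_ne!(generation_before, store.generation());

    // The type registry now remembers the Arrow datatype last used for `MyPoint`.
    assert!(store.lookup_datatype(&MyPoint::name()).is_some());

    Ok(())
}
```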
+ #[inline] + pub fn lookup_datatype(&self, component_name: &ComponentName) -> Option<&ArrowDataType> { + self.type_registry.get(component_name) + } +} diff --git a/crates/re_data_store/src/store_subscriber.rs b/crates/re_chunk_store/src/subscribers.rs similarity index 56% rename from crates/re_data_store/src/store_subscriber.rs rename to crates/re_chunk_store/src/subscribers.rs index f56264164bfe..1d375bd58912 100644 --- a/crates/re_data_store/src/store_subscriber.rs +++ b/crates/re_chunk_store/src/subscribers.rs @@ -1,18 +1,19 @@ use parking_lot::RwLock; -use crate::{DataStore, StoreEvent}; +use crate::{ChunkStore, ChunkStoreEvent}; // --- // TODO(cmc): Not sure why I need the extra Box here, RwLock should be `?Sized`. -type SharedStoreSubscriber = RwLock>; +type SharedStoreSubscriber = RwLock>; -/// A [`StoreSubscriber`] subscribes to atomic changes from all [`DataStore`]s through [`StoreEvent`]s. +/// A [`ChunkStoreSubscriber`] subscribes to atomic changes from all [`ChunkStore`]s +/// through [`ChunkStoreEvent`]s. /// -/// [`StoreSubscriber`]s can be used to build both secondary indices and trigger systems. +/// [`ChunkStoreSubscriber`]s can be used to build both secondary indices and trigger systems. // // TODO(#4204): StoreSubscriber should require SizeBytes so they can be part of memstats. -pub trait StoreSubscriber: std::any::Any + Send + Sync { +pub trait ChunkStoreSubscriber: std::any::Any + Send + Sync { /// Arbitrary name for the subscriber. /// /// Does not need to be unique. @@ -34,54 +35,54 @@ pub trait StoreSubscriber: std::any::Any + Send + Sync { /// ``` fn as_any_mut(&mut self) -> &mut dyn std::any::Any; - /// The core of this trait: get notified of changes happening in all [`DataStore`]s. + /// The core of this trait: get notified of changes happening in all [`ChunkStore`]s. /// - /// This will be called automatically by the [`DataStore`] itself if the subscriber has been - /// registered: [`DataStore::register_subscriber`]. - /// Or you might want to feed it [`StoreEvent`]s manually, depending on your use case. + /// This will be called automatically by the [`ChunkStore`] itself if the subscriber has been + /// registered: [`ChunkStore::register_subscriber`]. + /// Or you might want to feed it [`ChunkStoreEvent`]s manually, depending on your use case. /// /// ## Example /// /// ```ignore - /// fn on_events(&mut self, events: &[StoreEvent]) { - /// use re_data_store::StoreDiffKind; + /// fn on_events(&mut self, events: &[ChunkStoreEvent]) { + /// use re_chunk_store::ChunkStoreDiffKind; /// for event in events { /// match event.kind { - /// StoreDiffKind::Addition => println!("Row added: {}", event.row_id), - /// StoreDiffKind::Deletion => println!("Row removed: {}", event.row_id), + /// ChunkStoreDiffKind::Addition => println!("Row added: {}", event.row_id), + /// ChunkStoreDiffKind::Deletion => println!("Row removed: {}", event.row_id), /// } /// } /// } /// ``` - fn on_events(&mut self, events: &[StoreEvent]); + fn on_events(&mut self, events: &[ChunkStoreEvent]); } -/// All registered [`StoreSubscriber`]s. +/// All registered [`ChunkStoreSubscriber`]s. static SUBSCRIBERS: once_cell::sync::Lazy>> = once_cell::sync::Lazy::new(|| RwLock::new(Vec::new())); #[derive(Debug, Clone, Copy)] -pub struct StoreSubscriberHandle(u32); +pub struct ChunkStoreSubscriberHandle(u32); -impl DataStore { - /// Registers a [`StoreSubscriber`] so it gets automatically notified when data gets added and/or - /// removed to/from a [`DataStore`]. 
+impl ChunkStore { + /// Registers a [`ChunkStoreSubscriber`] so it gets automatically notified when data gets added and/or + /// removed to/from a [`ChunkStore`]. /// - /// Refer to [`StoreEvent`]'s documentation for more information about these events. + /// Refer to [`ChunkStoreEvent`]'s documentation for more information about these events. /// /// ## Scope /// - /// Registered [`StoreSubscriber`]s are global scope: they get notified of all events from all - /// existing [`DataStore`]s, including [`DataStore`]s created after the subscriber was registered. + /// Registered [`ChunkStoreSubscriber`]s are global scope: they get notified of all events from all + /// existing [`ChunkStore`]s, including [`ChunkStore`]s created after the subscriber was registered. /// - /// Use [`StoreEvent::store_id`] to identify the source of an event. + /// Use [`ChunkStoreEvent::store_id`] to identify the source of an event. /// /// ## Late registration /// /// Subscribers must be registered before a store gets created to guarantee that no events /// were missed. /// - /// [`StoreEvent::event_id`] can be used to identify missing events. + /// [`ChunkStoreEvent::event_id`] can be used to identify missing events. /// /// ## Ordering /// @@ -92,17 +93,19 @@ impl DataStore { /// subscriber. // // TODO(cmc): send a compacted snapshot to late registerers for bootstrapping - pub fn register_subscriber(subscriber: Box) -> StoreSubscriberHandle { + pub fn register_subscriber( + subscriber: Box, + ) -> ChunkStoreSubscriberHandle { let mut subscribers = SUBSCRIBERS.write(); subscribers.push(RwLock::new(subscriber)); - StoreSubscriberHandle(subscribers.len() as u32 - 1) + ChunkStoreSubscriberHandle(subscribers.len() as u32 - 1) } /// Passes a reference to the downcasted subscriber to the given `FnMut` callback. /// /// Returns `None` if the subscriber doesn't exist or downcasting failed. - pub fn with_subscriber T>( - StoreSubscriberHandle(handle): StoreSubscriberHandle, + pub fn with_subscriber T>( + ChunkStoreSubscriberHandle(handle): ChunkStoreSubscriberHandle, mut f: F, ) -> Option { let subscribers = SUBSCRIBERS.read(); @@ -115,8 +118,8 @@ impl DataStore { /// Passes a reference to the downcasted subscriber to the given `FnOnce` callback. /// /// Returns `None` if the subscriber doesn't exist or downcasting failed. - pub fn with_subscriber_once T>( - StoreSubscriberHandle(handle): StoreSubscriberHandle, + pub fn with_subscriber_once T>( + ChunkStoreSubscriberHandle(handle): ChunkStoreSubscriberHandle, f: F, ) -> Option { let subscribers = SUBSCRIBERS.read(); @@ -129,8 +132,8 @@ impl DataStore { /// Passes a mutable reference to the downcasted subscriber to the given callback. /// /// Returns `None` if the subscriber doesn't exist or downcasting failed. - pub fn with_subscriber_mut T>( - StoreSubscriberHandle(handle): StoreSubscriberHandle, + pub fn with_subscriber_mut T>( + ChunkStoreSubscriberHandle(handle): ChunkStoreSubscriberHandle, mut f: F, ) -> Option { let subscribers = SUBSCRIBERS.read(); @@ -140,8 +143,8 @@ impl DataStore { }) } - /// Called by [`DataStore`]'s mutating methods to notify subscriber subscribers of upcoming events. - pub(crate) fn on_events(events: &[StoreEvent]) { + /// Called by [`ChunkStore`]'s mutating methods to notify subscriber subscribers of upcoming events. + pub(crate) fn on_events(events: &[ChunkStoreEvent]) { re_tracing::profile_function!(); let subscribers = SUBSCRIBERS.read(); // TODO(cmc): might want to parallelize at some point. 
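For reference, registering a subscriber and reading it back through its handle looks roughly like this. This is a sketch, not part of the patch: the `EventCounter` type is made up, and the exact generic parameter order of `with_subscriber` is assumed from the usage in the tests below.

```rust
use re_chunk_store::{ChunkStore, ChunkStoreEvent, ChunkStoreSubscriber};

/// Counts how many store events have been seen, across all stores.
#[derive(Default, Debug)]
struct EventCounter {
    num_events: usize,
}

impl ChunkStoreSubscriber for EventCounter {
    fn name(&self) -> String {
        "example.EventCounter".to_owned()
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }

    fn on_events(&mut self, events: &[ChunkStoreEvent]) {
        self.num_events += events.len();
    }
}

fn count_events() -> Option<usize> {
    // Registration is global: every `ChunkStore`, present or future, will notify this subscriber.
    let handle = ChunkStore::register_subscriber(Box::new(EventCounter::default()));

    // … insert chunks into one or more stores here …

    // The handle downcasts back to the concrete subscriber type.
    ChunkStore::with_subscriber::<EventCounter, _, _>(handle, |counter| counter.num_events)
}
```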
@@ -153,22 +156,25 @@ impl DataStore { #[cfg(test)] mod tests { + use std::sync::Arc; + use ahash::HashSet; + use re_chunk::{Chunk, RowId}; use re_log_types::{ example_components::{MyColor, MyIndex, MyPoint}, - DataRow, RowId, StoreId, TimePoint, Timeline, + StoreId, TimePoint, Timeline, }; - use crate::{DataStore, GarbageCollectionOptions, StoreSubscriber}; + use crate::{ChunkStore, ChunkStoreSubscriber, GarbageCollectionOptions}; use super::*; - /// A simple [`StoreSubscriber`] for test purposes that just accumulates [`StoreEvent`]s. + /// A simple [`ChunkStoreSubscriber`] for test purposes that just accumulates [`ChunkStoreEvent`]s. #[derive(Debug)] struct AllEvents { store_ids: HashSet, - events: Vec, + events: Vec, } impl AllEvents { @@ -180,7 +186,7 @@ mod tests { } } - impl StoreSubscriber for AllEvents { + impl ChunkStoreSubscriber for AllEvents { fn name(&self) -> String { "rerun.testing.store_subscribers.AllEvents".into() } @@ -193,7 +199,7 @@ mod tests { self } - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { self.events.extend( events .iter() @@ -206,76 +212,79 @@ mod tests { #[test] fn store_subscriber() -> anyhow::Result<()> { - let mut store1 = DataStore::new( + let mut store1 = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); - let mut store2 = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); let mut expected_events = Vec::new(); - let view = AllEvents::new([store1.id().clone(), store2.id().clone()]); - let view_handle = DataStore::register_subscriber(Box::new(view)); + let view = AllEvents::new([store1.id().clone(), store.id().clone()]); + let view_handle = ChunkStore::register_subscriber(Box::new(view)); let timeline_frame = Timeline::new_sequence("frame"); let timeline_other = Timeline::new_temporal("other"); let timeline_yet_another = Timeline::new_sequence("yet_another"); - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::from_iter([ - (timeline_frame, 42), // - (timeline_other, 666), // - (timeline_yet_another, 1), // - ]), - "entity_a".into(), - [&MyIndex::from_iter(0..10) as _], - )?; + let chunk = Chunk::builder("entity_a".into()) + .with_component_batch( + RowId::new(), + TimePoint::from_iter([ + (timeline_frame, 42), // + (timeline_other, 666), // + (timeline_yet_another, 1), // + ]), + &MyIndex::from_iter(0..10), + ) + .build()?; - expected_events.extend(store1.insert_row(&row)); + expected_events.extend(store1.insert_chunk(&Arc::new(chunk))?); - let row = { + let chunk = { let num_instances = 3; let points: Vec<_> = (0..num_instances) .map(|i| MyPoint::new(0.0, i as f32)) .collect(); let colors = vec![MyColor::from(0xFF0000FF)]; - DataRow::from_component_batches( - RowId::new(), - TimePoint::from_iter([ - (timeline_frame, 42), // - (timeline_yet_another, 1), // - ]), - "entity_b".into(), - [&points as _, &colors as _], - )? + Chunk::builder("entity_b".into()) + .with_component_batches( + RowId::new(), + TimePoint::from_iter([ + (timeline_frame, 42), // + (timeline_yet_another, 1), // + ]), + [&points as _, &colors as _], + ) + .build()? 
}; - expected_events.extend(store2.insert_row(&row)); + expected_events.extend(store.insert_chunk(&Arc::new(chunk))?); - let row = { + let chunk = { let num_instances = 6; let colors = vec![MyColor::from(0x00DD00FF); num_instances]; - DataRow::from_component_batches( - RowId::new(), - TimePoint::default(), - "entity_b".into(), - [ - &MyIndex::from_iter(0..num_instances as _) as _, - &colors as _, - ], - )? + Chunk::builder("entity_b".into()) + .with_component_batches( + RowId::new(), + TimePoint::default(), + [ + &MyIndex::from_iter(0..num_instances as _) as _, + &colors as _, + ], + ) + .build()? }; - expected_events.extend(store1.insert_row(&row)); + expected_events.extend(store1.insert_chunk(&Arc::new(chunk))?); expected_events.extend(store1.gc(&GarbageCollectionOptions::gc_everything()).0); - expected_events.extend(store2.gc(&GarbageCollectionOptions::gc_everything()).0); + expected_events.extend(store.gc(&GarbageCollectionOptions::gc_everything()).0); - DataStore::with_subscriber::(view_handle, |got| { + ChunkStore::with_subscriber::(view_handle, |got| { similar_asserts::assert_eq!(expected_events.len(), got.events.len()); similar_asserts::assert_eq!(expected_events, got.events); }); diff --git a/crates/re_chunk_store/src/writes.rs b/crates/re_chunk_store/src/writes.rs new file mode 100644 index 000000000000..4cbc55c47395 --- /dev/null +++ b/crates/re_chunk_store/src/writes.rs @@ -0,0 +1,176 @@ +use std::sync::Arc; + +use arrow2::array::{Array as _, ListArray as ArrowListArray}; + +use re_chunk::{Chunk, RowId}; + +use crate::{ + ChunkStore, ChunkStoreChunkStats, ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreError, + ChunkStoreEvent, ChunkStoreResult, +}; + +// Used all over in docstrings. +#[allow(unused_imports)] +use crate::ChunkId; + +// --- + +impl ChunkStore { + /// Inserts a [`Chunk`] in the store. + /// + /// Iff the store was modified, all registered subscribers will be notified and the + /// resulting [`ChunkStoreEvent`] will be returned, or `None` otherwise. + /// + /// * Trying to insert an unsorted chunk ([`Chunk::is_sorted`]) will fail with an error. + /// * Inserting a duplicated [`ChunkId`] will result in a no-op. + /// * Inserting an empty [`Chunk`] will result in a no-op. + pub fn insert_chunk( + &mut self, + chunk: &Arc, + ) -> ChunkStoreResult> { + if self.chunks_per_chunk_id.contains_key(&chunk.id()) { + // We assume that chunk IDs are unique, and that reinserting a chunk has no effect. + re_log::warn_once!( + "Chunk #{} was inserted more than once (this has no effect)", + chunk.id() + ); + return Ok(None); + } + + if !chunk.is_sorted() { + return Err(ChunkStoreError::UnsortedChunk); + } + + let Some(row_id_range) = chunk.row_id_range() else { + return Ok(None); + }; + + re_tracing::profile_function!(format!("{}", row_id_range.0)); + + self.insert_id += 1; + + self.chunks_per_chunk_id.insert(chunk.id(), chunk.clone()); + self.chunk_ids_per_min_row_id + .entry(row_id_range.0) + .or_default() + .push(chunk.id()); + + if chunk.is_static() { + // Static data: make sure to keep the most recent chunk available for each component column. 
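The insertion contract spelled out in `insert_chunk`'s docstring above (unsorted chunks are rejected, duplicated `ChunkId`s and empty chunks are no-ops) can be exercised directly. A minimal sketch using only APIs from this diff; the helper name is illustrative:

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId};
use re_chunk_store::{ChunkStore, ChunkStoreError};
use re_log_types::{build_frame_nr, example_components::MyPoint, EntityPath};

fn insertion_contract(store: &mut ChunkStore) -> anyhow::Result<()> {
    let entity_path: EntityPath = "points".into();

    let chunk = Arc::new(
        Chunk::builder(entity_path.clone())
            .with_component_batch(RowId::new(), [build_frame_nr(1)], &[MyPoint::new(1.0, 1.0)])
            .build()?,
    );

    // First insertion: the store reports an addition event.
    assert!(store.insert_chunk(&chunk)?.is_some());

    // Re-inserting the same `ChunkId` is a no-op and returns `Ok(None)`.
    assert!(store.insert_chunk(&chunk)?.is_none());

    // Chunks must be sorted by `RowId`; unsorted ones are rejected outright.
    let row_id1 = RowId::new();
    let row_id2 = row_id1.next();
    let unsorted = Chunk::builder(entity_path)
        .with_component_batch(row_id2, [build_frame_nr(2)], &[MyPoint::new(2.0, 2.0)])
        .with_component_batch(row_id1, [build_frame_nr(3)], &[MyPoint::new(3.0, 3.0)])
        .build()?;
    assert!(matches!(
        store.insert_chunk(&Arc::new(unsorted)),
        Err(ChunkStoreError::UnsortedChunk)
    ));

    Ok(())
}
```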
+ + let row_id_range_per_component = chunk.row_id_range_per_component(); + + for (&component_name, list_array) in chunk.components() { + let is_empty = list_array + .validity() + .map_or(false, |validity| validity.is_empty()); + if is_empty { + continue; + } + + let Some((_row_id_min, row_id_max)) = + row_id_range_per_component.get(&component_name) + else { + continue; + }; + + self.static_chunk_ids_per_entity + .entry(chunk.entity_path().clone()) + .or_default() + .entry(component_name) + .and_modify(|cur_chunk_id| { + // NOTE: When attempting to overwrite static data, the chunk with the most + // recent data within -- according to RowId -- wins. + + let cur_row_id_max = self.chunks_per_chunk_id.get(cur_chunk_id).map_or( + RowId::ZERO, + |chunk| { + chunk + .row_id_range_per_component() + .get(&component_name) + .map_or(RowId::ZERO, |(_, row_id_max)| *row_id_max) + }, + ); + if *row_id_max > cur_row_id_max { + *cur_chunk_id = chunk.id(); + } + }) + .or_insert_with(|| chunk.id()); + } + + self.static_chunks_stats += ChunkStoreChunkStats::from_chunk(chunk); + } else { + // Temporal data: just index the chunk on every dimension of interest. + + let temporal_chunk_ids_per_timeline = self + .temporal_chunk_ids_per_entity + .entry(chunk.entity_path().clone()) + .or_default(); + + // NOTE: We must make sure to use the time range of each specific component column + // here, or we open ourselves to nasty edge cases. + // + // See the `latest_at_sparse_component_edge_case` test. + for (timeline, time_range_per_component) in chunk.time_range_per_component() { + let temporal_chunk_ids_per_component = + temporal_chunk_ids_per_timeline.entry(timeline).or_default(); + + for (component_name, time_range) in time_range_per_component { + let temporal_chunk_ids_per_time = temporal_chunk_ids_per_component + .entry(component_name) + .or_default(); + + // See `ChunkIdSetPerTime::max_interval_length`'s documentation. 
+ temporal_chunk_ids_per_time.max_interval_length = u64::max( + temporal_chunk_ids_per_time.max_interval_length, + time_range.abs_length(), + ); + + temporal_chunk_ids_per_time + .per_start_time + .entry(time_range.min()) + .or_default() + .insert(chunk.id()); + temporal_chunk_ids_per_time + .per_end_time + .entry(time_range.max()) + .or_default() + .insert(chunk.id()); + } + } + + self.temporal_chunks_stats += ChunkStoreChunkStats::from_chunk(chunk); + } + + for (&component_name, list_array) in chunk.components() { + self.type_registry.insert( + component_name, + ArrowListArray::::get_child_type(list_array.data_type()).clone(), + ); + } + + let event = ChunkStoreEvent { + store_id: self.id.clone(), + store_generation: self.generation(), + event_id: self + .event_id + .fetch_add(1, std::sync::atomic::Ordering::Relaxed), + diff: ChunkStoreDiff::addition(Arc::clone(chunk)), + }; + + { + let events = &[event.clone()]; + + if cfg!(debug_assertions) { + let any_event_other_than_addition = events + .iter() + .any(|e| e.kind != ChunkStoreDiffKind::Addition); + assert!(!any_event_other_than_addition); + } + + Self::on_events(events); + } + + Ok(Some(event)) + } +} diff --git a/crates/re_chunk_store/tests/correctness.rs b/crates/re_chunk_store/tests/correctness.rs new file mode 100644 index 000000000000..4a06e65e4550 --- /dev/null +++ b/crates/re_chunk_store/tests/correctness.rs @@ -0,0 +1,436 @@ +// https://github.com/rust-lang/rust-clippy/issues/10011 +#![cfg(test)] + +use std::sync::Arc; + +use itertools::Itertools as _; +use re_chunk::{Chunk, ChunkId, RowId}; +use re_chunk_store::{ChunkStore, ChunkStoreError, LatestAtQuery}; +use re_log_types::example_components::{MyIndex, MyPoint}; +use re_log_types::{ + build_frame_nr, build_log_time, Duration, EntityPath, Time, TimeInt, TimePoint, TimeType, + Timeline, +}; +use re_types_core::Loggable as _; + +// --- + +fn query_latest_component( + store: &ChunkStore, + entity_path: &EntityPath, + query: &LatestAtQuery, +) -> Option<(TimeInt, RowId, C)> { + re_tracing::profile_function!(); + + let (data_time, row_id, array) = store + .latest_at_relevant_chunks(query, entity_path, C::name()) + .into_iter() + .flat_map(|chunk| { + chunk + .latest_at(query, C::name()) + .iter_rows(&query.timeline(), &C::name()) + .collect_vec() + }) + .max_by_key(|(data_time, row_id, _)| (*data_time, *row_id)) + .and_then(|(data_time, row_id, array)| array.map(|array| (data_time, row_id, array)))?; + + let value = C::from_arrow(&*array).ok()?.first()?.clone(); + + Some((data_time, row_id, value)) +} + +// --- + +#[test] +fn row_id_ordering_semantics() -> anyhow::Result<()> { + let entity_path: EntityPath = "some_entity".into(); + + let timeline_frame = Timeline::new_sequence("frame"); + let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); + + let point1 = MyPoint::new(1.0, 1.0); + let point2 = MyPoint::new(2.0, 2.0); + + // * Insert `point1` at frame #10 with a random `RowId`. + // * Insert `point2` at frame #10 with a random `RowId`. + // * Query at frame #11 and make sure we get `point2` because random `RowId`s are monotonically + // increasing. 
+ { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(RowId::new(), timepoint.clone(), &[point1]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(RowId::new(), timepoint.clone(), &[point2]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + { + let query = LatestAtQuery::new(timeline_frame, 11); + let (_, _, got_point) = + query_latest_component::(&store, &entity_path, &query).unwrap(); + similar_asserts::assert_eq!(point2, got_point); + } + } + + // * Insert `point1` at frame #10 with a random `RowId`. + // * Insert `point2` at frame #10 with that same `RowId`. + // * Nothing happens, as re-using `RowId`s is simply UB. + { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let row_id = RowId::new(); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id, timepoint.clone(), &[point1]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id, timepoint.clone(), &[point2]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + } + + // * Insert `point1` at frame #10 with a random `RowId`. + // * Insert `point2` at frame #10 using `point1`'s `RowId`, decremented by one. + // * Query at frame #11 and make sure we get `point1` because of intra-timestamp tie-breaks. + { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let row_id1 = RowId::new(); + let row_id2 = row_id1.next(); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id2, timepoint.clone(), &[point1]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id1, timepoint.clone(), &[point2]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + { + let query = LatestAtQuery::new(timeline_frame, 11); + let (_, _, got_point) = + query_latest_component::(&store, &entity_path, &query).unwrap(); + similar_asserts::assert_eq!(point1, got_point); + } + } + + // Static data has last-write-wins semantics, as defined by RowId-ordering. + // Timeless is RowId-ordered too! + // + // * Insert static `point1` with a random `RowId`. + // * Insert static `point2` using `point1`'s `RowId`, decremented by one. + // * Query and make sure we get `point1` because of last-write-wins semantics. 
+ { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let row_id1 = RowId::new(); + let row_id2 = row_id1.next(); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id2, TimePoint::default(), &[point1]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id1, TimePoint::default(), &[point2]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + { + let query = LatestAtQuery::new(Timeline::new_temporal("doesnt_matter"), TimeInt::MAX); + let (_, _, got_point) = + query_latest_component::(&store, &entity_path, &query).unwrap(); + similar_asserts::assert_eq!(point1, got_point); + } + } + + // * Insert `point1` at frame #10 with a random `ChunkId` & `RowId`. + // * Insert `point2` at frame #10 using `point1`'s `ChunkId` & `RowId`. + // * Query at frame #11 and make sure we get `point1` because chunks are considered idempotent, + // and therefore the second write does nothing. + { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let chunk_id = ChunkId::new(); + let row_id = RowId::new(); + + let chunk = Chunk::builder_with_id(chunk_id, entity_path.clone()) + .with_component_batch(row_id, timepoint.clone(), &[point1]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let chunk = Chunk::builder_with_id(chunk_id, entity_path.clone()) + .with_component_batch(row_id, timepoint.clone(), &[point2]) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + { + let query = LatestAtQuery::new(timeline_frame, 11); + let (_, _, got_point) = + query_latest_component::(&store, &entity_path, &query).unwrap(); + similar_asserts::assert_eq!(point1, got_point); + } + } + + Ok(()) +} + +// --- + +#[test] +fn write_errors() -> anyhow::Result<()> { + re_log::setup_logging(); + + let entity_path = EntityPath::from("this/that"); + + { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let row_id1 = RowId::new(); + let row_id2 = row_id1.next(); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + row_id2, + [build_frame_nr(1), build_log_time(Time::now())], + &MyPoint::from_iter(0..1), + ) + .with_component_batch( + row_id1, + [build_frame_nr(2), build_log_time(Time::now())], + &MyPoint::from_iter(0..1), + ) + .build()?; + + assert!(matches!( + store.insert_chunk(&Arc::new(chunk)), + Err(ChunkStoreError::UnsortedChunk), + )); + + Ok(()) + } +} + +// --- + +#[test] +fn latest_at_emptiness_edge_cases() -> anyhow::Result<()> { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let entity_path = EntityPath::from("this/that"); + let now = Time::now(); + let now_minus_1s = now - Duration::from_secs(1.0); + let now_minus_1s_nanos = now_minus_1s.nanos_since_epoch(); + let frame39 = 39; + let frame40 = 40; + let num_instances = 3; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + [build_log_time(now), build_frame_nr(frame40)], + &MyIndex::from_iter(0..num_instances), + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let timeline_wrong_name = Timeline::new("lag_time", TimeType::Time); + let timeline_wrong_kind = Timeline::new("log_time", TimeType::Sequence); + let timeline_frame_nr = 
Timeline::new("frame_nr", TimeType::Sequence); + let timeline_log_time = Timeline::log_time(); + + // empty frame_nr + { + let chunks = store.latest_at_relevant_chunks( + &LatestAtQuery::new(timeline_frame_nr, frame39), + &entity_path, + MyIndex::name(), + ); + assert!(chunks.is_empty()); + } + + // empty log_time + { + let chunks = store.latest_at_relevant_chunks( + &LatestAtQuery::new(timeline_log_time, now_minus_1s_nanos), + &entity_path, + MyIndex::name(), + ); + assert!(chunks.is_empty()); + } + + // wrong entity path + { + let chunks = store.latest_at_relevant_chunks( + &LatestAtQuery::new(timeline_frame_nr, frame40), + &EntityPath::from("does/not/exist"), + MyIndex::name(), + ); + assert!(chunks.is_empty()); + } + + // wrong timeline name + { + let chunks = store.latest_at_relevant_chunks( + &LatestAtQuery::new(timeline_wrong_name, frame40), + &EntityPath::from("does/not/exist"), + MyIndex::name(), + ); + assert!(chunks.is_empty()); + } + + // wrong timeline kind + { + let chunks = store.latest_at_relevant_chunks( + &LatestAtQuery::new(timeline_wrong_kind, frame40), + &EntityPath::from("does/not/exist"), + MyIndex::name(), + ); + assert!(chunks.is_empty()); + } + + Ok(()) +} + +// --- + +#[test] +fn entity_min_time_correct() -> anyhow::Result<()> { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let entity_path = EntityPath::from("this/that"); + let wrong_entity_path = EntityPath::from("this/that/other"); + + let point = MyPoint::new(1.0, 1.0); + let timeline_wrong_name = Timeline::new("lag_time", TimeType::Time); + let timeline_wrong_kind = Timeline::new("log_time", TimeType::Sequence); + let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); + let timeline_log_time = Timeline::log_time(); + + let now = Time::now(); + let now_plus_one = now + Duration::from_secs(1.0); + let now_minus_one = now - Duration::from_secs(1.0); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + TimePoint::default() + .with(timeline_log_time, now) + .with(timeline_frame_nr, 42), + &[point], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + assert!(store + .entity_min_time(&timeline_wrong_name, &entity_path) + .is_none()); + assert!(store + .entity_min_time(&timeline_wrong_kind, &entity_path) + .is_none()); + assert_eq!( + store.entity_min_time(&timeline_frame_nr, &entity_path), + Some(TimeInt::new_temporal(42)) + ); + assert_eq!( + store.entity_min_time(&timeline_log_time, &entity_path), + Some(TimeInt::try_from(now).unwrap()) + ); + assert!(store + .entity_min_time(&timeline_frame_nr, &wrong_entity_path) + .is_none()); + + // insert row in the future, these shouldn't be visible + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + TimePoint::default() + .with(timeline_log_time, now_plus_one) + .with(timeline_frame_nr, 54), + &[point], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + assert!(store + .entity_min_time(&timeline_wrong_name, &entity_path) + .is_none()); + assert!(store + .entity_min_time(&timeline_wrong_kind, &entity_path) + .is_none()); + assert_eq!( + store.entity_min_time(&timeline_frame_nr, &entity_path), + Some(TimeInt::new_temporal(42)) + ); + assert_eq!( + store.entity_min_time(&timeline_log_time, &entity_path), + Some(TimeInt::try_from(now).unwrap()) + ); + assert!(store + .entity_min_time(&timeline_frame_nr, &wrong_entity_path) + .is_none()); + + // insert row in the past, 
these should be visible + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + TimePoint::default() + .with(timeline_log_time, now_minus_one) + .with(timeline_frame_nr, 32), + &[point], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + assert!(store + .entity_min_time(&timeline_wrong_name, &entity_path) + .is_none()); + assert!(store + .entity_min_time(&timeline_wrong_kind, &entity_path) + .is_none()); + assert_eq!( + store.entity_min_time(&timeline_frame_nr, &entity_path), + Some(TimeInt::new_temporal(32)) + ); + assert_eq!( + store.entity_min_time(&timeline_log_time, &entity_path), + Some(TimeInt::try_from(now_minus_one).unwrap()) + ); + assert!(store + .entity_min_time(&timeline_frame_nr, &wrong_entity_path) + .is_none()); + + Ok(()) +} diff --git a/crates/re_chunk_store/tests/gc.rs b/crates/re_chunk_store/tests/gc.rs new file mode 100644 index 000000000000..f819f0e5f677 --- /dev/null +++ b/crates/re_chunk_store/tests/gc.rs @@ -0,0 +1,326 @@ +use std::sync::Arc; + +use arrow2::array::Array as ArrowArray; +use itertools::Itertools as _; +use rand::Rng as _; + +use re_chunk::{Chunk, ChunkId, ComponentName, LatestAtQuery, RowId, TimeInt}; +use re_chunk_store::{ChunkStore, GarbageCollectionOptions, GarbageCollectionTarget}; +use re_log_types::{ + build_frame_nr, + example_components::{MyColor, MyIndex, MyPoint}, + EntityPath, TimeType, Timeline, +}; +use re_types::testing::build_some_large_structs; +use re_types_core::Loggable as _; + +// --- + +fn query_latest_array( + store: &ChunkStore, + entity_path: &EntityPath, + component_name: ComponentName, + query: &LatestAtQuery, +) -> Option<(TimeInt, RowId, Box)> { + re_tracing::profile_function!(); + + let (data_time, row_id, array) = store + .latest_at_relevant_chunks(query, entity_path, component_name) + .into_iter() + .flat_map(|chunk| { + chunk + .latest_at(query, component_name) + .iter_rows(&query.timeline(), &component_name) + .collect_vec() + }) + .max_by_key(|(data_time, row_id, _)| (*data_time, *row_id)) + .and_then(|(data_time, row_id, array)| array.map(|array| (data_time, row_id, array)))?; + + Some((data_time, row_id, array)) +} + +// --- + +#[test] +fn simple() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut rng = rand::thread_rng(); + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + for _ in 0..2 { + let num_ents = 10; + for i in 0..num_ents { + let entity_path = EntityPath::from(format!("this/that/{i}")); + + let num_frames = rng.gen_range(0..=100); + let frames = (0..num_frames).filter(|_| rand::thread_rng().gen()); + for frame_nr in frames { + let num_instances = rng.gen_range(0..=1_000); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + [build_frame_nr(frame_nr)], + &build_some_large_structs(num_instances), + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + } + } + + let stats_before = store.stats(); + + let (_store_events, stats_diff) = store.gc(&GarbageCollectionOptions { + target: GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0), + protect_latest: 0, + dont_protect_components: Default::default(), + dont_protect_timelines: Default::default(), + time_budget: std::time::Duration::MAX, + }); + + // NOTE: only temporal data gets purged! 
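That note is why the assertion just below only expects roughly a third of the total size to disappear. For reference, the two garbage-collection configurations exercised by these tests, written out as sketches; the field values are copied verbatim from the calls in `simple()` above and in `simple_static()` and `protected()` below.

```rust
use re_chunk_store::{GarbageCollectionOptions, GarbageCollectionTarget};

// As used by `simple()`: drop at least a third of the store. Only temporal
// data is eligible here; static rows survive.
fn drop_a_third() -> GarbageCollectionOptions {
    GarbageCollectionOptions {
        target: GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0),
        protect_latest: 0,
        dont_protect_components: Default::default(),
        dont_protect_timelines: Default::default(),
        time_budget: std::time::Duration::MAX,
    }
}

// As used by `simple_static()` and `protected()` below: drop everything that
// is droppable, but keep the most recent row of every component alive.
fn drop_everything_but_latest() -> GarbageCollectionOptions {
    GarbageCollectionOptions {
        target: GarbageCollectionTarget::Everything,
        protect_latest: 1,
        dont_protect_components: Default::default(),
        dont_protect_timelines: Default::default(),
        time_budget: std::time::Duration::MAX,
    }
}
```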
+ let num_bytes_dropped = stats_diff.total().total_size_bytes as f64; + let num_bytes_dropped_expected_min = + stats_before.total().total_size_bytes as f64 * 0.95 / 3.0; + let num_bytes_dropped_expected_max = + stats_before.total().total_size_bytes as f64 * 1.05 / 3.0; + + assert!( + num_bytes_dropped_expected_min <= num_bytes_dropped + && num_bytes_dropped <= num_bytes_dropped_expected_max, + "{} <= {} <= {}", + re_format::format_bytes(num_bytes_dropped_expected_min), + re_format::format_bytes(num_bytes_dropped), + re_format::format_bytes(num_bytes_dropped_expected_max), + ); + } + + Ok(()) +} + +#[test] +fn simple_static() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let entity_path = EntityPath::from("this/that"); + + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + let frame3 = TimeInt::new_temporal(3); + let frame4 = TimeInt::new_temporal(4); + + let row_id1 = RowId::new(); + let (indices1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); + let chunk1 = Arc::new( + Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id1, + [build_frame_nr(frame1)], + [&indices1 as _, &colors1 as _], + ) + .build()?, + ); + + let row_id2 = RowId::new(); + let points2 = MyPoint::from_iter(0..3); + let chunk2 = Arc::new( + Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id2, + [build_frame_nr(frame2)], + [&indices1 as _, &points2 as _], + ) + .build()?, + ); + + let points3 = MyPoint::from_iter(0..10); + let chunk3 = Chunk::builder(entity_path.clone()) + .with_component_batches(RowId::new(), [build_frame_nr(frame3)], [&points3 as _]) + .build()?; + + let colors4 = MyColor::from_iter(0..5); + let chunk4 = Chunk::builder(entity_path.clone()) + .with_component_batches(RowId::new(), [build_frame_nr(frame4)], [&colors4 as _]) + .build()?; + + store.insert_chunk(&chunk1)?; + store.insert_chunk(&chunk2)?; + store.insert_chunk(&Arc::new(chunk3))?; + store.insert_chunk(&Arc::new(chunk4))?; + + // Re-insert `chunk1` and `chunk2` as static data as well + let row_id1_static = RowId::new(); + let chunk1_static = chunk1 + .clone_as(ChunkId::new(), row_id1_static) + .into_static(); + let row_id2_static = RowId::new(); + let chunk2_static = chunk2 + .clone_as(ChunkId::new(), row_id2_static) + .into_static(); + store.insert_chunk(&Arc::new(chunk1_static))?; + store.insert_chunk(&Arc::new(chunk2_static))?; + + store.gc(&GarbageCollectionOptions { + target: GarbageCollectionTarget::Everything, + protect_latest: 1, + dont_protect_components: Default::default(), + dont_protect_timelines: Default::default(), + time_budget: std::time::Duration::MAX, + }); + + let assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, RowId)]| { + let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); + + for (component_name, expected_row_id) in rows { + let (_data_time, row_id, _array) = query_latest_array( + &store, + &entity_path, + *component_name, + &LatestAtQuery::new(timeline_frame_nr, frame_nr), + ) + .unwrap(); + + assert_eq!(*expected_row_id, row_id, "{component_name}"); + } + }; + + eprintln!("{store}"); + + assert_latest_components( + TimeInt::MAX, + &[ + (MyIndex::name(), row_id2_static), + (MyColor::name(), row_id1_static), + (MyPoint::name(), row_id2_static), + ], + ); + + Ok(()) +} + +#[test] +fn protected() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut 
store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + Default::default(), + ); + + let entity_path = EntityPath::from("this/that"); + + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + let frame3 = TimeInt::new_temporal(3); + let frame4 = TimeInt::new_temporal(4); + + let row_id1 = RowId::new(); + let (indices1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); + let chunk1 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id1, + [build_frame_nr(frame1)], + [&indices1 as _, &colors1 as _], + ) + .build()?; + + let row_id2 = RowId::new(); + let points2 = MyPoint::from_iter(0..3); + let chunk2 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id2, + [build_frame_nr(frame2)], + [&indices1 as _, &points2 as _], + ) + .build()?; + + let row_id3 = RowId::new(); + let points3 = MyPoint::from_iter(0..10); + let chunk3 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id3, [build_frame_nr(frame3)], [&points3 as _]) + .build()?; + + let row_id4 = RowId::new(); + let colors4 = MyColor::from_iter(0..5); + let chunk4 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id4, [build_frame_nr(frame4)], [&colors4 as _]) + .build()?; + + store.insert_chunk(&Arc::new(chunk1))?; + store.insert_chunk(&Arc::new(chunk2))?; + store.insert_chunk(&Arc::new(chunk3))?; + store.insert_chunk(&Arc::new(chunk4))?; + + store.gc(&GarbageCollectionOptions { + target: GarbageCollectionTarget::Everything, + protect_latest: 1, + dont_protect_components: Default::default(), + dont_protect_timelines: Default::default(), + time_budget: std::time::Duration::MAX, + }); + + let assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, Option)]| { + let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); + + for (component_name, expected_row_id) in rows { + let row_id = query_latest_array( + &store, + &entity_path, + *component_name, + &LatestAtQuery::new(timeline_frame_nr, frame_nr), + ) + .map(|(_data_time, row_id, _array)| row_id); + + assert_eq!(*expected_row_id, row_id, "{component_name}"); + } + }; + + eprintln!("{store}"); + + assert_latest_components( + frame1, + &[ + (MyIndex::name(), None), + (MyColor::name(), None), + (MyPoint::name(), None), + ], + ); + + assert_latest_components( + frame2, + &[ + (MyIndex::name(), Some(row_id2)), + (MyColor::name(), None), + (MyPoint::name(), Some(row_id2)), + ], + ); + + assert_latest_components( + frame3, + &[ + (MyIndex::name(), Some(row_id2)), + (MyColor::name(), None), + (MyPoint::name(), Some(row_id3)), + ], + ); + + assert_latest_components( + frame4, + &[ + (MyIndex::name(), Some(row_id2)), + (MyColor::name(), Some(row_id4)), + (MyPoint::name(), Some(row_id3)), + ], + ); + + Ok(()) +} diff --git a/crates/re_chunk_store/tests/memory_test.rs b/crates/re_chunk_store/tests/memory_test.rs new file mode 100644 index 000000000000..040281af9b4a --- /dev/null +++ b/crates/re_chunk_store/tests/memory_test.rs @@ -0,0 +1,147 @@ +//! Measures the memory overhead of the chunk store. + +// https://github.com/rust-lang/rust-clippy/issues/10011 +#![cfg(test)] + +use std::sync::{ + atomic::{AtomicUsize, Ordering::Relaxed}, + Arc, +}; + +static LIVE_BYTES_GLOBAL: AtomicUsize = AtomicUsize::new(0); + +thread_local! 
{ + static LIVE_BYTES_IN_THREAD: AtomicUsize = AtomicUsize::new(0); +} + +pub struct TrackingAllocator { + allocator: std::alloc::System, +} + +#[global_allocator] +pub static GLOBAL_ALLOCATOR: TrackingAllocator = TrackingAllocator { + allocator: std::alloc::System, +}; + +#[allow(unsafe_code)] +// SAFETY: +// We just do book-keeping and then let another allocator do all the actual work. +unsafe impl std::alloc::GlobalAlloc for TrackingAllocator { + #[allow(clippy::let_and_return)] + unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { + LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_add(layout.size(), Relaxed)); + LIVE_BYTES_GLOBAL.fetch_add(layout.size(), Relaxed); + + // SAFETY: + // Just deferring + unsafe { self.allocator.alloc(layout) } + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { + LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_sub(layout.size(), Relaxed)); + LIVE_BYTES_GLOBAL.fetch_sub(layout.size(), Relaxed); + + // SAFETY: + // Just deferring + unsafe { self.allocator.dealloc(ptr, layout) }; + } +} + +fn live_bytes_local() -> usize { + LIVE_BYTES_IN_THREAD.with(|bytes| bytes.load(Relaxed)) +} + +fn live_bytes_global() -> usize { + LIVE_BYTES_GLOBAL.load(Relaxed) +} + +/// Returns `(num_bytes_allocated, num_bytes_allocated_by_this_thread)`. +fn memory_use(run: impl Fn() -> R) -> (usize, usize) { + let used_bytes_start_local = live_bytes_local(); + let used_bytes_start_global = live_bytes_global(); + let ret = run(); + let bytes_used_local = live_bytes_local() - used_bytes_start_local; + let bytes_used_global = live_bytes_global() - used_bytes_start_global; + drop(ret); + (bytes_used_global, bytes_used_local) +} + +// ---------------------------------------------------------------------------- + +use re_chunk::{ + external::crossbeam::channel::TryRecvError, ChunkBatcher, ChunkBatcherConfig, PendingRow, +}; +use re_chunk_store::{ChunkStore, ChunkStoreConfig}; +use re_log_types::{TimePoint, TimeType, Timeline}; +use re_types::{components::Scalar, Loggable}; + +/// The memory overhead of storing many scalars in the store. 
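Before that test, a quick illustration of how the `memory_use` helper above is meant to be called. This is a hypothetical example, not part of the patch: the first element of the returned pair counts live bytes allocated on any thread, the second only those allocated by the calling thread.

```rust
// Hypothetical usage of the `memory_use` helper defined above.
fn memory_use_example() {
    // Allocate ~8 kiB and keep it alive until `memory_use` has taken its
    // measurement; the helper drops the closure's return value afterwards.
    let (global_bytes, thread_bytes) = memory_use(|| vec![0_u64; 1_000]);

    // The thread-local counter must at least account for the Vec's buffer...
    assert!(thread_bytes >= 1_000 * std::mem::size_of::<u64>());
    // ...while the global counter also sees allocations made by other
    // threads, which is why `scalar_memory_overhead` below insists on being
    // run on its own.
    let _ = global_bytes;
}
```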
+#[test] +fn scalar_memory_overhead() { + re_log::setup_logging(); + + re_log::warn!("THIS TEST HAS TO ACCOUNT FOR THE MEMORY OF ALL RUNNING THREADS -- IT MUST BE RUN ON ITS OWN, WITH NO OTHER TESTS RUNNING IN PARALLEL: `cargo t --all-features -p re_chunk_store memory_tests -- scalar_memory_overhead`"); + + const NUM_SCALARS: usize = 1024 * 1024; + + let (total_mem_use_global, _total_mem_use_local) = memory_use(|| { + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + ChunkStoreConfig::default(), + ); + + let batcher = ChunkBatcher::new(ChunkBatcherConfig { + flush_num_rows: 1000, + ..ChunkBatcherConfig::NEVER + }) + .unwrap(); + + for i in 0..NUM_SCALARS { + let entity_path = re_log_types::entity_path!("scalar"); + let timepoint = + TimePoint::default().with(Timeline::new("log_time", TimeType::Time), i as i64); + let scalars = Scalar::to_arrow([Scalar(i as f64)]).unwrap(); + + let row = PendingRow::new( + timepoint, + std::iter::once((Scalar::name(), scalars)).collect(), + ); + + batcher.push_row(entity_path.clone(), row); + } + + let chunks_rx = batcher.chunks(); + drop(batcher); // flush and close + + loop { + let chunk = match chunks_rx.try_recv() { + Ok(chunk) => chunk, + Err(TryRecvError::Empty) => panic!("expected chunk, got none"), + Err(TryRecvError::Disconnected) => break, + }; + // eprintln!( + // "chunk with {} rows: {}", + // chunk.num_rows(), + // re_format::format_bytes(chunk.total_size_bytes() as _) + // ); + _ = store.insert_chunk(&Arc::new(chunk)).unwrap(); + } + + store + }); + + insta::assert_debug_snapshot!( + "scalars_on_one_timeline_new", + [ + format!("{NUM_SCALARS} scalars"), + format!( + "{} in total", + re_format::format_bytes(total_mem_use_global as _) + ), + format!( + "{} per row", + re_format::format_bytes(total_mem_use_global as f64 / NUM_SCALARS as f64) + ), + ] + ); +} diff --git a/crates/re_chunk_store/tests/reads.rs b/crates/re_chunk_store/tests/reads.rs new file mode 100644 index 000000000000..f1149c77734e --- /dev/null +++ b/crates/re_chunk_store/tests/reads.rs @@ -0,0 +1,687 @@ +use std::sync::Arc; + +use arrow2::array::Array as ArrowArray; + +use itertools::Itertools; +use re_chunk::{Chunk, RowId, TimePoint}; +use re_chunk_store::{ + ChunkStore, ChunkStoreConfig, LatestAtQuery, RangeQuery, ResolvedTimeRange, TimeInt, +}; +use re_log_types::{ + build_frame_nr, + example_components::{MyColor, MyIndex, MyPoint}, + EntityPath, TimeType, Timeline, +}; +use re_types::testing::{build_some_large_structs, LargeStruct}; +use re_types::ComponentNameSet; +use re_types_core::{ComponentName, Loggable as _}; + +// --- + +fn query_latest_array( + store: &ChunkStore, + entity_path: &EntityPath, + component_name: ComponentName, + query: &LatestAtQuery, +) -> Option<(TimeInt, RowId, Box)> { + re_tracing::profile_function!(); + + let (data_time, row_id, array) = store + .latest_at_relevant_chunks(query, entity_path, component_name) + .into_iter() + .flat_map(|chunk| { + chunk + .latest_at(query, component_name) + .iter_rows(&query.timeline(), &component_name) + .collect_vec() + }) + .max_by_key(|(data_time, row_id, _)| (*data_time, *row_id)) + .and_then(|(data_time, row_id, array)| array.map(|array| (data_time, row_id, array)))?; + + Some((data_time, row_id, array)) +} + +// --- + +#[test] +fn all_components() -> anyhow::Result<()> { + re_log::setup_logging(); + + let entity_path = EntityPath::from("this/that"); + + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + + let 
assert_latest_components_at = + |store: &ChunkStore, entity_path: &EntityPath, expected: Option<&[ComponentName]>| { + let timeline = Timeline::new("frame_nr", TimeType::Sequence); + + let component_names = store.all_components(&timeline, entity_path); + + let expected_component_names = expected.map(|expected| { + let expected: ComponentNameSet = expected.iter().copied().collect(); + expected + }); + + assert_eq!( + expected_component_names, component_names, + "expected to find {expected_component_names:?}, found {component_names:?} instead\n{store}", + ); + }; + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + ChunkStoreConfig::default(), + ); + + let components_a = &[ + MyColor::name(), // added by test, static + LargeStruct::name(), // added by test + ]; + + let components_b = &[ + MyColor::name(), // added by test, static + MyPoint::name(), // added by test + LargeStruct::name(), // added by test + ]; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + TimePoint::default(), + &MyColor::from_iter(0..2), + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch( + RowId::new(), + [build_frame_nr(frame1)], + &build_some_large_structs(2), + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + assert_latest_components_at(&mut store, &entity_path, Some(components_a)); + + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batches( + RowId::new(), + [build_frame_nr(frame2)], + [ + &build_some_large_structs(2) as _, + &MyPoint::from_iter(0..2) as _, + ], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + assert_latest_components_at(&mut store, &entity_path, Some(components_b)); + + Ok(()) +} + +// --- + +#[test] +fn latest_at() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + ChunkStoreConfig::default(), + ); + + let entity_path = EntityPath::from("this/that"); + + let frame0 = TimeInt::new_temporal(0); + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + let frame3 = TimeInt::new_temporal(3); + let frame4 = TimeInt::new_temporal(4); + + let row_id1 = RowId::new(); + let (indices1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); + let chunk1 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id1, + [build_frame_nr(frame1)], + [&indices1 as _, &colors1 as _], + ) + .build()?; + + let row_id2 = RowId::new(); + let points2 = MyPoint::from_iter(0..3); + let chunk2 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id2, + [build_frame_nr(frame2)], + [&indices1 as _, &points2 as _], + ) + .build()?; + + let row_id3 = RowId::new(); + let points3 = MyPoint::from_iter(0..10); + let chunk3 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id3, [build_frame_nr(frame3)], [&points3 as _]) + .build()?; + + let row_id4 = RowId::new(); + let colors4 = MyColor::from_iter(0..5); + let chunk4 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id4, [build_frame_nr(frame4)], [&colors4 as _]) + .build()?; + + // injecting some static colors + let row_id5 = RowId::new(); + let colors5 = MyColor::from_iter(0..3); + let chunk5 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id5, TimePoint::default(), [&colors5 as _]) + .build()?; + + 
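The `TimePoint::default()` passed to `chunk5` just above is what makes those colors static: a chunk built with an empty timepoint is not tied to any timeline and, as the assertions below show, is returned by latest-at queries at every frame. A minimal sketch of that pattern in isolation (the helper name is made up for illustration; the builder call matches `chunk5`):

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId, TimePoint};
use re_log_types::{example_components::MyColor, EntityPath};

/// Hypothetical helper mirroring `chunk5` above: an empty `TimePoint` marks
/// the batch as static, so every latest-at query on this entity returns it,
/// whatever frame is being queried.
fn build_static_colors(entity_path: &EntityPath) -> anyhow::Result<Arc<Chunk>> {
    let colors = MyColor::from_iter(0..3);
    let chunk = Chunk::builder(entity_path.clone())
        .with_component_batches(RowId::new(), TimePoint::default(), [&colors as _])
        .build()?;
    Ok(Arc::new(chunk))
}
```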
store.insert_chunk(&Arc::new(chunk1))?; + store.insert_chunk(&Arc::new(chunk2))?; + store.insert_chunk(&Arc::new(chunk3))?; + store.insert_chunk(&Arc::new(chunk4))?; + store.insert_chunk(&Arc::new(chunk5))?; + + let assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, Option)]| { + let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); + + for (component_name, expected_row_id) in rows { + let row_id = query_latest_array( + &store, + &entity_path, + *component_name, + &LatestAtQuery::new(timeline_frame_nr, frame_nr), + ) + .map(|(_data_time, row_id, _array)| row_id); + + assert_eq!(*expected_row_id, row_id, "{component_name}"); + } + }; + + assert_latest_components( + frame0, + &[ + (MyColor::name(), Some(row_id5)), // static + (MyIndex::name(), None), + (MyPoint::name(), None), + ], + ); + assert_latest_components( + frame1, + &[ + (MyColor::name(), Some(row_id5)), // static + (MyIndex::name(), Some(row_id1)), + (MyPoint::name(), None), + ], + ); + assert_latest_components( + frame2, + &[ + (MyColor::name(), Some(row_id5)), + (MyPoint::name(), Some(row_id2)), + (MyIndex::name(), Some(row_id2)), + ], + ); + assert_latest_components( + frame3, + &[ + (MyColor::name(), Some(row_id5)), + (MyPoint::name(), Some(row_id3)), + (MyIndex::name(), Some(row_id2)), + ], + ); + assert_latest_components( + frame4, + &[ + (MyColor::name(), Some(row_id5)), + (MyPoint::name(), Some(row_id3)), + (MyIndex::name(), Some(row_id2)), + ], + ); + + Ok(()) +} + +#[test] +fn latest_at_sparse_component_edge_case() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + ChunkStoreConfig::default(), + ); + + let entity_path = EntityPath::from("this/that"); + + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + let frame3 = TimeInt::new_temporal(3); + + // This chunk has a time range of `(1, 3)`, but the actual data for `MyIndex` actually only + // starts at `3`. + + let row_id1_1 = RowId::new(); + let row_id1_2 = RowId::new(); + let row_id1_3 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_sparse_component_batches( + row_id1_1, + [build_frame_nr(frame1)], + [ + (MyIndex::name(), None), + (MyPoint::name(), Some(&MyPoint::from_iter(0..1) as _)), + ], + ) + .with_sparse_component_batches( + row_id1_2, + [build_frame_nr(frame2)], + [ + (MyIndex::name(), None), + (MyPoint::name(), Some(&MyPoint::from_iter(1..2) as _)), + ], + ) + .with_sparse_component_batches( + row_id1_3, + [build_frame_nr(frame3)], + [ + (MyIndex::name(), Some(&MyIndex::from_iter(2..3) as _)), + (MyPoint::name(), Some(&MyPoint::from_iter(2..3) as _)), + ], + ) + .build()?; + eprintln!("chunk 1:\n{chunk}"); + store.insert_chunk(&Arc::new(chunk))?; + + // This chunk on the other hand has a time range of `(2, 3)`, and the data for `MyIndex` + // actually does start at `2`. + + let row_id2_1 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_sparse_component_batches( + row_id2_1, + [build_frame_nr(frame2)], + [ + (MyIndex::name(), Some(&MyIndex::from_iter(2..3) as _)), + (MyPoint::name(), Some(&MyPoint::from_iter(1..2) as _)), + ], + ) + .build()?; + eprintln!("chunk 2:\n{chunk}"); + store.insert_chunk(&Arc::new(chunk))?; + + // We expect the data for `MyIndex` to come from `row_id_1_3`, since it is the most recent + // piece of data. 
+ // The only way this can happen is if we have proper per-component time-ranges, since a global + // per-chunk time-range would erroneously push us towards the second chunk. + + let row_id = query_latest_array( + &store, + &entity_path, + MyIndex::name(), + &LatestAtQuery::new(Timeline::new_sequence("frame_nr"), TimeInt::MAX), + ) + .map(|(_data_time, row_id, _array)| row_id); + + assert_eq!(row_id1_3, row_id.unwrap()); + + Ok(()) +} + +#[test] +fn latest_at_overlapped_chunks() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + ChunkStoreConfig::default(), + ); + + let entity_path = EntityPath::from("this/that"); + + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + let frame3 = TimeInt::new_temporal(3); + let frame4 = TimeInt::new_temporal(4); + let frame5 = TimeInt::new_temporal(5); + let frame6 = TimeInt::new_temporal(6); + let frame7 = TimeInt::new_temporal(7); + + let points1 = MyPoint::from_iter(0..1); + let points2 = MyPoint::from_iter(1..2); + let points3 = MyPoint::from_iter(2..3); + let points4 = MyPoint::from_iter(3..4); + let points5 = MyPoint::from_iter(4..5); + let points6 = MyPoint::from_iter(5..6); + let points7 = MyPoint::from_iter(6..7); + + let row_id1_1 = RowId::new(); + let row_id1_3 = RowId::new(); + let row_id1_5 = RowId::new(); + let row_id1_7 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_sparse_component_batches( + row_id1_1, + [build_frame_nr(frame1)], + [(MyPoint::name(), Some(&points1 as _))], + ) + .with_sparse_component_batches( + row_id1_3, + [build_frame_nr(frame3)], + [(MyPoint::name(), Some(&points3 as _))], + ) + .with_sparse_component_batches( + row_id1_5, + [build_frame_nr(frame5)], + [(MyPoint::name(), Some(&points5 as _))], + ) + .with_sparse_component_batches( + row_id1_7, + [build_frame_nr(frame7)], + [(MyPoint::name(), Some(&points7 as _))], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let row_id2_2 = RowId::new(); + let row_id2_3 = RowId::new(); + let row_id2_4 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_sparse_component_batches( + row_id2_2, + [build_frame_nr(frame2)], + [(MyPoint::name(), Some(&points2 as _))], + ) + .with_sparse_component_batches( + row_id2_3, + [build_frame_nr(frame3)], + [(MyPoint::name(), Some(&points3 as _))], + ) + .with_sparse_component_batches( + row_id2_4, + [build_frame_nr(frame4)], + [(MyPoint::name(), Some(&points4 as _))], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + let row_id3_2 = RowId::new(); + let row_id3_4 = RowId::new(); + let row_id3_6 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_sparse_component_batches( + row_id3_2, + [build_frame_nr(frame2)], + [(MyPoint::name(), Some(&points2 as _))], + ) + .with_sparse_component_batches( + row_id3_4, + [build_frame_nr(frame4)], + [(MyPoint::name(), Some(&points4 as _))], + ) + .with_sparse_component_batches( + row_id3_6, + [build_frame_nr(frame6)], + [(MyPoint::name(), Some(&points6 as _))], + ) + .build()?; + store.insert_chunk(&Arc::new(chunk))?; + + eprintln!("{store}"); + + for (at, expected_row_id) in [ + (frame1, row_id1_1), // + (frame2, row_id3_2), // + (frame3, row_id2_3), // + (frame4, row_id3_4), // + (frame5, row_id1_5), // + (frame6, row_id3_6), // + (frame7, row_id1_7), // + (TimeInt::MAX, row_id1_7), // + ] { + let query = LatestAtQuery::new(Timeline::new_sequence("frame_nr"), at); + eprintln!("{} 
@ {query:?}", MyPoint::name()); + let row_id = query_latest_array(&store, &entity_path, MyPoint::name(), &query) + .map(|(_data_time, row_id, _array)| row_id); + assert_eq!(expected_row_id, row_id.unwrap()); + } + + Ok(()) +} + +// --- + +#[test] +fn range() -> anyhow::Result<()> { + re_log::setup_logging(); + + let mut store = ChunkStore::new( + re_log_types::StoreId::random(re_log_types::StoreKind::Recording), + ChunkStoreConfig::default(), + ); + + let entity_path = EntityPath::from("this/that"); + + let frame1 = TimeInt::new_temporal(1); + let frame2 = TimeInt::new_temporal(2); + let frame3 = TimeInt::new_temporal(3); + let frame4 = TimeInt::new_temporal(4); + let frame5 = TimeInt::new_temporal(5); + + let row_id1 = RowId::new(); + let indices1 = MyIndex::from_iter(0..3); + let colors1 = MyColor::from_iter(0..3); + let chunk1 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id1, + [build_frame_nr(frame1)], + [&indices1 as _, &colors1 as _], + ) + .build()?; + + let row_id2 = RowId::new(); + let points2 = MyPoint::from_iter(0..3); + let chunk2 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id2, + [build_frame_nr(frame2)], + [&indices1 as _, &points2 as _], + ) + .build()?; + + let row_id3 = RowId::new(); + let points3 = MyPoint::from_iter(0..10); + let chunk3 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id3, [build_frame_nr(frame3)], [&points3 as _]) + .build()?; + + let row_id4_1 = RowId::new(); + let indices4_1 = MyIndex::from_iter(20..25); + let colors4_1 = MyColor::from_iter(0..5); + let chunk4_1 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id4_1, + [build_frame_nr(frame4)], + [&indices4_1 as _, &colors4_1 as _], + ) + .build()?; + + let row_id4_2 = RowId::new(); + let indices4_2 = MyIndex::from_iter(25..30); + let colors4_2 = MyColor::from_iter(0..5); + let chunk4_2 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id4_2, + [build_frame_nr(frame4)], + [&indices4_2 as _, &colors4_2 as _], + ) + .build()?; + + let row_id4_25 = RowId::new(); + let points4_25 = MyPoint::from_iter(0..5); + let chunk4_25 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id4_25, + [build_frame_nr(frame4)], + [&indices4_2 as _, &points4_25 as _], + ) + .build()?; + + let row_id4_3 = RowId::new(); + let indices4_3 = MyIndex::from_iter(30..35); + let colors4_3 = MyColor::from_iter(0..5); + let chunk4_3 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id4_3, + [build_frame_nr(frame4)], + [&indices4_3 as _, &colors4_3 as _], + ) + .build()?; + + let row_id4_4 = RowId::new(); + let points4_4 = MyPoint::from_iter(0..5); + let chunk4_4 = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id4_4, + [build_frame_nr(frame4)], + [&indices4_3 as _, &points4_4 as _], + ) + .build()?; + + // injecting some static colors + let row_id5 = RowId::new(); + let colors5 = MyColor::from_iter(0..8); + let chunk5 = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id5, TimePoint::default(), [&colors5 as _]) + .build()?; + + store.insert_chunk(&Arc::new(chunk1))?; + store.insert_chunk(&Arc::new(chunk2))?; + store.insert_chunk(&Arc::new(chunk3))?; + store.insert_chunk(&Arc::new(chunk4_1))?; + store.insert_chunk(&Arc::new(chunk4_2))?; + store.insert_chunk(&Arc::new(chunk4_25))?; + store.insert_chunk(&Arc::new(chunk4_3))?; + store.insert_chunk(&Arc::new(chunk4_4))?; + store.insert_chunk(&Arc::new(chunk5))?; + + // Each entry in `rows_at_times` 
corresponds to a dataframe that's expected to be returned + // by the range query. + // A single timepoint might have several of those! That's one of the behaviors specific to + // range queries. + #[allow(clippy::type_complexity)] + let assert_range_components = + |time_range: ResolvedTimeRange, + component_name: ComponentName, + row_ids_at_times: &[(TimeInt, RowId)]| { + let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); + + let query = RangeQuery::new(timeline_frame_nr, time_range); + let results = store.range_relevant_chunks(&query, &entity_path, component_name); + + eprintln!("================= {component_name} @ {query:?} ==============="); + let mut results_processed = 0usize; + for chunk in results { + let chunk = chunk.range(&query, component_name); + eprintln!("{chunk}"); + for (data_time, row_id, _array) in + chunk.iter_rows(&timeline_frame_nr, &component_name) + { + let (expected_data_time, expected_row_id) = row_ids_at_times[results_processed]; + assert_eq!(expected_data_time, data_time); + assert_eq!(expected_row_id, row_id); + + results_processed += 1; + } + } + + let results_processed_expected = row_ids_at_times.len(); + assert_eq!(results_processed_expected, results_processed); + }; + + // Unit ranges + + assert_range_components( + ResolvedTimeRange::new(frame1, frame1), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + assert_range_components(ResolvedTimeRange::new(frame1, frame1), MyPoint::name(), &[]); + assert_range_components( + ResolvedTimeRange::new(frame2, frame2), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + assert_range_components( + ResolvedTimeRange::new(frame2, frame2), + MyPoint::name(), + &[(frame2, row_id2)], + ); + assert_range_components( + ResolvedTimeRange::new(frame3, frame3), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + assert_range_components( + ResolvedTimeRange::new(frame3, frame3), + MyPoint::name(), + &[(frame3, row_id3)], + ); + assert_range_components( + ResolvedTimeRange::new(frame4, frame4), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + assert_range_components( + ResolvedTimeRange::new(frame4, frame4), + MyPoint::name(), + &[(frame4, row_id4_25), (frame4, row_id4_4)], + ); + assert_range_components( + ResolvedTimeRange::new(frame5, frame5), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + assert_range_components(ResolvedTimeRange::new(frame5, frame5), MyPoint::name(), &[]); + + // Full range + + assert_range_components( + ResolvedTimeRange::new(frame1, frame5), + MyPoint::name(), + &[ + (frame2, row_id2), + (frame3, row_id3), + (frame4, row_id4_25), + (frame4, row_id4_4), + ], + ); + assert_range_components( + ResolvedTimeRange::new(frame1, frame5), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + + // Infinite range + + assert_range_components( + ResolvedTimeRange::new(TimeInt::MIN, TimeInt::MAX), + MyPoint::name(), + &[ + (frame2, row_id2), + (frame3, row_id3), + (frame4, row_id4_25), + (frame4, row_id4_4), + ], + ); + assert_range_components( + ResolvedTimeRange::new(TimeInt::MIN, TimeInt::MAX), + MyColor::name(), + &[(TimeInt::STATIC, row_id5)], + ); + + Ok(()) +} diff --git a/crates/re_chunk_store/tests/snapshots/memory_test__scalars_on_one_timeline_new.snap b/crates/re_chunk_store/tests/snapshots/memory_test__scalars_on_one_timeline_new.snap new file mode 100644 index 000000000000..edaa0ba0e5de --- /dev/null +++ b/crates/re_chunk_store/tests/snapshots/memory_test__scalars_on_one_timeline_new.snap @@ -0,0 +1,9 @@ +--- +source: 
crates/re_chunk_store/tests/memory_test.rs +expression: "[format!(\"{NUM_SCALARS} scalars\"),\n format!(\"{} in total\",\n re_format::format_bytes(total_mem_use_global as _)),\n format!(\"{} per row\",\n re_format::format_bytes(total_mem_use_global as f64 / NUM_SCALARS\n as f64))]" +--- +[ + "1048576 scalars", + "41.9 MiB in total", + "42 B per row", +] diff --git a/crates/re_data_loader/Cargo.toml b/crates/re_data_loader/Cargo.toml index e82b460db455..74ca2d66d69f 100644 --- a/crates/re_data_loader/Cargo.toml +++ b/crates/re_data_loader/Cargo.toml @@ -25,6 +25,7 @@ default = [] [dependencies] re_build_info.workspace = true +re_chunk.workspace = true re_log_encoding = { workspace = true, features = ["decoder"] } re_log_types.workspace = true re_log.workspace = true diff --git a/crates/re_data_loader/src/lib.rs b/crates/re_data_loader/src/lib.rs index 3edd76f9521f..5b119ef55660 100644 --- a/crates/re_data_loader/src/lib.rs +++ b/crates/re_data_loader/src/lib.rs @@ -4,7 +4,8 @@ use std::sync::Arc; use once_cell::sync::Lazy; -use re_log_types::{ArrowMsg, DataRow, EntityPath, LogMsg, TimePoint}; +use re_chunk::{Chunk, ChunkResult}; +use re_log_types::{ArrowMsg, EntityPath, LogMsg, TimePoint}; // ---------------------------------------------------------------------------- @@ -283,7 +284,7 @@ pub enum DataLoaderError { IO(#[from] std::io::Error), #[error(transparent)] - Arrow(#[from] re_log_types::DataCellError), + Arrow(#[from] re_chunk::ChunkError), #[error(transparent)] Decode(#[from] re_log_encoding::decoder::DecodeError), @@ -317,15 +318,15 @@ impl DataLoaderError { /// most convenient for them, whether it is raw components, arrow chunks or even /// full-on [`LogMsg`]s. pub enum LoadedData { - DataRow(DataRow), + Chunk(Chunk), ArrowMsg(ArrowMsg), LogMsg(LogMsg), } -impl From for LoadedData { +impl From for LoadedData { #[inline] - fn from(value: DataRow) -> Self { - Self::DataRow(value) + fn from(value: Chunk) -> Self { + Self::Chunk(value) } } @@ -345,18 +346,9 @@ impl From for LoadedData { impl LoadedData { /// Pack the data into a [`LogMsg`]. 
- pub fn into_log_msg( - self, - store_id: &re_log_types::StoreId, - ) -> Result { + pub fn into_log_msg(self, store_id: &re_log_types::StoreId) -> ChunkResult { match self { - Self::DataRow(row) => { - let mut table = - re_log_types::DataTable::from_rows(re_log_types::TableId::new(), [row]); - table.compute_all_size_bytes(); - - Ok(LogMsg::ArrowMsg(store_id.clone(), table.to_arrow_msg()?)) - } + Self::Chunk(chunk) => Ok(LogMsg::ArrowMsg(store_id.clone(), chunk.to_arrow_msg()?)), Self::ArrowMsg(msg) => Ok(LogMsg::ArrowMsg(store_id.clone(), msg)), diff --git a/crates/re_data_loader/src/load_file.rs b/crates/re_data_loader/src/load_file.rs index 6487ffe9d869..f5a9c6d19bab 100644 --- a/crates/re_data_loader/src/load_file.rs +++ b/crates/re_data_loader/src/load_file.rs @@ -106,7 +106,7 @@ pub(crate) fn prepare_store_info( (!is_rrd).then(|| { LogMsg::SetStoreInfo(SetStoreInfo { - row_id: re_log_types::RowId::new(), + row_id: *re_chunk::RowId::new(), info: re_log_types::StoreInfo { application_id: app_id.clone(), store_id: store_id.clone(), diff --git a/crates/re_data_loader/src/loader_archetype.rs b/crates/re_data_loader/src/loader_archetype.rs index 250c34d9e69c..80ed003eb200 100644 --- a/crates/re_data_loader/src/loader_archetype.rs +++ b/crates/re_data_loader/src/loader_archetype.rs @@ -1,4 +1,5 @@ -use re_log_types::{DataRow, EntityPath, RowId, TimeInt, TimePoint}; +use re_chunk::{Chunk, RowId}; +use re_log_types::{EntityPath, TimeInt, TimePoint}; use crate::{DataLoader, DataLoaderError, LoadedData}; @@ -132,7 +133,7 @@ fn load_image( timepoint: TimePoint, entity_path: EntityPath, contents: Vec, -) -> Result, DataLoaderError> { +) -> Result, DataLoaderError> { re_tracing::profile_function!(); let rows = [ @@ -141,7 +142,9 @@ fn load_image( contents, image::ImageFormat::from_path(filepath).ok(), )?; - DataRow::from_archetype(RowId::new(), timepoint, entity_path, &arch)? + Chunk::builder(entity_path) + .with_archetype(RowId::new(), timepoint, &arch) + .build()? }, // ]; @@ -154,7 +157,7 @@ fn load_mesh( timepoint: TimePoint, entity_path: EntityPath, contents: Vec, -) -> Result, DataLoaderError> { +) -> Result, DataLoaderError> { re_tracing::profile_function!(); let rows = [ @@ -163,7 +166,9 @@ fn load_mesh( contents, re_types::components::MediaType::guess_from_path(filepath), ); - DataRow::from_archetype(RowId::new(), timepoint, entity_path, &arch)? + Chunk::builder(entity_path) + .with_archetype(RowId::new(), timepoint, &arch) + .build()? }, // ]; @@ -175,14 +180,16 @@ fn load_point_cloud( timepoint: TimePoint, entity_path: EntityPath, contents: &[u8], -) -> Result, DataLoaderError> { +) -> Result, DataLoaderError> { re_tracing::profile_function!(); let rows = [ { // TODO(#4532): `.ply` data loader should support 2D point cloud & meshes let points3d = re_types::archetypes::Points3D::from_file_contents(contents)?; - DataRow::from_archetype(RowId::new(), timepoint, entity_path, &points3d)? + Chunk::builder(entity_path) + .with_archetype(RowId::new(), timepoint, &points3d) + .build()? }, // ]; @@ -195,7 +202,7 @@ fn load_text_document( timepoint: TimePoint, entity_path: EntityPath, contents: Vec, -) -> Result, DataLoaderError> { +) -> Result, DataLoaderError> { re_tracing::profile_function!(); let rows = [ @@ -204,7 +211,9 @@ fn load_text_document( contents, re_types::components::MediaType::guess_from_path(filepath), )?; - DataRow::from_archetype(RowId::new(), timepoint, entity_path, &arch)? + Chunk::builder(entity_path) + .with_archetype(RowId::new(), timepoint, &arch) + .build()? 
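All of the loaders touched above now share the same construction: instead of `DataRow::from_archetype`, each one builds a single-row `Chunk` from the archetype and hands it to the new `LoadedData::Chunk` variant. A minimal sketch of that shared pattern, pulled out of the diff for clarity (the function itself is hypothetical; the `Points3D` call matches `load_point_cloud` above):

```rust
use re_chunk::{Chunk, ChunkResult, RowId};
use re_log_types::{EntityPath, TimePoint};

// Hypothetical helper showing the pattern the loaders above now use: one
// archetype, one row, one chunk.
fn chunk_from_points3d(
    entity_path: EntityPath,
    timepoint: TimePoint,
    points3d: &re_types::archetypes::Points3D,
) -> ChunkResult<Chunk> {
    Chunk::builder(entity_path)
        .with_archetype(RowId::new(), timepoint, points3d)
        .build()
}
```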
}, // ]; diff --git a/crates/re_data_store/README.md b/crates/re_data_store/README.md deleted file mode 100644 index 8cc004b6f9f6..000000000000 --- a/crates/re_data_store/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Rerun data store - -Part of the [`rerun`](https://github.com/rerun-io/rerun) family of crates. - -[![Latest version](https://img.shields.io/crates/v/re_data_store.svg)](https://crates.io/crates/re_data_store) -[![Documentation](https://docs.rs/re_data_store/badge.svg)](https://docs.rs/re_data_store) -![MIT](https://img.shields.io/badge/license-MIT-blue.svg) -![Apache](https://img.shields.io/badge/license-Apache-blue.svg) - -[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data. - -The `re_data_store` crate is an in-memory time series database for Rerun log data. It is indexed by entity path, component, timeline, and time. It supports out-of-order insertions, and fast `O(log(N))` queries. diff --git a/crates/re_data_store/benches/arrow2.rs b/crates/re_data_store/benches/arrow2.rs deleted file mode 100644 index b14efa614b6a..000000000000 --- a/crates/re_data_store/benches/arrow2.rs +++ /dev/null @@ -1,378 +0,0 @@ -//! Keeping track of performance issues/regressions in `arrow2` that directly affect us. - -// Allow unwrap() in benchmarks -#![allow(clippy::unwrap_used)] - -#[global_allocator] -static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -use std::sync::Arc; - -use arrow2::array::{Array, PrimitiveArray, StructArray}; -use criterion::Criterion; -use itertools::Itertools; - -use re_log_types::example_components::{MyIndex, MyPoint}; -use re_log_types::DataCell; -use re_types::testing::{build_some_large_structs, LargeStruct}; -use re_types_core::{Component, SizeBytes}; - -// --- - -criterion::criterion_group!(benches, erased_clone, estimated_size_bytes); - -criterion::criterion_main!(benches); - -// --- - -#[cfg(not(debug_assertions))] -const NUM_ROWS: usize = 10_000; -#[cfg(not(debug_assertions))] -const NUM_INSTANCES: usize = 100; - -// `cargo test` also runs the benchmark setup code, so make sure they run quickly: -#[cfg(debug_assertions)] -const NUM_ROWS: usize = 1; -#[cfg(debug_assertions)] -const NUM_INSTANCES: usize = 1; - -// --- - -#[derive(Debug, Clone, Copy)] -enum ArrayKind { - /// E.g. an array of `MyIndex`. - Primitive, - - /// E.g. an array of `Position2D`. - Struct, - - /// An array of `LargeStruct`. 
- StructLarge, -} - -impl std::fmt::Display for ArrayKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(match self { - Self::Primitive => "primitive", - Self::Struct => "struct", - Self::StructLarge => "struct_large", - }) - } -} - -fn erased_clone(c: &mut Criterion) { - if std::env::var("CI").is_ok() { - return; - } - - let kind = [ - ArrayKind::Primitive, - ArrayKind::Struct, - ArrayKind::StructLarge, - ]; - - for kind in kind { - let mut group = c.benchmark_group(format!( - "arrow2/size_bytes/{kind}/rows={NUM_ROWS}/instances={NUM_INSTANCES}" - )); - group.throughput(criterion::Throughput::Elements(NUM_ROWS as _)); - - match kind { - ArrayKind::Primitive => { - let data = MyIndex::from_iter(0..NUM_INSTANCES as _); - bench_arrow(&mut group, &data); - bench_native(&mut group, &data); - } - ArrayKind::Struct => { - let data = MyPoint::from_iter(0..NUM_INSTANCES as u32); - bench_arrow(&mut group, &data); - bench_native(&mut group, &data); - } - ArrayKind::StructLarge => { - let data = build_some_large_structs(NUM_INSTANCES); - bench_arrow(&mut group, &data); - bench_native(&mut group, &data); - } - } - } - - // TODO(cmc): Use cells once `cell.size_bytes()` has landed (#1727) - fn bench_arrow<'a, T: Component + SizeBytes + 'a>( - group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - data: &'a Vec, - ) where - &'a T: Into<::std::borrow::Cow<'a, T>>, - { - let arrays: Vec> = (0..NUM_ROWS) - .map(|_| T::to_arrow(data).unwrap()) - .collect_vec(); - - let total_size_bytes = arrays - .iter() - .map(|array| array.total_size_bytes()) - .sum::(); - assert!(total_size_bytes > 0); - - group.bench_function("array", |b| { - b.iter(|| { - let sz = arrays - .iter() - .map(|array| array.total_size_bytes()) - .sum::(); - assert_eq!(total_size_bytes, sz); - sz - }); - }); - } - - #[allow(clippy::ptr_arg)] // We want to know it's a vec and not a slice to the stack! 
- fn bench_native( - group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - data: &Vec, - ) { - let vecs = (0..NUM_ROWS).map(|_| data.clone()).collect_vec(); - - let total_size_bytes = vecs - .iter() - .map(|vec| std::mem::size_of_val(vec.as_slice()) as u64) - .sum::(); - assert!(total_size_bytes as usize >= NUM_ROWS * NUM_INSTANCES * std::mem::size_of::()); - - { - let vecs = (0..NUM_ROWS).map(|_| data.clone()).collect_vec(); - group.bench_function("vec", |b| { - b.iter(|| { - let sz = vecs - .iter() - .map(|vec| std::mem::size_of_val(vec.as_slice()) as u64) - .sum::(); - assert_eq!(total_size_bytes, sz); - sz - }); - }); - } - - trait SizeOf { - fn size_of(&self) -> usize; - } - - impl SizeOf for Vec { - fn size_of(&self) -> usize { - std::mem::size_of_val(self.as_slice()) - } - } - - { - let vecs: Vec> = (0..NUM_ROWS) - .map(|_| Box::new(data.clone()) as Box) - .collect_vec(); - - group.bench_function("vec/erased", |b| { - b.iter(|| { - let sz = vecs.iter().map(|vec| vec.size_of() as u64).sum::(); - assert_eq!(total_size_bytes, sz); - sz - }); - }); - } - } -} - -fn estimated_size_bytes(c: &mut Criterion) { - if std::env::var("CI").is_ok() { - return; - } - - let kind = [ - ArrayKind::Primitive, - ArrayKind::Struct, - ArrayKind::StructLarge, - ]; - - for kind in kind { - let mut group = c.benchmark_group(format!( - "arrow2/erased_clone/{kind}/rows={NUM_ROWS}/instances={NUM_INSTANCES}" - )); - group.throughput(criterion::Throughput::Elements(NUM_ROWS as _)); - - fn generate_cells(kind: ArrayKind) -> Vec { - match kind { - ArrayKind::Primitive => (0..NUM_ROWS) - .map(|_| { - DataCell::from_native(MyIndex::from_iter(0..NUM_INSTANCES as _).as_slice()) - }) - .collect(), - ArrayKind::Struct => (0..NUM_ROWS) - .map(|_| { - DataCell::from_native( - MyPoint::from_iter(0..NUM_INSTANCES as u32).as_slice(), - ) - }) - .collect(), - ArrayKind::StructLarge => (0..NUM_ROWS) - .map(|_| { - DataCell::from_native(build_some_large_structs(NUM_INSTANCES).as_slice()) - }) - .collect(), - } - } - - { - { - let cells = generate_cells(kind); - let total_instances = cells.iter().map(|cell| cell.num_instances()).sum::(); - assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32); - - group.bench_function("cell/arc_erased", |b| { - b.iter(|| { - let cells = cells.clone(); - assert_eq!( - total_instances, - cells.iter().map(|cell| cell.num_instances()).sum::() - ); - cells - }); - }); - } - - { - let cells = generate_cells(kind).into_iter().map(Arc::new).collect_vec(); - let total_instances = cells.iter().map(|cell| cell.num_instances()).sum::(); - assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32); - - group.bench_function("cell/wrapped_in_arc", |b| { - b.iter(|| { - let cells = cells.clone(); - assert_eq!( - total_instances, - cells.iter().map(|cell| cell.num_instances()).sum::() - ); - cells - }); - }); - } - - { - let cells = generate_cells(kind); - let arrays = cells.iter().map(|cell| cell.to_arrow()).collect_vec(); - let total_instances = arrays.iter().map(|array| array.len() as u32).sum::(); - assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32); - - group.bench_function("array", |b| { - b.iter(|| { - let arrays = arrays.clone(); - assert_eq!( - total_instances, - arrays.iter().map(|array| array.len() as u32).sum::() - ); - arrays - }); - }); - } - - match kind { - ArrayKind::Primitive => { - bench_downcast_first::>(&mut group, kind); - } - ArrayKind::Struct | ArrayKind::StructLarge => { - bench_downcast_first::(&mut group, kind); - } - } - - fn 
bench_downcast_first( - group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - kind: ArrayKind, - ) { - let cells = generate_cells(kind); - let arrays = cells - .iter() - .map(|cell| { - cell.as_arrow_ref() - .as_any() - .downcast_ref::() - .unwrap() - .clone() - }) - .collect_vec(); - let total_instances = arrays.iter().map(|array| array.len() as u32).sum::(); - assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32); - - group.bench_function("array/downcast_first", |b| { - b.iter(|| { - let arrays = arrays.clone(); - assert_eq!( - total_instances, - arrays.iter().map(|array| array.len() as u32).sum::() - ); - arrays - }); - }); - } - } - - { - fn generate_positions() -> Vec> { - (0..NUM_ROWS) - .map(|_| MyPoint::from_iter(0..NUM_INSTANCES as u32)) - .collect() - } - - fn generate_indices() -> Vec> { - (0..NUM_ROWS) - .map(|_| MyIndex::from_iter(0..NUM_INSTANCES as _)) - .collect() - } - - fn generate_rects() -> Vec> { - (0..NUM_ROWS) - .map(|_| build_some_large_structs(NUM_INSTANCES)) - .collect() - } - - match kind { - ArrayKind::Primitive => bench_std(&mut group, generate_indices()), - ArrayKind::Struct => bench_std(&mut group, generate_positions()), - ArrayKind::StructLarge => bench_std(&mut group, generate_rects()), - } - - fn bench_std( - group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - data: Vec>, - ) { - { - let vecs = data.clone(); - let total_instances = vecs.iter().map(|vec| vec.len() as u32).sum::(); - assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32); - - group.bench_function("vec/full_copy", |b| { - b.iter(|| { - let vecs = vecs.clone(); - assert_eq!( - total_instances, - vecs.iter().map(|vec| vec.len() as u32).sum::() - ); - vecs - }); - }); - } - - { - let vecs = data.into_iter().map(Arc::new).collect_vec(); - let total_instances = vecs.iter().map(|vec| vec.len() as u32).sum::(); - assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32); - - group.bench_function("vec/wrapped_in_arc", |b| { - b.iter(|| { - let vecs = vecs.clone(); - assert_eq!( - total_instances, - vecs.iter().map(|vec| vec.len() as u32).sum::() - ); - vecs - }); - }); - } - } - } - } -} diff --git a/crates/re_data_store/benches/data_store.rs b/crates/re_data_store/benches/data_store.rs deleted file mode 100644 index 30854ab12845..000000000000 --- a/crates/re_data_store/benches/data_store.rs +++ /dev/null @@ -1,472 +0,0 @@ -// Allow unwrap() in benchmarks -#![allow(clippy::unwrap_used)] - -#[global_allocator] -static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -use arrow2::array::{Array as _, StructArray}; -use criterion::{criterion_group, criterion_main, Criterion}; - -use re_data_store::{ - DataStore, DataStoreConfig, GarbageCollectionOptions, GarbageCollectionTarget, LatestAtQuery, - RangeQuery, ResolvedTimeRange, TimeInt, -}; -use re_log_types::{ - build_frame_nr, example_components::MyIndex, DataCell, DataRow, DataTable, EntityPath, RowId, - TableId, TimePoint, TimeType, Timeline, -}; -use re_types::testing::{build_some_large_structs, LargeStruct}; -use re_types_core::{ComponentName, Loggable as _}; - -criterion_group!( - benches, - insert, - insert_same_time_point, - latest_at, - latest_at_missing, - range, - gc -); -criterion_main!(benches); - -// --- - -#[cfg(not(debug_assertions))] -const NUM_ROWS: i64 = 1_000; -#[cfg(not(debug_assertions))] -const NUM_INSTANCES: i64 = 1_000; - -// `cargo test` also runs the benchmark setup code, so make sure they run quickly: -#[cfg(debug_assertions)] -const NUM_ROWS: i64 = 1; 
-#[cfg(debug_assertions)] -const NUM_INSTANCES: i64 = 1; - -fn packed() -> &'static [bool] { - if std::env::var("CI").is_ok() { - &[false] - } else { - &[false, true] - } -} - -fn num_rows_per_bucket() -> &'static [u64] { - if std::env::var("CI").is_ok() { - &[] - } else { - &[0, 2, 32, 2048] - } -} - -// --- Benchmarks --- - -fn insert(c: &mut Criterion) { - for &packed in packed() { - let mut group = c.benchmark_group(format!( - "datastore/num_rows={NUM_ROWS}/num_instances={NUM_INSTANCES}/packed={packed}/insert" - )); - group.throughput(criterion::Throughput::Elements( - (NUM_INSTANCES * NUM_ROWS) as _, - )); - - let rows = build_rows_with_packed(packed); - - // Default config - group.bench_function("default", |b| { - b.iter(|| insert_rows(Default::default(), &rows)); - }); - - // Emulate more or less buckets - for &num_rows_per_bucket in num_rows_per_bucket() { - group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| { - b.iter(|| { - insert_rows( - DataStoreConfig { - indexed_bucket_num_rows: num_rows_per_bucket, - ..Default::default() - }, - &rows, - ) - }); - }); - } - } -} - -fn insert_same_time_point(c: &mut Criterion) { - // Benchmark a corner-case where all rows have the same time point, and arrive out-of-order. - // See https://github.com/rerun-io/rerun/issues/4415 - - // `cargo test` also runs the benchmark setup code, so make sure they run quickly: - #[cfg(debug_assertions)] - let num_rows_list = [100]; - #[cfg(not(debug_assertions))] - let num_rows_list = [1_000, 10_000, 50_000]; - - for num_rows in num_rows_list { - for shuffled in [false, true] { - let num_instances = 1; - let packed = false; - let mut group = c.benchmark_group(format!( - "datastore/num_rows={num_rows}/num_instances={num_instances}/insert_same_time_point/shuffled={shuffled}" - )); - group.sample_size(10); // it is so slow - group.throughput(criterion::Throughput::Elements(num_rows * num_instances)); - - let rows = build_rows_ex(num_rows as _, num_instances as _, shuffled, packed, |_| { - TimePoint::from([build_frame_nr(TimeInt::ZERO)]) - }); - - // Default config - group.bench_function("insert", |b| { - b.iter(|| { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig::default(), - ); - - let num_ingested_rows_per_sort = 10; - - for chunk in rows.chunks(num_ingested_rows_per_sort) { - for row in chunk { - store.insert_row(row).unwrap(); - } - - // This mimics the sorting required by a query. - // This benchmark emulating a streaming ingestion - // done concurrently with a bunch of queries. - // We ingest a bunch of rows, then query/sort them, then repeat. 
- store.sort_indices_if_needed(); - } - store - }); - }); - } - } -} - -fn latest_at(c: &mut Criterion) { - for &packed in packed() { - let mut group = c.benchmark_group(format!( - "datastore/num_rows={NUM_ROWS}/num_instances={NUM_INSTANCES}/packed={packed}/latest_at" - )); - group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _)); - - let rows = build_rows_with_packed(packed); - - // Default config - group.bench_function("default", |b| { - let store = insert_rows(Default::default(), &rows); - b.iter(|| { - let cells = latest_data_at(&store, LargeStruct::name(), &[LargeStruct::name()]); - let large_structs = cells[0] - .as_ref() - .unwrap() - .as_arrow_ref() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(NUM_INSTANCES as usize, large_structs.len()); - }); - }); - - // Emulate more or less buckets - for &num_rows_per_bucket in num_rows_per_bucket() { - let store = insert_rows( - DataStoreConfig { - indexed_bucket_num_rows: num_rows_per_bucket, - ..Default::default() - }, - &rows, - ); - group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| { - b.iter(|| { - let cells = latest_data_at(&store, LargeStruct::name(), &[LargeStruct::name()]); - let large_structs = cells[0] - .as_ref() - .unwrap() - .as_arrow_ref() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(NUM_INSTANCES as usize, large_structs.len()); - }); - }); - } - } -} - -fn latest_at_missing(c: &mut Criterion) { - for &packed in packed() { - let mut group = c.benchmark_group(format!( - "datastore/num_rows={NUM_ROWS}/num_instances={NUM_INSTANCES}/packed={packed}/latest_at_missing" - )); - group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _)); - - let rows = build_rows_with_packed(packed); - - // Default config - let store = insert_rows(Default::default(), &rows); - group.bench_function("primary/default", |b| { - b.iter(|| { - let results = latest_data_at( - &store, - "non_existing_component".into(), - &[LargeStruct::name()], - ); - assert!(results[0].is_none()); - }); - }); - group.bench_function("secondaries/default", |b| { - b.iter(|| { - let results = latest_data_at( - &store, - LargeStruct::name(), - &[ - "non_existing_component1".into(), - "non_existing_component2".into(), - "non_existing_component3".into(), - ], - ); - assert!(results[0].is_none()); - assert!(results[1].is_none()); - assert!(results[2].is_none()); - }); - }); - - // Emulate more or less buckets - for &num_rows_per_bucket in num_rows_per_bucket() { - let store = insert_rows( - DataStoreConfig { - indexed_bucket_num_rows: num_rows_per_bucket, - ..Default::default() - }, - &rows, - ); - group.bench_function(format!("primary/bucketsz={num_rows_per_bucket}"), |b| { - b.iter(|| { - let results = latest_data_at( - &store, - "non_existing_component".into(), - &[LargeStruct::name()], - ); - assert!(results[0].is_none()); - }); - }); - group.bench_function(format!("secondaries/bucketsz={num_rows_per_bucket}"), |b| { - b.iter(|| { - let results = latest_data_at( - &store, - LargeStruct::name(), - &[ - "non_existing_component1".into(), - "non_existing_component2".into(), - "non_existing_component3".into(), - ], - ); - assert!(results[0].is_none()); - assert!(results[1].is_none()); - assert!(results[2].is_none()); - }); - }); - } - } -} - -fn range(c: &mut Criterion) { - for &packed in packed() { - let mut group = c.benchmark_group(format!( - "datastore/num_rows={NUM_ROWS}/num_instances={NUM_INSTANCES}/packed={packed}/range" - )); - group.throughput(criterion::Throughput::Elements( - (NUM_INSTANCES * NUM_ROWS) as 
_, - )); - - let rows = build_rows_with_packed(packed); - - // Default config - group.bench_function("default", |b| { - b.iter(|| insert_rows(Default::default(), &rows)); - }); - - // Emulate more or less buckets - for &num_rows_per_bucket in num_rows_per_bucket() { - let store = insert_rows( - DataStoreConfig { - indexed_bucket_num_rows: num_rows_per_bucket, - ..Default::default() - }, - &rows, - ); - group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| { - b.iter(|| { - let rows = range_data(&store, [LargeStruct::name()]); - for (cur_time, (time, cells)) in rows.enumerate() { - assert_eq!(cur_time as i64, time.as_i64()); - - let large_structs = cells[0] - .as_ref() - .unwrap() - .as_arrow_ref() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(NUM_INSTANCES as usize, large_structs.len()); - } - }); - }); - } - } -} - -fn gc(c: &mut Criterion) { - let mut group = c.benchmark_group(format!( - "datastore/num_rows={NUM_ROWS}/num_instances={NUM_INSTANCES}/gc" - )); - group.throughput(criterion::Throughput::Elements( - (NUM_INSTANCES * NUM_ROWS) as _, - )); - - let rows = build_rows_with_packed(false); - - // Default config - group.bench_function("default", |b| { - let store = insert_rows(Default::default(), &rows); - b.iter(|| { - let mut store = store.clone(); - let (_, stats_diff) = store.gc(&GarbageCollectionOptions { - target: GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0), - protect_latest: 0, - purge_empty_tables: false, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }); - stats_diff - }); - }); - - // Emulate more or less bucket - for &num_rows_per_bucket in num_rows_per_bucket() { - group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| { - let store = insert_rows( - DataStoreConfig { - indexed_bucket_num_rows: num_rows_per_bucket, - ..Default::default() - }, - &rows, - ); - b.iter(|| { - let mut store = store.clone(); - let (_, stats_diff) = store.gc(&GarbageCollectionOptions { - target: GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0), - protect_latest: 0, - purge_empty_tables: false, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }); - stats_diff - }); - }); - } -} - -// --- Helpers --- - -fn build_rows_with_packed(packed: bool) -> Vec { - build_rows_ex( - NUM_ROWS as _, - NUM_INSTANCES as _, - false, - packed, - |row_idx| TimePoint::from([build_frame_nr(row_idx as i64)]), - ) -} - -fn build_rows_ex( - num_rows: usize, - num_instances: usize, - shuffled: bool, - packed: bool, - time_point: impl Fn(usize) -> TimePoint, -) -> Vec { - let rows = (0..num_rows).map(move |frame_idx| { - DataRow::from_cells2( - RowId::new(), - "large_structs", - time_point(frame_idx), - ( - MyIndex::from_iter(0..num_instances as _), - build_some_large_structs(num_instances), - ), - ) - .unwrap() - }); - - let mut table = if shuffled { - use rand::seq::SliceRandom as _; - let mut rows: Vec = rows.collect(); - rows.shuffle(&mut rand::thread_rng()); - DataTable::from_rows(TableId::ZERO, rows) - } else { - DataTable::from_rows(TableId::ZERO, rows) - }; - - // Do a serialization roundtrip to pack everything in contiguous memory. 
- if packed { - let (schema, columns) = table.serialize().unwrap(); - table = DataTable::deserialize(TableId::ZERO, &schema, &columns).unwrap(); - } - - // NOTE: Using unsized cells will crash in debug mode, and benchmarks are run for 1 iteration, - // in debug mode, by the standard test harness. - if cfg!(debug_assertions) { - table.compute_all_size_bytes(); - } - - table.to_rows().map(|r| r.unwrap()).collect() -} - -fn insert_rows(config: DataStoreConfig, rows: &[DataRow]) -> DataStore { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config, - ); - for row in rows { - store.insert_row(row).unwrap(); - } - store -} - -fn latest_data_at( - store: &DataStore, - primary: ComponentName, - secondaries: &[ComponentName; N], -) -> [Option; N] { - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - let timeline_query = LatestAtQuery::new(timeline_frame_nr, NUM_ROWS / 2); - let entity_path = EntityPath::from("large_structs"); - - store - .latest_at(&timeline_query, &entity_path, primary, secondaries) - .map_or_else(|| [(); N].map(|_| None), |(_, _, cells)| cells) -} - -fn range_data( - store: &DataStore, - components: [ComponentName; N], -) -> impl Iterator; N])> + '_ { - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - let query = RangeQuery::new( - timeline_frame_nr, - ResolvedTimeRange::new(TimeInt::ZERO, NUM_ROWS), - ); - let entity_path = EntityPath::from("large_structs"); - - store - .range(&query, &entity_path, components) - .map(move |(time, _, cells)| (time, cells)) -} diff --git a/crates/re_data_store/benches/gc.rs b/crates/re_data_store/benches/gc.rs deleted file mode 100644 index 45e8a00e5273..000000000000 --- a/crates/re_data_store/benches/gc.rs +++ /dev/null @@ -1,206 +0,0 @@ -// Allow unwrap() in benchmarks -#![allow(clippy::unwrap_used)] - -#[global_allocator] -static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; - -use itertools::Itertools; -use re_data_store::{ - DataStore, DataStoreConfig, GarbageCollectionOptions, GarbageCollectionTarget, -}; -use re_log_types::{ - build_frame_nr, build_log_time, DataRow, DataTable, EntityPath, RowId, TableId, Time, TimePoint, -}; -use re_types_core::{AsComponents, ComponentBatch}; - -criterion_group!(benches, plotting_dashboard); -criterion_main!(benches); - -// --- - -#[cfg(not(debug_assertions))] -mod constants { - pub const NUM_ENTITY_PATHS: usize = 20; - pub const NUM_ROWS_PER_ENTITY_PATH: usize = 10_000; -} - -// `cargo test` also runs the benchmark setup code, so make sure they run quickly: -#[cfg(debug_assertions)] -mod constants { - pub const NUM_ENTITY_PATHS: usize = 1; - pub const NUM_ROWS_PER_ENTITY_PATH: usize = 1; -} - -use constants::{NUM_ENTITY_PATHS, NUM_ROWS_PER_ENTITY_PATH}; - -fn gc_batching() -> &'static [bool] { - if std::env::var("CI").is_ok() { - &[false] - } else { - &[false, true] - } -} - -fn num_rows_per_bucket() -> &'static [u64] { - if std::env::var("CI").is_ok() { - &[] - } else { - &[256, 512, 1024, 2048] - } -} - -// --- Benchmarks --- - -fn plotting_dashboard(c: &mut Criterion) { - const DROP_AT_LEAST: f64 = 0.3; - - let mut group = c.benchmark_group(format!( - "datastore/num_entities={NUM_ENTITY_PATHS}/num_rows_per_entity={NUM_ROWS_PER_ENTITY_PATH}/plotting_dashboard/drop_at_least={DROP_AT_LEAST}" - )); - group.throughput(criterion::Throughput::Elements( - ((NUM_ENTITY_PATHS * NUM_ROWS_PER_ENTITY_PATH) as f64 * DROP_AT_LEAST) as _, 
- )); - group.sample_size(10); - - let gc_settings = GarbageCollectionOptions { - target: GarbageCollectionTarget::DropAtLeastFraction(DROP_AT_LEAST), - protect_latest: 1, - purge_empty_tables: false, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }; - - // NOTE: insert in multiple timelines to more closely match real world scenarios. - let mut timegen = |i| { - [ - build_log_time(Time::from_seconds_since_epoch(i as _)), - build_frame_nr(i as i64), - ] - .into() - }; - - let mut datagen = - |i| Box::new(re_types::archetypes::Scalar::new(i as f64)) as Box; - - // Default config - group.bench_function("default", |b| { - let store = build_store(Default::default(), false, &mut timegen, &mut datagen); - b.iter_batched( - || store.clone(), - |mut store| { - let (_, stats_diff) = store.gc(&gc_settings); - stats_diff - }, - BatchSize::LargeInput, - ); - }); - - // Emulate more or less bucket - for &num_rows_per_bucket in num_rows_per_bucket() { - for &gc_batching in gc_batching() { - group.bench_function( - if gc_batching { - format!("bucketsz={num_rows_per_bucket}/gc_batching=true") - } else { - format!("bucketsz={num_rows_per_bucket}") - }, - |b| { - let store = build_store( - DataStoreConfig { - indexed_bucket_num_rows: num_rows_per_bucket, - ..Default::default() - }, - false, - &mut timegen, - &mut datagen, - ); - let mut gc_settings = gc_settings.clone(); - gc_settings.enable_batching = gc_batching; - b.iter_batched( - || store.clone(), - |mut store| { - let (_, stats_diff) = store.gc(&gc_settings); - stats_diff - }, - BatchSize::LargeInput, - ); - }, - ); - } - } -} - -// --- Helpers --- - -fn build_store( - config: DataStoreConfig, - packed: bool, - timegen: &mut FT, - datagen: &mut FD, -) -> DataStore -where - FT: FnMut(usize) -> TimePoint, - FD: FnMut(usize) -> Box, -{ - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config, - ); - - let tables = (0..NUM_ENTITY_PATHS) - .map(|i| build_table(format!("entity_path_{i}").into(), packed, timegen, datagen)) - .collect_vec(); - let mut rows_per_table = tables.iter().map(|table| table.to_rows()).collect_vec(); - - // NOTE: interleave insertions between entities to more closely match real world scenarios. - for _ in 0..NUM_ROWS_PER_ENTITY_PATH { - #[allow(clippy::needless_range_loop)] // readability - for i in 0..NUM_ENTITY_PATHS { - let row = rows_per_table[i].next().unwrap(); - store.insert_row(&row.unwrap()).unwrap(); - } - } - - store -} - -fn build_table( - entity_path: EntityPath, - packed: bool, - timegen: &mut FT, - datagen: &mut FD, -) -> DataTable -where - FT: FnMut(usize) -> TimePoint, - FD: FnMut(usize) -> Box, -{ - let mut table = DataTable::from_rows( - TableId::ZERO, - (0..NUM_ROWS_PER_ENTITY_PATH).map(move |i| { - DataRow::from_component_batches( - RowId::new(), - timegen(i), - entity_path.clone(), - datagen(i) - .as_component_batches() - .iter() - .map(|batch| batch as &dyn ComponentBatch), - ) - .unwrap() - }), - ); - - // Do a serialization roundtrip to pack everything in contiguous memory. 
- if packed { - let (schema, columns) = table.serialize().unwrap(); - table = DataTable::deserialize(TableId::ZERO, &schema, &columns).unwrap(); - } - - table.compute_all_size_bytes(); - - table -} diff --git a/crates/re_data_store/benches/vectors.rs b/crates/re_data_store/benches/vectors.rs deleted file mode 100644 index f2edefc577c8..000000000000 --- a/crates/re_data_store/benches/vectors.rs +++ /dev/null @@ -1,348 +0,0 @@ -//! Keeping track of performance issues/regressions for common vector operations. - -#[global_allocator] -static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -use criterion::{criterion_group, Criterion}; - -use smallvec::SmallVec; -use tinyvec::TinyVec; - -// --- - -criterion_group!(benches, sort, split, swap, swap_opt); - -criterion::criterion_main!(benches); - -// --- - -#[cfg(not(debug_assertions))] -const NUM_INSTANCES: usize = 10_000; -#[cfg(not(debug_assertions))] -const SMALLVEC_SIZE: usize = 4; - -// `cargo test` also runs the benchmark setup code, so make sure they run quickly: -#[cfg(debug_assertions)] -const NUM_INSTANCES: usize = 1; -#[cfg(debug_assertions)] -const SMALLVEC_SIZE: usize = 1; - -// --- Benchmarks --- - -fn split(c: &mut Criterion) { - if std::env::var("CI").is_ok() { - return; - } - - let mut group = c.benchmark_group(format!("vector_ops/split_off/instances={NUM_INSTANCES}")); - group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _)); - - { - fn split_off( - data: &mut SmallVec<[T; N]>, - split_idx: usize, - ) -> SmallVec<[T; N]> { - if split_idx >= data.len() { - return SmallVec::default(); - } - - let second_half = SmallVec::from_slice(&data[split_idx..]); - data.truncate(split_idx); - second_half - } - - let data: SmallVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).collect(); - - group.bench_function(format!("smallvec/n={SMALLVEC_SIZE}/manual"), |b| { - b.iter(|| { - let mut data = data.clone(); - let second_half = split_off(&mut data, NUM_INSTANCES / 2); - assert_eq!(NUM_INSTANCES, data.len() + second_half.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, second_half[0]); - (data, second_half) - }); - }); - } - - { - let data: TinyVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).collect(); - - group.bench_function(format!("tinyvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data = data.clone(); - let second_half = data.split_off(NUM_INSTANCES / 2); - assert_eq!(NUM_INSTANCES, data.len() + second_half.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, second_half[0]); - (data, second_half) - }); - }); - } - - { - fn split_off( - data: &mut TinyVec<[T; N]>, - split_idx: usize, - ) -> TinyVec<[T; N]> { - if split_idx >= data.len() { - return TinyVec::default(); - } - - let second_half = TinyVec::from(&data[split_idx..]); - data.truncate(split_idx); - second_half - } - - let data: TinyVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).collect(); - - group.bench_function(format!("tinyvec/n={SMALLVEC_SIZE}/manual"), |b| { - b.iter(|| { - let mut data = data.clone(); - let second_half = split_off(&mut data, NUM_INSTANCES / 2); - assert_eq!(NUM_INSTANCES, data.len() + second_half.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, second_half[0]); - (data, second_half) - }); - }); - } - - { - let data: Vec = (0..NUM_INSTANCES as i64).collect(); - - group.bench_function("vec", |b| { - b.iter(|| { - let mut data = data.clone(); - let second_half = data.split_off(NUM_INSTANCES / 2); - assert_eq!(NUM_INSTANCES, data.len() + second_half.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, second_half[0]); - (data, second_half) 
- }); - }); - } - - { - fn split_off(data: &mut Vec, split_idx: usize) -> Vec { - if split_idx >= data.len() { - return Vec::default(); - } - - let second_half = Vec::from(&data[split_idx..]); - data.truncate(split_idx); - second_half - } - - let data: Vec = (0..NUM_INSTANCES as i64).collect(); - - group.bench_function("vec/manual", |b| { - b.iter(|| { - let mut data = data.clone(); - let second_half = split_off(&mut data, NUM_INSTANCES / 2); - assert_eq!(NUM_INSTANCES, data.len() + second_half.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, second_half[0]); - (data, second_half) - }); - }); - } -} - -fn sort(c: &mut Criterion) { - if std::env::var("CI").is_ok() { - return; - } - - let mut group = c.benchmark_group(format!("vector_ops/sort/instances={NUM_INSTANCES}")); - group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _)); - - { - let data: SmallVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).rev().collect(); - - group.bench_function(format!("smallvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data = data.clone(); - data.sort_unstable(); - assert_eq!(NUM_INSTANCES, data.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, data[NUM_INSTANCES / 2]); - data - }); - }); - } - - { - let data: TinyVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).rev().collect(); - - group.bench_function(format!("tinyvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data = data.clone(); - data.sort_unstable(); - assert_eq!(NUM_INSTANCES, data.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, data[NUM_INSTANCES / 2]); - data - }); - }); - } - - { - let data: Vec = (0..NUM_INSTANCES as i64).rev().collect(); - - group.bench_function("vec", |b| { - b.iter(|| { - let mut data = data.clone(); - data.sort_unstable(); - assert_eq!(NUM_INSTANCES, data.len()); - assert_eq!(NUM_INSTANCES as i64 / 2, data[NUM_INSTANCES / 2]); - data - }); - }); - } -} - -fn swap(c: &mut Criterion) { - if std::env::var("CI").is_ok() { - return; - } - - let mut group = c.benchmark_group(format!("vector_ops/swap/instances={NUM_INSTANCES}")); - group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _)); - - { - let data: SmallVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).collect(); - let swaps: SmallVec<[usize; SMALLVEC_SIZE]> = (0..NUM_INSTANCES).rev().collect(); - - group.bench_function(format!("smallvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data1 = data.clone(); - let data2 = data.clone(); - for &swap in &swaps { - data1[NUM_INSTANCES - swap - 1] = data2[swap]; - } - assert_eq!(NUM_INSTANCES, data1.len()); - assert_eq!(NUM_INSTANCES, data2.len()); - assert_eq!( - (NUM_INSTANCES as i64 / 2).max(1) - 1, - data1[NUM_INSTANCES / 2] - ); - (data1, data2) - }); - }); - } - - { - let data: TinyVec<[i64; SMALLVEC_SIZE]> = (0..NUM_INSTANCES as i64).collect(); - let swaps: TinyVec<[usize; SMALLVEC_SIZE]> = (0..NUM_INSTANCES).rev().collect(); - - group.bench_function(format!("tinyvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data1 = data.clone(); - let data2 = data.clone(); - for &swap in &swaps { - data1[NUM_INSTANCES - swap - 1] = data2[swap]; - } - assert_eq!(NUM_INSTANCES, data1.len()); - assert_eq!(NUM_INSTANCES, data2.len()); - assert_eq!( - (NUM_INSTANCES as i64 / 2).max(1) - 1, - data1[NUM_INSTANCES / 2] - ); - (data1, data2) - }); - }); - } - - { - let data: Vec = (0..NUM_INSTANCES as i64).collect(); - let swaps: Vec = (0..NUM_INSTANCES).rev().collect(); - - group.bench_function("vec", |b| { - b.iter(|| { - let mut data1 = data.clone(); - let data2 = data.clone(); - for 
&swap in &swaps { - data1[NUM_INSTANCES - swap - 1] = data2[swap]; - } - assert_eq!(NUM_INSTANCES, data1.len()); - assert_eq!(NUM_INSTANCES, data2.len()); - assert_eq!( - (NUM_INSTANCES as i64 / 2).max(1) - 1, - data1[NUM_INSTANCES / 2] - ); - (data1, data2) - }); - }); - } -} - -fn swap_opt(c: &mut Criterion) { - if std::env::var("CI").is_ok() { - return; - } - - let mut group = c.benchmark_group(format!("vector_ops/swap_opt/instances={NUM_INSTANCES}")); - group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _)); - - { - let data: SmallVec<[Option; SMALLVEC_SIZE]> = - (0..NUM_INSTANCES as i64).map(Some).collect(); - let swaps: SmallVec<[usize; SMALLVEC_SIZE]> = (0..NUM_INSTANCES).rev().collect(); - - group.bench_function(format!("smallvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data1 = data.clone(); - let mut data2 = data.clone(); - for &swap in &swaps { - data1[NUM_INSTANCES - swap - 1] = data2[swap].take(); - } - assert_eq!(NUM_INSTANCES, data1.len()); - assert_eq!(NUM_INSTANCES, data2.len()); - assert_eq!( - Some((NUM_INSTANCES as i64 / 2).max(1) - 1), - data1[NUM_INSTANCES / 2] - ); - (data1, data2) - }); - }); - } - - { - let data: TinyVec<[Option; SMALLVEC_SIZE]> = - (0..NUM_INSTANCES as i64).map(Some).collect(); - let swaps: TinyVec<[usize; SMALLVEC_SIZE]> = (0..NUM_INSTANCES).rev().collect(); - - group.bench_function(format!("tinyvec/n={SMALLVEC_SIZE}"), |b| { - b.iter(|| { - let mut data1 = data.clone(); - let mut data2 = data.clone(); - for &swap in &swaps { - data1[NUM_INSTANCES - swap - 1] = data2[swap].take(); - } - assert_eq!(NUM_INSTANCES, data1.len()); - assert_eq!(NUM_INSTANCES, data2.len()); - assert_eq!( - Some((NUM_INSTANCES as i64 / 2).max(1) - 1), - data1[NUM_INSTANCES / 2] - ); - (data1, data2) - }); - }); - } - - { - let data: Vec> = (0..NUM_INSTANCES as i64).map(Some).collect(); - let swaps: Vec = (0..NUM_INSTANCES).rev().collect(); - - group.bench_function("vec", |b| { - b.iter(|| { - let mut data1 = data.clone(); - let mut data2 = data.clone(); - for &swap in &swaps { - data1[NUM_INSTANCES - swap - 1] = data2[swap].take(); - } - assert_eq!(NUM_INSTANCES, data1.len()); - assert_eq!(NUM_INSTANCES, data2.len()); - assert_eq!( - Some((NUM_INSTANCES as i64 / 2).max(1) - 1), - data1[NUM_INSTANCES / 2] - ); - (data1, data2) - }); - }); - } -} diff --git a/crates/re_data_store/src/arrow_util.rs b/crates/re_data_store/src/arrow_util.rs deleted file mode 100644 index cf6cbed43ad3..000000000000 --- a/crates/re_data_store/src/arrow_util.rs +++ /dev/null @@ -1,28 +0,0 @@ -use arrow2::array::{Array, ListArray}; - -// --- - -pub trait ArrayExt: Array { - /// Returns the length of the child array at the given index. - /// - /// * Panics if `self` is not a `ListArray`. - /// * Panics if `child_nr` is out of bounds. - fn get_child_length(&self, child_nr: usize) -> usize; -} - -impl ArrayExt for dyn Array { - /// Return the length of the first child. - /// - /// ## Panics - /// - /// Panics if `Self` is not a `ListArray`, or if the array is empty (no children). - fn get_child_length(&self, child_nr: usize) -> usize { - self.as_any() - .downcast_ref::>() - .expect("not a ListArray") - .offsets() - .lengths() - .nth(child_nr) - .unwrap_or_else(|| panic!("no child at index {child_nr}")) - } -} diff --git a/crates/re_data_store/src/lib.rs b/crates/re_data_store/src/lib.rs deleted file mode 100644 index 6bf77bda69fa..000000000000 --- a/crates/re_data_store/src/lib.rs +++ /dev/null @@ -1,54 +0,0 @@ -//! 
The Rerun datastore, implemented on top of [Apache Arrow](https://arrow.apache.org/) -//! using the [`arrow2`] crate. -//! -//! This crate is an in-memory time series database for Rerun log data. -//! It is indexed by entity path, component, timeline, and time. -//! It supports out-of-order insertions, and fast `O(log(N))` queries. -//! -//! * See [`DataStore`] for an overview of the core data structures. -//! * See [`DataStore::latest_at`] and [`DataStore::range`] for the documentation of the public -//! read APIs. -//! * See [`DataStore::insert_row`] for the documentation of the public write APIs. -//! -//! ## Feature flags -#![doc = document_features::document_features!()] -//! - -mod arrow_util; -mod store; -mod store_arrow; -mod store_dump; -mod store_event; -mod store_format; -mod store_gc; -mod store_helpers; -mod store_read; -mod store_sanity; -mod store_stats; -mod store_subscriber; -mod store_write; - -#[doc(hidden)] -pub mod test_util; - -pub use self::store::{DataStore, DataStoreConfig, StoreGeneration}; -pub use self::store_event::{StoreDiff, StoreDiffKind, StoreEvent}; -pub use self::store_gc::{GarbageCollectionOptions, GarbageCollectionTarget}; -pub use self::store_read::{LatestAtQuery, RangeQuery}; -pub use self::store_stats::{DataStoreRowStats, DataStoreStats, EntityStats}; -pub use self::store_subscriber::{StoreSubscriber, StoreSubscriberHandle}; -pub use self::store_write::{WriteError, WriteResult}; - -pub(crate) use self::store::{ - IndexedBucket, IndexedBucketInner, IndexedTable, MetadataRegistry, StaticCell, StaticTable, -}; - -// Re-exports -#[doc(no_inline)] -pub use arrow2::io::ipc::read::{StreamReader, StreamState}; -#[doc(no_inline)] -pub use re_log_types::{ResolvedTimeRange, TimeInt, TimeType, Timeline}; // for politeness sake - -pub mod external { - pub use arrow2; -} diff --git a/crates/re_data_store/src/store.rs b/crates/re_data_store/src/store.rs deleted file mode 100644 index 4bc15a76736f..000000000000 --- a/crates/re_data_store/src/store.rs +++ /dev/null @@ -1,509 +0,0 @@ -use std::collections::{BTreeMap, VecDeque}; -use std::sync::atomic::AtomicU64; - -use arrow2::datatypes::DataType; -use nohash_hasher::IntMap; -use parking_lot::RwLock; -use re_log_types::{ - DataCell, DataCellColumn, EntityPath, EntityPathHash, ErasedTimeVec, ResolvedTimeRange, RowId, - RowIdVec, StoreId, TimeInt, TimePoint, Timeline, -}; -use re_types_core::{ComponentName, ComponentNameSet, SizeBytes}; - -// --- Data store --- - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct DataStoreConfig { - /// The maximum number of rows in an indexed bucket before triggering a split. - /// Does not apply to static data. - /// - /// ⚠ When configuring this threshold, do keep in mind that indexed tables are always scoped - /// to a specific timeline _and_ a specific entity. - /// - /// This effectively puts an upper bound on the number of rows that need to be sorted when an - /// indexed bucket gets out of order (e.g. because of new insertions or a GC pass). - /// This is a tradeoff: less rows means faster sorts at the cost of more metadata overhead. - /// In particular: - /// - Query performance scales inversely logarithmically to this number (i.e. it gets better - /// the higher this number gets). - /// - GC performance scales quadratically with this number (i.e. it gets better the lower this - /// number gets). - /// - /// See [`Self::DEFAULT`] for defaults. - pub indexed_bucket_num_rows: u64, - - /// If enabled, will store the ID of the write request alongside the inserted data. 
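The crate docs above describe the public write path (`insert_row`) and the `latest_at` read path. Below is a minimal round-trip sketch reconstructed from the benchmark helpers earlier in this diff; the `Scalar` archetype, the `frame_nr` timeline, and the fully-qualified component name are illustrative assumptions, and exact signatures may differ between alpha releases:

```rust
// Sketch only: mirrors the benchmark helpers in this diff, not an authoritative example.
use re_data_store::{DataStore, DataStoreConfig, LatestAtQuery};
use re_log_types::{build_frame_nr, DataRow, EntityPath, RowId, TimeInt, TimePoint, TimeType, Timeline};
use re_types_core::{AsComponents as _, ComponentBatch, ComponentName};

fn write_then_read() {
    let mut store = DataStore::new(
        re_log_types::StoreId::random(re_log_types::StoreKind::Recording),
        DataStoreConfig::default(),
    );

    // Write: one scalar per frame on the `frame_nr` timeline.
    let entity_path = EntityPath::from("plots/scalar");
    for frame in 0..10_i64 {
        let scalar = re_types::archetypes::Scalar::new(frame as f64);
        let row = DataRow::from_component_batches(
            RowId::new(),
            TimePoint::from([build_frame_nr(frame)]),
            entity_path.clone(),
            scalar
                .as_component_batches()
                .iter()
                .map(|batch| batch as &dyn ComponentBatch),
        )
        .unwrap();
        store.insert_row(&row).unwrap();
    }

    // Read: the newest cell at or before frame 5.
    let query = LatestAtQuery::new(
        Timeline::new("frame_nr", TimeType::Sequence),
        TimeInt::new_temporal(5),
    );
    let component: ComponentName = "rerun.components.Scalar".into(); // assumed component name
    if let Some((_data_time, _row_id, cells)) =
        store.latest_at(&query, &entity_path, component, &[component])
    {
        assert!(cells[0].is_some());
    }
}
```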
- /// - /// This can make inspecting the data within the store much easier, at the cost of an extra - /// `u64` value stored per row. - /// - /// Enabled by default in debug builds. - pub store_insert_ids: bool, -} - -impl Default for DataStoreConfig { - #[inline] - fn default() -> Self { - Self::DEFAULT - } -} - -impl DataStoreConfig { - pub const DEFAULT: Self = Self { - // NOTE: Empirical testing has shown that 512 is a good balance between sorting - // and binary search costs with the current GC implementation. - // - // Garbage collection costs are entirely driven by the number of buckets around, the size - // of the data itself has no impact. - indexed_bucket_num_rows: 512, - store_insert_ids: cfg!(debug_assertions), - }; -} - -// --- - -pub type InsertIdVec = VecDeque; - -/// Keeps track of datatype information for all component types that have been written to the store -/// so far. -/// -/// See also [`DataStore::lookup_datatype`]. -#[derive(Debug, Default, Clone)] -pub struct DataTypeRegistry(pub IntMap); - -impl std::ops::Deref for DataTypeRegistry { - type Target = IntMap; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::ops::DerefMut for DataTypeRegistry { - #[inline] - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -/// Keeps track of arbitrary per-row metadata. -#[derive(Debug, Clone)] -pub struct MetadataRegistry { - pub registry: BTreeMap, - - /// Cached heap size, because the registry gets very, very large. - pub heap_size_bytes: u64, -} - -impl Default for MetadataRegistry<(TimePoint, EntityPathHash)> { - fn default() -> Self { - let mut this = Self { - registry: Default::default(), - heap_size_bytes: 0, - }; - this.heap_size_bytes = this.heap_size_bytes(); // likely zero, just future proofing - this - } -} - -impl std::ops::Deref for MetadataRegistry { - type Target = BTreeMap; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.registry - } -} - -impl std::ops::DerefMut for MetadataRegistry { - #[inline] - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.registry - } -} - -// --- - -/// Incremented on each edit. -#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] -pub struct StoreGeneration { - insert_id: u64, - gc_id: u64, -} - -/// A complete data store: covers all timelines, all entities, everything. -/// -/// ## Debugging -/// -/// `DataStore` provides a very thorough `Display` implementation that makes it manageable to -/// know what's going on internally. -/// For even more information, you can set `RERUN_DATA_STORE_DISPLAY_SCHEMAS=1` in your -/// environment, which will result in additional schema information being printed out. -pub struct DataStore { - pub(crate) id: StoreId, - - /// The configuration of the data store (e.g. bucket sizes). - pub(crate) config: DataStoreConfig, - - /// Keeps track of datatype information for all component types that have been written to - /// the store so far. - /// - /// See also [`Self::lookup_datatype`]. - // - // TODO(#1809): replace this with a centralized Arrow registry. - pub(crate) type_registry: DataTypeRegistry, - - /// Keeps track of arbitrary per-row metadata. - pub(crate) metadata_registry: MetadataRegistry<(TimePoint, EntityPathHash)>, - - /// All temporal [`IndexedTable`]s for all entities on all timelines. - /// - /// See also [`Self::static_tables`]. - pub(crate) tables: BTreeMap<(EntityPathHash, Timeline), IndexedTable>, - - /// Static data. Never garbage collected. 
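For reference, the benchmarks in this diff override the bucket-split threshold in exactly this way; a sketch of a custom configuration (the value is illustrative, not a tuning recommendation):

```rust
use re_data_store::{DataStore, DataStoreConfig};
use re_log_types::{StoreId, StoreKind};

fn store_with_small_buckets() -> DataStore {
    // Smaller indexed buckets: cheaper per-bucket sorts and GC passes,
    // at the cost of more buckets (and thus more bookkeeping) overall.
    let config = DataStoreConfig {
        indexed_bucket_num_rows: 256,
        ..Default::default() // `store_insert_ids` stays at its default (enabled in debug builds)
    };
    DataStore::new(StoreId::random(StoreKind::Recording), config)
}
```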
- /// - /// Static data unconditionally shadows temporal data at query time. - /// - /// Existing temporal will not be removed. Events won't be fired. - /// - /// See also [`Self::tables`]. - pub(crate) static_tables: BTreeMap, - - /// Monotonically increasing ID for insertions. - pub(crate) insert_id: u64, - - /// Monotonically increasing ID for queries. - pub(crate) query_id: AtomicU64, - - /// Monotonically increasing ID for GCs. - pub(crate) gc_id: u64, - - /// Monotonically increasing ID for store events. - pub(crate) event_id: AtomicU64, -} - -impl Clone for DataStore { - fn clone(&self) -> Self { - Self { - id: self.id.clone(), - config: self.config.clone(), - type_registry: self.type_registry.clone(), - metadata_registry: self.metadata_registry.clone(), - tables: self.tables.clone(), - static_tables: self.static_tables.clone(), - insert_id: Default::default(), - query_id: Default::default(), - gc_id: Default::default(), - event_id: Default::default(), - } - } -} - -impl DataStore { - pub fn new(id: StoreId, config: DataStoreConfig) -> Self { - Self { - id, - config, - type_registry: Default::default(), - metadata_registry: Default::default(), - tables: Default::default(), - static_tables: Default::default(), - insert_id: 0, - query_id: AtomicU64::new(0), - gc_id: 0, - event_id: AtomicU64::new(0), - } - } - - #[inline] - pub fn id(&self) -> &StoreId { - &self.id - } - - /// The column name used for storing insert requests' IDs alongside the data when manipulating - /// dataframes. - /// - /// See [`DataStoreConfig::store_insert_ids`]. - pub fn insert_id_component_name() -> ComponentName { - "rerun.controls.InsertId".into() - } - - /// Return the current `StoreGeneration`. This can be used to determine whether the - /// database has been modified since the last time it was queried. - pub fn generation(&self) -> StoreGeneration { - StoreGeneration { - insert_id: self.insert_id, - gc_id: self.gc_id, - } - } - - /// See [`DataStoreConfig`] for more information about configuration. - pub fn config(&self) -> &DataStoreConfig { - &self.config - } - - /// Lookup the arrow [`DataType`] of a [`re_types_core::Component`] in the internal - /// `DataTypeRegistry`. - pub fn lookup_datatype(&self, component: &ComponentName) -> Option<&DataType> { - self.type_registry.get(component) - } - - /// The oldest time for which we have any data. - /// - /// Ignores static data. - /// - /// Useful to call after a gc. - pub fn oldest_time_per_timeline(&self) -> BTreeMap { - re_tracing::profile_function!(); - - let mut oldest_time_per_timeline = BTreeMap::default(); - - for index in self.tables.values() { - if let Some(bucket) = index.buckets.values().next() { - let entry = oldest_time_per_timeline - .entry(bucket.timeline) - .or_insert(TimeInt::MAX); - if let Some(&time) = bucket.inner.read().col_time.front() { - *entry = TimeInt::min(*entry, TimeInt::new_temporal(time)); - } - } - } - - oldest_time_per_timeline - } - - /// Returns a read-only iterator over the raw indexed tables. - /// - /// Do _not_ use this to try and assert the internal state of the datastore. - pub fn iter_indices( - &self, - ) -> impl ExactSizeIterator { - self.tables.iter().map(|((_, timeline), table)| { - ((table.entity_path.clone() /* shallow */, *timeline), table) - }) - } -} - -// --- Temporal --- - -/// An `IndexedTable` is an ever-growing, arbitrary large [`re_log_types::DataTable`] that is -/// optimized for time-based insertions and queries (which means a lot of bucketing). -/// -/// See also [`IndexedBucket`]. 
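Since `generation()` is the documented way to notice that the store changed between queries, here is a small sketch of the polling pattern it enables; the surrounding cache type is hypothetical:

```rust
use re_data_store::{DataStore, StoreGeneration};

/// Hypothetical helper: recompute derived state only when the store has changed.
#[derive(Default)]
struct DerivedState {
    last_seen: Option<StoreGeneration>,
}

impl DerivedState {
    fn update_if_needed(&mut self, store: &DataStore) {
        let current = store.generation(); // bumps on every insertion and GC pass
        if self.last_seen.as_ref() != Some(&current) {
            // …re-run queries and rebuild whatever is derived from the store here…
            self.last_seen = Some(current);
        }
    }
}
```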
-/// -/// Run the following command to display a visualization of the store's internal datastructures and -/// better understand how everything fits together: -/// ```text -/// cargo test -p re_data_store -- --nocapture datastore_internal_repr -/// ``` -#[derive(Debug, Clone)] -pub struct IndexedTable { - /// The timeline this table operates in, for debugging purposes. - pub timeline: Timeline, - - /// The entity this table is related to, for debugging purposes. - pub entity_path: EntityPath, - - /// The actual buckets, where the data is stored. - /// - /// The keys of this `BTreeMap` represent the lower bounds of the time-ranges covered by - /// their associated buckets, _as seen from an indexing rather than a data standpoint_! - /// - /// This means that e.g. for the initial bucket, this will always be `-∞`, as from an - /// indexing standpoint, all reads and writes with a time `t >= -∞` should go there, even - /// though the bucket doesn't actually contains data with a timestamp of `-∞`! - pub buckets: BTreeMap, - - /// Track all of the components that have been written to. - /// - /// Note that this set will never be purged and will continue to return components that may - /// have been set in the past even if all instances of that component have since been purged - /// to free up space. - pub all_components: ComponentNameSet, - - /// The number of rows stored in this table, across all of its buckets. - pub buckets_num_rows: u64, - - /// The size of both the control & component data stored in this table, across all of its - /// buckets, in bytes. - /// - /// This is a best-effort approximation, adequate for most purposes (stats, - /// triggering GCs, …). - pub buckets_size_bytes: u64, -} - -impl IndexedTable { - pub fn new(timeline: Timeline, entity_path: EntityPath) -> Self { - let bucket = IndexedBucket::new(timeline); - let buckets_size_bytes = bucket.total_size_bytes(); - Self { - timeline, - entity_path, - buckets: [(TimeInt::MIN, bucket)].into(), - all_components: Default::default(), - buckets_num_rows: 0, - buckets_size_bytes, - } - } - - /// Makes sure bucketing invariants are upheld, and takes necessary actions if not. - /// - /// Invariants are: - /// 1. There must always be at least one bucket alive. - /// 2. The first bucket must always have an _indexing time_ `-∞`. - pub(crate) fn uphold_indexing_invariants(&mut self) { - if self.buckets.is_empty() { - let Self { - timeline, - entity_path: _, - buckets, - all_components: _, // keep the history on purpose - buckets_num_rows, - buckets_size_bytes, - } = self; - - let bucket = IndexedBucket::new(*timeline); - let size_bytes = bucket.total_size_bytes(); - - *buckets = [(TimeInt::MIN, bucket)].into(); - *buckets_num_rows = 0; - *buckets_size_bytes = size_bytes; - } - // NOTE: Make sure the first bucket is responsible for `-∞`, which might or might not be - // the case now if we've been moving buckets around. - else if let Some((_, bucket)) = self.buckets.pop_first() { - self.buckets.insert(TimeInt::MIN, bucket); - } - } -} - -/// An `IndexedBucket` holds a chunk of rows from an [`IndexedTable`] -/// (see [`DataStoreConfig::indexed_bucket_num_rows`]). -#[derive(Debug)] -pub struct IndexedBucket { - /// The timeline the bucket's parent table operates in, for debugging purposes. - pub timeline: Timeline, - - // To simplify interior mutability. 
- pub inner: RwLock, -} - -impl Clone for IndexedBucket { - fn clone(&self) -> Self { - Self { - timeline: self.timeline, - inner: RwLock::new(self.inner.read().clone()), - } - } -} - -impl IndexedBucket { - pub(crate) fn new(timeline: Timeline) -> Self { - Self { - timeline, - inner: RwLock::new(IndexedBucketInner::default()), - } - } -} - -/// See [`IndexedBucket`]; this is a helper struct to simplify interior mutability. -#[derive(Debug, Clone)] -pub struct IndexedBucketInner { - /// Are the rows in this table chunk sorted? - /// - /// Querying an [`IndexedBucket`] will always trigger a sort if the rows within aren't already - /// sorted. - pub is_sorted: bool, - - /// The time range covered by the primary time column (see [`Self::col_time`]). - /// - /// For an empty bucket, this defaults to `[+∞,-∞]`. - pub time_range: ResolvedTimeRange, - - // The primary time column, which is what drives the ordering of every other column. - pub col_time: ErasedTimeVec, - - /// The entire column of insertion IDs, if enabled in [`DataStoreConfig`]. - /// - /// Keeps track of insertion order from the point-of-view of the [`DataStore`]. - pub col_insert_id: InsertIdVec, - - /// The entire column of `RowId`s. - /// - /// Keeps track of the unique identifier for each row that was generated by the clients. - pub col_row_id: RowIdVec, - - /// Keeps track of the latest/newest [`RowId`] present in this bucket. - /// - /// Useful to batch GC buckets. - /// - /// `RowId::ZERO` for empty buckets. - pub max_row_id: RowId, - - /// All the rows for all the component columns. - /// - /// The cells are optional since not all rows will have data for every single component - /// (i.e. the table is sparse). - pub columns: IntMap, - - /// The size of both the control & component data stored in this bucket, heap and stack - /// included, in bytes. - /// - /// This is a best-effort approximation, adequate for most purposes (stats, - /// triggering GCs, …). - /// - /// We cache this because there can be many, many buckets. - pub size_bytes: u64, -} - -impl Default for IndexedBucketInner { - fn default() -> Self { - let mut this = Self { - is_sorted: true, - time_range: ResolvedTimeRange::EMPTY, - col_time: Default::default(), - col_insert_id: Default::default(), - col_row_id: Default::default(), - max_row_id: RowId::ZERO, - columns: Default::default(), - size_bytes: 0, // NOTE: computed below - }; - this.compute_size_bytes(); - this - } -} - -// --- Static --- - -/// Keeps track of static component data per entity. -#[derive(Clone)] -pub struct StaticTable { - /// The entity this table is related to, for debugging purposes. - pub entity_path: EntityPath, - - /// Keeps track of one and only one [`StaticCell`] per component. - /// - /// Last-write-wins semantics apply, where ordering is defined by `RowId`. - pub cells: BTreeMap, -} - -impl StaticTable { - #[inline] - pub fn new(entity_path: EntityPath) -> Self { - Self { - entity_path, - cells: Default::default(), - } - } -} - -#[derive(Clone)] -pub struct StaticCell { - /// None if [`DataStoreConfig::store_insert_ids`] is `false`. 
- pub insert_id: Option, - - pub row_id: RowId, - pub cell: DataCell, -} diff --git a/crates/re_data_store/src/store_arrow.rs b/crates/re_data_store/src/store_arrow.rs deleted file mode 100644 index eb398f1155f7..000000000000 --- a/crates/re_data_store/src/store_arrow.rs +++ /dev/null @@ -1,198 +0,0 @@ -use std::collections::{BTreeMap, VecDeque}; - -use arrow2::{array::Array, chunk::Chunk, datatypes::Schema}; -use nohash_hasher::IntMap; -use re_log_types::{DataCellColumn, DataTable, DataTableResult, RowId, Timeline}; -use re_types_core::ComponentName; - -use crate::{ - store::{IndexedBucket, IndexedBucketInner}, - StaticTable, -}; - -// --- - -impl IndexedBucket { - /// Serializes the entire bucket into an arrow payload and schema. - /// - /// Column order: - /// - `insert_id` - /// - `row_id` - /// - `time` - /// - rest of component columns in ascending lexical order - pub fn serialize(&self) -> DataTableResult<(Schema, Chunk>)> { - re_tracing::profile_function!(); - - let Self { timeline, inner } = self; - - let IndexedBucketInner { - is_sorted: _, - time_range: _, - col_time, - col_insert_id, - col_row_id, - max_row_id: _, - columns, - size_bytes: _, - } = &*inner.read(); - - serialize( - Some((*timeline, col_time)), - col_insert_id, - col_row_id, - columns, - ) - } -} - -impl StaticTable { - /// Serializes the entire table into an arrow payload and schema. - /// - /// Column order: - /// - `insert_id` - /// - `row_id` - /// - `time` - /// - rest of component columns in ascending lexical order - pub fn serialize(&self) -> DataTableResult<(Schema, Chunk>)> { - re_tracing::profile_function!(); - - let mut cells_per_row_id: BTreeMap> = Default::default(); - for static_cell in self.cells.values() { - cells_per_row_id - .entry(static_cell.row_id) - .or_default() - .push(static_cell.clone()); - } - - let col_insert_id = cells_per_row_id - .values() - .filter_map(|cells| cells.first().and_then(|cell| cell.insert_id)) - .collect(); - - let col_row_id = cells_per_row_id.keys().copied().collect(); - - let component_names: Vec<_> = self - .cells - .values() - .map(|cell| cell.cell.component_name()) - .collect(); - - let mut columns = IntMap::::default(); - for (_row_id, cells) in cells_per_row_id { - let cells: BTreeMap<_, _> = cells - .iter() - .map(|cell| (cell.cell.component_name(), &cell.cell)) - .collect(); - for component_name in &component_names { - columns - .entry(*component_name) - .or_default() - .push_back(cells.get(component_name).copied().cloned()); - } - } - - serialize(None, &col_insert_id, &col_row_id, &columns) - } -} - -// --- - -fn serialize( - col_time: Option<(Timeline, &VecDeque)>, - col_insert_id: &VecDeque, - col_row_id: &VecDeque, - table: &IntMap, -) -> DataTableResult<(Schema, Chunk>)> { - re_tracing::profile_function!(); - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - // NOTE: Empty table / bucket. 
- if col_row_id.is_empty() { - return Ok((schema, Chunk::new(columns))); - } - - { - let (control_schema, control_columns) = - serialize_control_columns(col_time, col_insert_id, col_row_id)?; - schema.fields.extend(control_schema.fields); - schema.metadata.extend(control_schema.metadata); - columns.extend(control_columns); - } - - { - let (data_schema, data_columns) = serialize_data_columns(table)?; - schema.fields.extend(data_schema.fields); - schema.metadata.extend(data_schema.metadata); - columns.extend(data_columns); - } - - Ok((schema, Chunk::new(columns))) -} - -fn serialize_control_columns( - col_time: Option<(Timeline, &VecDeque)>, - col_insert_id: &VecDeque, - col_row_id: &VecDeque, -) -> DataTableResult<(Schema, Vec>)> { - re_tracing::profile_function!(); - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - // NOTE: ordering is taken into account! - // - insert_id - // - row_id - // - time - - // NOTE: Optional column, so make sure it's actually there: - if !col_insert_id.is_empty() { - let (insert_id_field, insert_id_column) = DataTable::serialize_primitive_column( - &crate::DataStore::insert_id_component_name(), - col_insert_id, - None, - ); - schema.fields.push(insert_id_field); - columns.push(insert_id_column); - } - - let (row_id_field, row_id_column) = DataTable::serialize_control_column(col_row_id)?; - schema.fields.push(row_id_field); - columns.push(row_id_column); - - if let Some((timeline, col_time)) = col_time { - let (time_field, time_column) = DataTable::serialize_primitive_column( - timeline.name(), - col_time, - timeline.datatype().into(), - ); - schema.fields.push(time_field); - columns.push(time_column); - } - - Ok((schema, columns)) -} - -fn serialize_data_columns( - table: &IntMap, -) -> DataTableResult<(Schema, Vec>)> { - re_tracing::profile_function!(); - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - // NOTE: ordering is taken into account! - let table: BTreeMap<_, _> = table.iter().collect(); - - for (component, column) in table { - // NOTE: Don't serialize columns with only null values. - if column.iter().any(Option::is_some) { - let (field, column) = DataTable::serialize_data_column(component, column)?; - schema.fields.push(field); - columns.push(column); - } - } - - Ok((schema, columns)) -} diff --git a/crates/re_data_store/src/store_dump.rs b/crates/re_data_store/src/store_dump.rs deleted file mode 100644 index b6a7bdd6ac2e..000000000000 --- a/crates/re_data_store/src/store_dump.rs +++ /dev/null @@ -1,228 +0,0 @@ -use std::collections::BTreeMap; - -use arrow2::Either; -use re_log_types::{ - DataCellColumn, DataRow, DataTable, ErasedTimeVec, ResolvedTimeRange, RowId, RowIdVec, TableId, - TimeInt, TimePoint, Timeline, -}; - -use crate::{store::IndexedBucketInner, DataStore, IndexedBucket}; - -// --- - -impl DataStore { - /// Serializes the entire datastore into one big sorted list of [`DataRow`]. - /// - /// Individual [`re_log_types::DataRow`]s that were split apart due to bucketing are merged back together. - /// - /// Beware: this is extremely costly, don't use this in hot paths. 
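A sketch of a typical non-hot-path use of the dump API, using `to_data_tables` as defined further down in this file; the row counting itself is only illustrative:

```rust
use re_data_store::DataStore;

/// Illustrative only: dump every bucket as its own `DataTable` and count rows,
/// e.g. to sanity-check an export. As noted in the docs here, this is costly;
/// keep it out of hot paths.
fn count_dumped_rows(store: &DataStore) -> usize {
    store
        .to_data_tables(None) // no (timeline, time-range) filter: dump everything
        .map(|table| table.col_row_id.len())
        .sum()
}
```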
- pub fn to_rows(&self) -> re_log_types::DataReadResult> { - re_tracing::profile_function!(); - - let mut rows = ahash::HashMap::::default(); - for table in self.to_data_tables(None) { - for row in table.to_rows().collect::>() { - let row = row?; - match rows.entry(row.row_id()) { - std::collections::hash_map::Entry::Occupied(mut entry) => { - for (timeline, time) in row.timepoint() { - entry.get_mut().timepoint.insert(*timeline, *time); - } - } - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(row); - } - } - } - } - - let mut rows = rows.into_values().collect::>(); - { - re_tracing::profile_scope!("sort_rows"); - rows.sort_by_key(|row| (row.timepoint.clone(), row.row_id)); - } - - Ok(rows) - } - - /// Serializes the entire datastore into one big sorted [`DataTable`]. - /// - /// Individual [`re_log_types::DataRow`]s that were split apart due to bucketing are merged back together. - /// - /// Beware: this is extremely costly, don't use this in hot paths. - pub fn to_data_table(&self) -> re_log_types::DataReadResult { - re_tracing::profile_function!(); - - let rows = self.to_rows()?; - - Ok(re_log_types::DataTable::from_rows( - re_log_types::TableId::new(), - rows, - )) - } - - /// Serializes the entire datastore into an iterator of [`DataTable`]s, where each table - /// corresponds 1-to-1 to an internal bucket. - pub fn to_data_tables( - &self, - time_filter: Option<(Timeline, ResolvedTimeRange)>, - ) -> impl Iterator + '_ { - let static_tables = self.dump_static_tables(); - let temporal = if let Some(time_filter) = time_filter { - Either::Left(self.dump_temporal_tables_filtered(time_filter)) - } else { - Either::Right(self.dump_temporal_tables()) - }; - - static_tables.chain(temporal) - } - - fn dump_static_tables(&self) -> impl Iterator + '_ { - self.static_tables.values().map(|static_table| { - let mut cells_per_row_id: BTreeMap> = Default::default(); - for static_cell in static_table.cells.values() { - cells_per_row_id - .entry(static_cell.row_id) - .or_default() - .push(static_cell.clone()); - } - - let rows = cells_per_row_id - .into_iter() - .filter_map(|(row_id, static_cells)| { - DataRow::from_cells( - row_id, - TimePoint::default(), - static_table.entity_path.clone(), - static_cells.into_iter().map(|static_cell| static_cell.cell), - ) - .ok() - }); - - DataTable::from_rows(TableId::ZERO, rows) - }) - } - - fn dump_temporal_tables(&self) -> impl Iterator + '_ { - self.tables.values().flat_map(|table| { - re_tracing::profile_scope!("temporal_table"); - - table.buckets.values().map(move |bucket| { - re_tracing::profile_scope!("temporal_bucket"); - - bucket.sort_indices_if_needed(); - - let IndexedBucket { timeline, inner } = bucket; - - let IndexedBucketInner { - is_sorted: _, - time_range: _, - col_time, - col_insert_id: _, - col_row_id, - max_row_id: _, - columns, - size_bytes: _, - } = &*inner.read(); - - DataTable { - table_id: TableId::new(), - col_row_id: col_row_id.clone(), - col_timelines: [(*timeline, col_time.iter().copied().map(Some).collect())] - .into(), - col_entity_path: std::iter::repeat_with(|| table.entity_path.clone()) - .take(col_row_id.len()) - .collect(), - columns: columns.clone().into_iter().collect(), // shallow - } - }) - }) - } - - fn dump_temporal_tables_filtered( - &self, - (timeline_filter, time_filter): (Timeline, ResolvedTimeRange), - ) -> impl Iterator + '_ { - self.tables - .values() - .filter_map(move |table| { - re_tracing::profile_scope!("temporal_table_filtered"); - - if table.timeline != timeline_filter { - return None; - 
} - - Some(table.buckets.values().filter_map(move |bucket| { - re_tracing::profile_scope!("temporal_bucket_filtered"); - - bucket.sort_indices_if_needed(); - - let IndexedBucket { timeline, inner } = bucket; - - let IndexedBucketInner { - is_sorted: _, - time_range, - col_time, - col_insert_id: _, - col_row_id, - max_row_id: _, - columns, - size_bytes: _, - } = &*inner.read(); - - if !time_range.intersects(time_filter) { - return None; - } - - let col_row_id: RowIdVec = - filter_column(col_time, col_row_id.iter(), time_filter).collect(); - - // NOTE: Shouldn't ever happen due to check above, but better safe than - // sorry… - debug_assert!(!col_row_id.is_empty()); - if col_row_id.is_empty() { - return None; - } - - let col_timelines = [( - *timeline, - filter_column(col_time, col_time.iter(), time_filter) - .map(Some) - .collect(), - )] - .into(); - - let col_entity_path = std::iter::repeat_with(|| table.entity_path.clone()) - .take(col_row_id.len()) - .collect(); - - let mut columns2 = BTreeMap::default(); - for (component, column) in columns { - let column = filter_column(col_time, column.iter(), time_filter).collect(); - columns2.insert(*component, DataCellColumn(column)); - } - - Some(DataTable { - table_id: TableId::new(), - col_row_id, - col_timelines, - col_entity_path, - columns: columns2, - }) - })) - }) - .flatten() - } -} - -fn filter_column<'a, T: 'a + Clone>( - col_time: &'a ErasedTimeVec, - column: impl Iterator + 'a, - time_filter: ResolvedTimeRange, -) -> impl Iterator + 'a { - col_time - .iter() - .zip(column) - .filter(move |(&time, _)| time_filter.contains(TimeInt::new_temporal(time))) - .map(|(_, v)| v.clone()) -} diff --git a/crates/re_data_store/src/store_format.rs b/crates/re_data_store/src/store_format.rs deleted file mode 100644 index defcaca7199f..000000000000 --- a/crates/re_data_store/src/store_format.rs +++ /dev/null @@ -1,184 +0,0 @@ -use arrow2::datatypes::Metadata; -use re_format::{format_bytes, format_uint}; -use re_log_types::TimeInt; -use re_types_core::SizeBytes as _; - -use crate::{DataStore, IndexedBucket, IndexedTable, StaticTable}; - -// --- Data store --- - -impl std::fmt::Display for DataStore { - #[allow(clippy::string_add)] - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let Self { - id, - config, - type_registry: _, - metadata_registry: _, - tables, - static_tables, - insert_id: _, - query_id: _, - gc_id: _, - event_id: _, - } = self; - - f.write_str("DataStore {\n")?; - - f.write_str(&indent::indent_all_by(4, format!("id: {id}\n")))?; - f.write_str(&indent::indent_all_by(4, format!("config: {config:?}\n")))?; - - { - f.write_str(&indent::indent_all_by( - 4, - format!( - "{} static tables, for a total of {}\n", - static_tables.len(), - format_bytes(self.static_size_bytes() as _), - ), - ))?; - f.write_str(&indent::indent_all_by(4, "static_tables: [\n"))?; - for static_table in static_tables.values() { - f.write_str(&indent::indent_all_by(8, "StaticTable {\n"))?; - f.write_str(&indent::indent_all_by(12, static_table.to_string() + "\n"))?; - f.write_str(&indent::indent_all_by(8, "}\n"))?; - } - f.write_str(&indent::indent_all_by(4, "]\n"))?; - } - - { - f.write_str(&indent::indent_all_by( - 4, - format!( - "{} indexed tables, for a total of {} across {} total rows\n", - tables.len(), - format_bytes(self.temporal_size_bytes() as _), - format_uint(self.num_temporal_rows()) - ), - ))?; - f.write_str(&indent::indent_all_by(4, "tables: [\n"))?; - for table in tables.values() { - f.write_str(&indent::indent_all_by(8, 
"IndexedTable {\n"))?; - f.write_str(&indent::indent_all_by(12, table.to_string() + "\n"))?; - f.write_str(&indent::indent_all_by(8, "}\n"))?; - } - f.write_str(&indent::indent_all_by(4, "]\n"))?; - } - - f.write_str("}")?; - - Ok(()) - } -} - -// --- Temporal --- - -impl std::fmt::Display for IndexedTable { - #[allow(clippy::string_add)] - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let Self { - timeline, - entity_path, - buckets, - all_components: _, - buckets_num_rows: _, - buckets_size_bytes: _, - } = self; - - f.write_fmt(format_args!("timeline: {}\n", timeline.name()))?; - f.write_fmt(format_args!("entity: {entity_path}\n"))?; - - f.write_fmt(format_args!( - "size: {} buckets for a total of {} across {} total rows\n", - self.buckets.len(), - format_bytes(self.total_size_bytes() as _), - format_uint(self.num_rows()), - ))?; - f.write_str("buckets: [\n")?; - for (time, bucket) in buckets { - f.write_str(&indent::indent_all_by(4, "IndexedBucket {\n"))?; - f.write_str(&indent::indent_all_by( - 8, - format!( - "index time bound: >= {}\n", - timeline.typ().format_utc(*time) - ), - ))?; - f.write_str(&indent::indent_all_by(8, bucket.to_string()))?; - f.write_str(&indent::indent_all_by(4, "}\n"))?; - } - f.write_str("]")?; - - Ok(()) - } -} - -impl std::fmt::Display for IndexedBucket { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_fmt(format_args!( - "size: {} across {} rows\n", - format_bytes(self.total_size_bytes() as _), - format_uint(self.num_rows()), - ))?; - - let time_range = { - let time_range = &self.inner.read().time_range; - if time_range.min() != TimeInt::MAX && time_range.max() != TimeInt::MIN { - format!( - " - {}: {}", - self.timeline.name(), - self.timeline.format_time_range_utc(time_range) - ) - } else { - "time range: N/A\n".to_owned() - } - }; - f.write_fmt(format_args!("{time_range}\n"))?; - - let (schema, columns) = self.serialize().map_err(|err| { - re_log::error_once!("couldn't display indexed bucket: {err}"); - std::fmt::Error - })?; - re_format_arrow::format_dataframe( - Metadata::default(), - &schema.fields, - columns.columns().iter().map(|array| &**array), - ) - .fmt(f)?; - - writeln!(f) - } -} - -// --- Static --- - -impl std::fmt::Display for StaticTable { - #[allow(clippy::string_add)] - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_fmt(format_args!("entity: {}\n", self.entity_path))?; - - f.write_fmt(format_args!( - "size: {} across {} cells\n", - format_bytes( - self.cells - .values() - .map(|cell| cell.cell.total_size_bytes()) - .sum::() as _ - ), - format_uint(self.cells.len()), - ))?; - - let (schema, columns) = self.serialize().map_err(|err| { - re_log::error_once!("couldn't display static table: {err}"); - std::fmt::Error - })?; - re_format_arrow::format_dataframe( - Metadata::default(), - &schema.fields, - columns.columns().iter().map(|array| &**array), - ) - .fmt(f)?; - - writeln!(f) - } -} diff --git a/crates/re_data_store/src/store_gc.rs b/crates/re_data_store/src/store_gc.rs deleted file mode 100644 index b907181a8759..000000000000 --- a/crates/re_data_store/src/store_gc.rs +++ /dev/null @@ -1,775 +0,0 @@ -use std::{collections::BTreeMap, time::Duration}; - -use ahash::{HashMap, HashSet}; -use web_time::Instant; - -use re_log_types::{ - DataCell, EntityPath, EntityPathHash, ResolvedTimeRange, RowId, TimeInt, TimePoint, Timeline, - VecDequeRemovalExt as _, -}; -use re_types_core::{ComponentName, SizeBytes as _}; - -use crate::{ - store::{IndexedBucketInner, 
IndexedTable}, - DataStore, DataStoreStats, StoreDiff, StoreDiffKind, StoreEvent, -}; - -// --- - -#[derive(Debug, Clone, Copy)] -pub enum GarbageCollectionTarget { - /// Try to drop _at least_ the given fraction. - /// - /// The fraction must be a float in the range [0.0 : 1.0]. - DropAtLeastFraction(f64), - - /// GC Everything that isn't protected - Everything, -} - -#[derive(Debug, Clone)] -pub struct GarbageCollectionOptions { - /// What target threshold should the GC try to meet. - pub target: GarbageCollectionTarget, - - /// How long the garbage collection in allowed to run for. - /// - /// Trades off latency for throughput: - /// - A smaller `time_budget` will clear less data in a shorter amount of time, allowing for a - /// more responsive UI at the cost of more GC overhead and more frequent runs. - /// - A larger `time_budget` will clear more data in a longer amount of time, increasing the - /// chance of UI freeze frames but decreasing GC overhead and running less often. - /// - /// The default is an unbounded time budget (i.e. throughput only). - pub time_budget: Duration, - - /// How many component revisions to preserve on each timeline. - pub protect_latest: usize, - - /// Whether to purge tables that no longer contain any data - pub purge_empty_tables: bool, - - /// Components which should not be protected from GC when using `protect_latest` - pub dont_protect_components: HashSet, - - /// Timelines which should not be protected from GC when using `protect_latest` - pub dont_protect_timelines: HashSet, - - /// Whether to enable batched bucket drops. - /// - /// Disabled by default as it is currently slower in most cases (somehow). - pub enable_batching: bool, -} - -impl GarbageCollectionOptions { - pub fn gc_everything() -> Self { - Self { - target: GarbageCollectionTarget::Everything, - time_budget: std::time::Duration::MAX, - protect_latest: 0, - purge_empty_tables: true, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - } - } -} - -impl std::fmt::Display for GarbageCollectionTarget { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::DropAtLeastFraction(p) => { - write!(f, "DropAtLeast({:.3}%)", *p * 100.0) - } - Self::Everything => write!(f, "Everything"), - } - } -} - -impl DataStore { - /// Triggers a garbage collection according to the desired `target`. - /// - /// Garbage collection's performance is bounded by the number of buckets in each table (for - /// each `RowId`, we have to find the corresponding bucket, which is roughly `O(log(n))`) as - /// well as the number of rows in each of those buckets (for each `RowId`, we have to sort the - /// corresponding bucket (roughly `O(n*log(n))`) and then find the corresponding row (roughly - /// `O(log(n))`. - /// The size of the data itself has no impact on performance. - /// - /// Returns the list of `RowId`s that were purged from the store. - /// - /// ## Semantics - /// - /// Garbage collection works on a row-level basis and is driven by [`RowId`] order, - /// i.e. the order defined by the clients' wall-clocks, allowing it to drop data across - /// the different timelines in a fair, deterministic manner. - /// Similarly, out-of-order data is supported out of the box. - /// - /// The garbage collector doesn't deallocate data in and of itself: all it does is drop the - /// store's internal references to that data (the `DataCell`s), which will be deallocated once - /// their reference count reaches 0. 
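Putting the options above together, a sketch of a bounded GC pass; the fraction, protection level, and time budget are illustrative, and the shape mirrors the benchmark setups earlier in this diff:

```rust
use re_data_store::{DataStore, GarbageCollectionOptions, GarbageCollectionTarget};

fn drop_roughly_a_third(store: &mut DataStore) {
    let options = GarbageCollectionOptions {
        // Try to free at least 30% of the store, oldest `RowId`s first.
        target: GarbageCollectionTarget::DropAtLeastFraction(0.3),
        // Keep the newest value of every component on every timeline, so a
        // latest-at query at max-int is unaffected (see the limitations below).
        protect_latest: 1,
        purge_empty_tables: false,
        dont_protect_components: Default::default(),
        dont_protect_timelines: Default::default(),
        enable_batching: false,
        // Bound the pass so the caller (e.g. a UI frame) stays responsive.
        time_budget: std::time::Duration::from_millis(3),
    };

    let (deletion_events, stats_diff) = store.gc(&options);
    // `deletion_events` describes every dropped row; `stats_diff` is the
    // rows/bytes delta that was actually reclaimed.
    let _ = (deletion_events, stats_diff);
}
```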
- /// - /// ## Limitations - /// - /// The garbage collector has limited support for latest-at semantics. The configuration option: - /// [`GarbageCollectionOptions::protect_latest`] will protect the N latest values of each - /// component on each timeline. The only practical guarantee this gives is that a latest-at query - /// with a value of max-int will be unchanged. However, latest-at queries from other arbitrary - /// points in time may provide different results pre- and post- GC. - pub fn gc(&mut self, options: &GarbageCollectionOptions) -> (Vec, DataStoreStats) { - re_tracing::profile_function!(); - - self.gc_id += 1; - - let stats_before = DataStoreStats::from_store(self); - - let (initial_num_rows, initial_num_bytes) = stats_before.total_rows_and_bytes(); - - let protected_rows = self.find_all_protected_rows( - options.protect_latest, - &options.dont_protect_components, - &options.dont_protect_timelines, - ); - - let mut diffs = match options.target { - GarbageCollectionTarget::DropAtLeastFraction(p) => { - assert!((0.0..=1.0).contains(&p)); - - let num_bytes_to_drop = initial_num_bytes * p; - let target_num_bytes = initial_num_bytes - num_bytes_to_drop; - - re_log::trace!( - kind = "gc", - id = self.gc_id, - %options.target, - initial_num_rows = re_format::format_uint(initial_num_rows), - initial_num_bytes = re_format::format_bytes(initial_num_bytes), - target_num_bytes = re_format::format_bytes(target_num_bytes), - drop_at_least_num_bytes = re_format::format_bytes(num_bytes_to_drop), - "starting GC" - ); - - self.gc_drop_at_least_num_bytes(options, num_bytes_to_drop, &protected_rows) - } - GarbageCollectionTarget::Everything => { - re_log::trace!( - kind = "gc", - id = self.gc_id, - %options.target, - initial_num_rows = re_format::format_uint(initial_num_rows), - initial_num_bytes = re_format::format_bytes(initial_num_bytes), - "starting GC" - ); - - self.gc_drop_at_least_num_bytes(options, f64::INFINITY, &protected_rows) - } - }; - - if options.purge_empty_tables { - diffs.extend(self.purge_empty_tables()); - } - - #[cfg(debug_assertions)] - #[allow(clippy::unwrap_used)] - self.sanity_check().unwrap(); - - // NOTE: only temporal data and row metadata get purged! - let stats_after = DataStoreStats::from_store(self); - let (new_num_rows, new_num_bytes) = stats_after.total_rows_and_bytes(); - - re_log::trace!( - kind = "gc", - id = self.gc_id, - %options.target, - initial_num_rows = re_format::format_uint(initial_num_rows), - initial_num_bytes = re_format::format_bytes(initial_num_bytes), - new_num_rows = re_format::format_uint(new_num_rows), - new_num_bytes = re_format::format_bytes(new_num_bytes), - "GC done" - ); - - let stats_diff = stats_before - stats_after; - - let events: Vec<_> = diffs - .into_iter() - .map(|diff| StoreEvent { - store_id: self.id.clone(), - store_generation: self.generation(), - event_id: self - .event_id - .fetch_add(1, std::sync::atomic::Ordering::Relaxed), - diff, - }) - .collect(); - - { - if cfg!(debug_assertions) { - let any_event_other_than_deletion = - events.iter().any(|e| e.kind != StoreDiffKind::Deletion); - assert!(!any_event_other_than_deletion); - } - - Self::on_events(&events); - } - - (events, stats_diff) - } - - /// Tries to drop _at least_ `num_bytes_to_drop` bytes of data from the store. 
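// --- Illustrative aside, not part of the original `re_data_store` sources ---
// A minimal sketch of how a caller might drive the garbage collector described
// above. It assumes a mutable `store: DataStore` already holding data; the
// option values are arbitrary examples, not recommended defaults.
let options = GarbageCollectionOptions {
    // Try to reclaim at least a third of the store's bytes…
    target: GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0),
    // …but stop early if the pass takes longer than a few milliseconds.
    time_budget: std::time::Duration::from_millis(3),
    // Keep the latest value of every component on every timeline alive.
    protect_latest: 1,
    purge_empty_tables: false,
    dont_protect_components: Default::default(),
    dont_protect_timelines: Default::default(),
    enable_batching: false,
};
let (deletion_events, stats_diff) = store.gc(&options);
// Every returned event is a deletion; `stats_diff` is `stats_before - stats_after`.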
- fn gc_drop_at_least_num_bytes( - &mut self, - options: &GarbageCollectionOptions, - mut num_bytes_to_drop: f64, - protected_rows: &HashSet, - ) -> Vec { - re_tracing::profile_function!(); - - let mut diffs = Vec::new(); - - // The algorithm is straightforward: - // 1. Accumulate a bunch of `RowId`s in ascending order, starting from the beginning of time. - // 2. Check if any `RowId` in the batch is protected, in which case the entire batch is - // considered protected and cannot be dropped all at once. - // 3. Send the batch to `drop_batch` to handle the actual deletion. - // 4. Removed the dropped rows from the metadata registry. - - let batch_size = (self.config.indexed_bucket_num_rows as usize).saturating_mul(2); - let batch_size = batch_size.clamp(64, 4096); - - let mut batch: Vec<(TimePoint, (EntityPathHash, RowId))> = Vec::with_capacity(batch_size); - let mut batch_is_protected = false; - - let Self { - metadata_registry, - tables, - .. - } = self; - - let now = Instant::now(); - for (&row_id, (timepoint, entity_path_hash)) in &metadata_registry.registry { - if protected_rows.contains(&row_id) { - batch_is_protected = true; - continue; - } - - batch.push((timepoint.clone(), (*entity_path_hash, row_id))); - if batch.len() < batch_size { - continue; - } - - let dropped = Self::drop_batch( - options, - tables, - &mut num_bytes_to_drop, - &batch, - batch_is_protected, - ); - - // Only decrement the metadata size trackers if we're actually certain that we'll drop - // that RowId in the end. - for dropped in dropped { - let metadata_dropped_size_bytes = dropped.row_id.total_size_bytes() - + dropped.timepoint().total_size_bytes() - + dropped.entity_path.hash().total_size_bytes(); - metadata_registry.heap_size_bytes = metadata_registry - .heap_size_bytes - .checked_sub(metadata_dropped_size_bytes) - .unwrap_or_else(|| { - re_log::debug!( - entity_path = %dropped.entity_path, - current = metadata_registry.heap_size_bytes, - removed = metadata_dropped_size_bytes, - "book keeping underflowed" - ); - u64::MIN - }); - num_bytes_to_drop -= metadata_dropped_size_bytes as f64; - - diffs.push(dropped); - } - - if now.elapsed() >= options.time_budget || num_bytes_to_drop <= 0.0 { - break; - } - - batch.clear(); - batch_is_protected = false; - } - - // Handle leftovers. - { - let dropped = Self::drop_batch( - options, - tables, - &mut num_bytes_to_drop, - &batch, - batch_is_protected, - ); - - // Only decrement the metadata size trackers if we're actually certain that we'll drop - // that RowId in the end. - for dropped in dropped { - let metadata_dropped_size_bytes = dropped.row_id.total_size_bytes() - + dropped.timepoint().total_size_bytes() - + dropped.entity_path.hash().total_size_bytes(); - metadata_registry.heap_size_bytes = metadata_registry - .heap_size_bytes - .checked_sub(metadata_dropped_size_bytes) - .unwrap_or_else(|| { - re_log::debug!( - entity_path = %dropped.entity_path, - current = metadata_registry.heap_size_bytes, - removed = metadata_dropped_size_bytes, - "book keeping underflowed" - ); - u64::MIN - }); - num_bytes_to_drop -= metadata_dropped_size_bytes as f64; - - diffs.push(dropped); - } - } - - // Purge the removed rows from the metadata_registry. - // This is safe because the entire GC process is driven by RowId-order. 
- for diff in &diffs { - metadata_registry.remove(&diff.row_id); - } - - diffs - } - - #[allow(clippy::too_many_arguments, clippy::fn_params_excessive_bools)] - fn drop_batch( - options: &GarbageCollectionOptions, - tables: &mut BTreeMap<(EntityPathHash, Timeline), IndexedTable>, - num_bytes_to_drop: &mut f64, - batch: &[(TimePoint, (EntityPathHash, RowId))], - batch_is_protected: bool, - ) -> Vec { - let &GarbageCollectionOptions { - enable_batching, .. - } = options; - - let mut diffs = Vec::new(); - - // The algorithm is straightforward: - // 1. If the batch isn't protected, find and drop all buckets that are guaranteed to - // contain only rows older than the ones in the batch. - // 2. Check how many bytes were dropped; continue if we haven't met our objective. - // 3. Fallback to deletion of individual rows. - // 4. Check how many bytes were dropped; continue if we haven't met our objective. - - // NOTE: The batch is already sorted by definition since it's extracted from the registry's btreemap. - let max_row_id = batch.last().map(|(_, (_, row_id))| *row_id); - - if enable_batching && max_row_id.is_some() && !batch_is_protected { - // NOTE: unwrap cannot fail but just a precaution in case this code moves around… - let max_row_id = max_row_id.unwrap_or(RowId::ZERO); - - let mut batch_removed: HashMap = HashMap::default(); - let mut cur_entity_path_hash = None; - - // NOTE: We _must_ go through all tables no matter what, since the batch might contain - // any number of distinct entities. - for ((entity_path_hash, _), table) in &mut *tables { - let (removed, num_bytes_removed) = table.try_drop_bucket(max_row_id); - - *num_bytes_to_drop -= num_bytes_removed as f64; - - if cur_entity_path_hash != Some(*entity_path_hash) { - diffs.extend(batch_removed.drain().map(|(_, diff)| diff)); - - cur_entity_path_hash = Some(*entity_path_hash); - } - - for mut removed in removed { - batch_removed - .entry(removed.row_id) - .and_modify(|diff| { - diff.times.extend(std::mem::take(&mut removed.times)); - }) - .or_insert(removed); - } - } - - diffs.extend(batch_removed.drain().map(|(_, diff)| diff)); - } - - if *num_bytes_to_drop <= 0.0 { - return diffs; - } - - for (timepoint, (entity_path_hash, row_id)) in batch { - let mut diff: Option = None; - - // find all tables that could possibly contain this `RowId` - for (&timeline, &time) in timepoint { - if let Some(table) = tables.get_mut(&(*entity_path_hash, timeline)) { - let (removed, num_bytes_removed) = table.try_drop_row(*row_id, time); - if let Some(inner) = diff.as_mut() { - if let Some(removed) = removed { - inner.times.extend(removed.times); - } - } else { - diff = removed; - } - *num_bytes_to_drop -= num_bytes_removed as f64; - } - } - - diffs.extend(diff); - - if *num_bytes_to_drop <= 0.0 { - break; - } - } - - diffs - } - - /// For each `EntityPath`, `Timeline`, `Component` find the N latest [`RowId`]s. - // - // TODO(jleibs): More complex functionality might required expanding this to also - // *ignore* specific entities, components, timelines, etc. for this protection. - // - // TODO(jleibs): `RowId`s should never overlap between entities. Creating a single large - // HashSet might actually be sub-optimal here. Consider switching to a map of - // `EntityPath` -> `HashSet`. - // Update: this is true-er than ever before now that RowIds are truly unique! 
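// --- Illustrative aside, not part of the original `re_data_store` sources ---
// A self-contained sketch of the `protect_latest` walk performed by
// `find_all_protected_rows` below, over plain slices instead of the store's
// real column types: scan a component column from the newest row backwards and
// protect the row ids of the last `target_count` non-null cells.
fn protect_latest_rows(
    column: &[Option<&str>], // one cell per row, `None` where the component is absent
    row_ids: &[u64],         // the row id of each row, same length as `column`
    target_count: usize,
) -> std::collections::HashSet<u64> {
    column
        .iter()
        .enumerate()
        .rev()
        .filter_map(|(row_index, cell)| cell.as_ref().map(|_| row_ids[row_index]))
        .take(target_count)
        .collect()
}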
- fn find_all_protected_rows( - &mut self, - target_count: usize, - dont_protect_components: &HashSet, - dont_protect_timelines: &HashSet, - ) -> HashSet { - re_tracing::profile_function!(); - - if target_count == 0 { - return Default::default(); - } - - // We need to sort to be able to determine latest-at. - self.sort_indices_if_needed(); - - let mut protected_rows: HashSet = Default::default(); - - // Find all protected rows in regular indexed tables - for ((_, timeline), table) in &self.tables { - if dont_protect_timelines.contains(timeline) { - continue; - } - let mut components_to_find: HashMap = table - .all_components - .iter() - .filter(|c| !dont_protect_components.contains(*c)) - .map(|c| (*c, target_count)) - .collect(); - - for bucket in table.buckets.values().rev() { - for (component, count) in &mut components_to_find { - if *count == 0 { - continue; - } - let inner = bucket.inner.read(); - // TODO(jleibs): If the entire column for a component is empty, we should - // make sure the column is dropped so we don't have to iterate over a - // bunch of Nones. - if let Some(column) = inner.columns.get(component) { - for row in column - .iter() - .enumerate() - .rev() - .filter_map(|(row_index, cell)| { - cell.as_ref().and_then(|_| inner.col_row_id.get(row_index)) - }) - .take(*count) - { - *count -= 1; - protected_rows.insert(*row); - } - } - } - } - } - - protected_rows - } - - /// Remove any tables which contain only components which are empty. - // TODO(jleibs): We could optimize this further by also erasing empty columns. - fn purge_empty_tables(&mut self) -> impl Iterator { - re_tracing::profile_function!(); - - let mut diffs: BTreeMap = BTreeMap::default(); - - self.tables.retain(|_, table| { - // If any bucket has a non-empty component in any column, we keep it… - for bucket in table.buckets.values() { - let inner = bucket.inner.read(); - for column in inner.columns.values() { - if column - .iter() - .any(|cell| cell.as_ref().map_or(false, |cell| cell.num_instances() > 0)) - { - return true; - } - } - } - - // …otherwise we can drop it. - - let entity_path = table.entity_path.clone(); - - for bucket in table.buckets.values() { - let mut inner = bucket.inner.write(); - - for i in 0..inner.col_row_id.len() { - let row_id = inner.col_row_id[i]; - let time = inner.col_time[i]; - - let diff = diffs - .entry(row_id) - .or_insert_with(|| StoreDiff::deletion(row_id, entity_path.clone())); - - diff.times - .push((bucket.timeline, TimeInt::new_temporal(time))); - - for column in &mut inner.columns.values_mut() { - let cell = column[i].take(); - if let Some(cell) = cell { - diff.insert(cell); - } - } - } - } - - false - }); - - diffs.into_values() - } -} - -impl IndexedTable { - /// Try to drop an entire bucket at once if it doesn't contain any `RowId` greater than `max_row_id`. - fn try_drop_bucket(&mut self, max_row_id: RowId) -> (Vec, u64) { - re_tracing::profile_function!(); - - let entity_path = self.entity_path.clone(); - let timeline = self.timeline; - - let mut diffs: Vec = Vec::new(); - let mut dropped_num_bytes = 0u64; - let mut dropped_num_rows = 0u64; - - let mut dropped_bucket_times = HashSet::default(); - - // TODO(cmc): scaling linearly with the number of buckets could be improved, although this - // is quite fast in practice because of the early check. 
- for (bucket_time, bucket) in &self.buckets { - let inner = &mut *bucket.inner.write(); - - if inner.col_time.is_empty() || max_row_id < inner.max_row_id { - continue; - } - - let IndexedBucketInner { - mut col_time, - mut col_row_id, - mut columns, - size_bytes, - .. - } = std::mem::take(inner); - - dropped_bucket_times.insert(*bucket_time); - - while let Some(row_id) = col_row_id.pop_front() { - let mut diff = StoreDiff::deletion(row_id, entity_path.clone()); - - if let Some(time) = col_time.pop_front() { - diff.times.push((timeline, TimeInt::new_temporal(time))); - } - - for (component_name, column) in &mut columns { - if let Some(cell) = column.pop_front().flatten() { - diff.cells.insert(*component_name, cell); - } - } - - diffs.push(diff); - } - - dropped_num_bytes += size_bytes; - dropped_num_rows += col_time.len() as u64; - } - - self.buckets - .retain(|bucket_time, _| !dropped_bucket_times.contains(bucket_time)); - - self.uphold_indexing_invariants(); - - self.buckets_num_rows -= dropped_num_rows; - self.buckets_size_bytes -= dropped_num_bytes; - - (diffs, dropped_num_bytes) - } - - /// Tries to drop the given `row_id` from the table, which is expected to be found at the - /// specified `time`. - /// - /// Returns how many bytes were actually dropped, or zero if the row wasn't found. - fn try_drop_row(&mut self, row_id: RowId, time: TimeInt) -> (Option, u64) { - re_tracing::profile_function!(); - - let entity_path = self.entity_path.clone(); - let timeline = self.timeline; - - let table_has_more_than_one_bucket = self.buckets.len() > 1; - - let (bucket_key, bucket) = self.find_bucket_mut(time); - let bucket_num_bytes = bucket.total_size_bytes(); - - let (diff, mut dropped_num_bytes) = { - let inner = &mut *bucket.inner.write(); - inner.try_drop_row(row_id, timeline, &entity_path, time) - }; - - // NOTE: We always need to keep at least one bucket alive, otherwise we have - // nowhere to write to. - if table_has_more_than_one_bucket && bucket.num_rows() == 0 { - // NOTE: We're dropping the bucket itself in this case, rather than just its - // contents. - debug_assert!( - dropped_num_bytes <= bucket_num_bytes, - "Bucket contained more bytes than it thought" - ); - dropped_num_bytes = bucket_num_bytes; - self.buckets.remove(&bucket_key); - - self.uphold_indexing_invariants(); - } - - self.buckets_size_bytes -= dropped_num_bytes; - self.buckets_num_rows -= (dropped_num_bytes > 0) as u64; - - (diff, dropped_num_bytes) - } -} - -impl IndexedBucketInner { - /// Tries to drop the given `row_id` from the table, which is expected to be found at the - /// specified `time`. - /// - /// Returns how many bytes were actually dropped, or zero if the row wasn't found. 
- fn try_drop_row( - &mut self, - row_id: RowId, - timeline: Timeline, - entity_path: &EntityPath, - time: TimeInt, - ) -> (Option, u64) { - self.sort(); - - let Self { - is_sorted, - time_range, - col_time, - col_insert_id, - col_row_id, - max_row_id, - columns, - size_bytes, - } = self; - - let mut diff: Option = None; - let mut dropped_num_bytes = 0u64; - - let mut row_index = col_time.partition_point(|&time2| time2 < time.as_i64()); - while col_time.get(row_index) == Some(&time.as_i64()) { - if col_row_id[row_index] != row_id { - row_index += 1; - continue; - } - - // Update the time_range min/max: - if col_time.len() == 1 { - // We removed the last row - *time_range = ResolvedTimeRange::EMPTY; - } else { - *is_sorted = row_index == 0 || row_index.saturating_add(1) == col_row_id.len(); - - // We have at least two rows, so we can safely [index] here: - if row_index == 0 { - // We removed the first row, so the second row holds the new min - time_range.set_min(col_time[1]); - } - if row_index + 1 == col_time.len() { - // We removed the last row, so the penultimate row holds the new max - time_range.set_max(col_time[row_index - 1]); - } - } - - // col_row_id - let Some(removed_row_id) = col_row_id.swap_remove(row_index) else { - continue; - }; - debug_assert_eq!(row_id, removed_row_id); - dropped_num_bytes += removed_row_id.total_size_bytes(); - - // col_time - if let Some(row_time) = col_time.swap_remove(row_index) { - dropped_num_bytes += row_time.total_size_bytes(); - } - - // col_insert_id (if present) - if !col_insert_id.is_empty() { - if let Some(insert_id) = col_insert_id.swap_remove(row_index) { - dropped_num_bytes += insert_id.total_size_bytes(); - } - } - - // each data column - for column in columns.values_mut() { - let cell = column.0.swap_remove(row_index).flatten(); - - // TODO(#1809): once datatype deduplication is in, we should really not count - // autogenerated keys as part of the memory stats (same on write path). - dropped_num_bytes += cell.total_size_bytes(); - - if let Some(cell) = cell { - if let Some(inner) = diff.as_mut() { - inner.insert(cell); - } else { - let mut d = StoreDiff::deletion(removed_row_id, entity_path.clone()); - d.at_timestamp(timeline, time); - d.insert(cell); - diff = Some(d); - } - } - } - - if *max_row_id == removed_row_id { - // NOTE: We _have_ to fullscan here: the bucket is sorted by `(Time, RowId)`, there - // could very well be a greater lurking in a lesser entry. - *max_row_id = col_row_id.iter().max().copied().unwrap_or(RowId::ZERO); - } - - // NOTE: A single `RowId` cannot possibly have more than one datapoint for - // a single timeline. - break; - } - - *size_bytes -= dropped_num_bytes; - - (diff, dropped_num_bytes) - } -} - -// --- - -impl StoreDiff { - fn insert(&mut self, cell: DataCell) { - self.cells.insert(cell.component_name(), cell); - } -} diff --git a/crates/re_data_store/src/store_helpers.rs b/crates/re_data_store/src/store_helpers.rs deleted file mode 100644 index f98b32c09a55..000000000000 --- a/crates/re_data_store/src/store_helpers.rs +++ /dev/null @@ -1,92 +0,0 @@ -use re_log_types::{DataCell, DataRow, EntityPath, RowId, TimePoint}; -use re_types_core::{Component, ComponentName}; - -use crate::DataStore; - -// --- Write --- - -impl DataStore { - /// Stores a single value for a given [`re_types_core::Component`]. - /// - /// This is a best-effort helper, it will merely log errors on failure. 
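// --- Illustrative aside, not part of the original `re_data_store` sources ---
// A sketch of the two best-effort write helpers documented here, assuming a
// mutable `store: DataStore` in scope. `Color` is used purely as an example
// component; any generated component type would work the same way.
use re_log_types::{EntityPath, TimePoint};
use re_types_core::{Component as _, Loggable as _};

let entity_path = EntityPath::from("example/points");
let timepoint = TimePoint::default(); // a real call would usually carry (timeline, time) pairs
store.insert_component(
    &entity_path,
    &timepoint,
    re_types::components::Color::from_rgb(255, 0, 0),
);
// Logging an explicitly empty value for a component goes through the second helper:
store.insert_empty_component(&entity_path, &timepoint, re_types::components::Color::name());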
- pub fn insert_component<'a, C>( - &mut self, - entity_path: &EntityPath, - timepoint: &TimePoint, - component: C, - ) where - C: Component + Clone + 'a, - std::borrow::Cow<'a, C>: std::convert::From, - { - re_tracing::profile_function!(); - - let mut row = match DataRow::from_cells1( - RowId::new(), - entity_path.clone(), - timepoint.clone(), - [component], - ) { - Ok(row) => row, - Err(err) => { - re_log::error_once!( - "Couldn't serialize component at {entity_path}.{}: {err}", - C::name() - ); - return; - } - }; - row.compute_all_size_bytes(); - - if let Err(err) = self.insert_row(&row) { - re_log::error_once!( - "Couldn't insert component at {entity_path}.{}: {err}", - C::name() - ); - } - } - - /// Stores a single empty value for a given [`re_types_core::ComponentName`]. - /// - /// This is a best-effort helper, it will merely log errors on failure. - pub fn insert_empty_component( - &mut self, - entity_path: &EntityPath, - timepoint: &TimePoint, - component: ComponentName, - ) { - re_tracing::profile_function!(); - - if let Some(datatype) = self.lookup_datatype(&component) { - let cell = DataCell::from_arrow_empty(component, datatype.clone()); - - let mut row = match DataRow::from_cells1( - RowId::new(), - entity_path.clone(), - timepoint.clone(), - cell, - ) { - Ok(row) => row, - Err(err) => { - re_log::error_once!( - "Couldn't serialize component at {entity_path}.{}: {err}", - component - ); - return; - } - }; - row.compute_all_size_bytes(); - - if let Err(err) = self.insert_row(&row) { - re_log::error_once!( - "Couldn't insert component at {entity_path}.{}: {err}", - component - ); - } - } else { - re_log::error_once!( - "Couldn't find appropriate datatype at {entity_path}.{}", - component - ); - } - } -} diff --git a/crates/re_data_store/src/store_read.rs b/crates/re_data_store/src/store_read.rs deleted file mode 100644 index c76e04d66197..000000000000 --- a/crates/re_data_store/src/store_read.rs +++ /dev/null @@ -1,888 +0,0 @@ -use std::{collections::VecDeque, ops::RangeBounds, sync::atomic::Ordering}; - -use itertools::Itertools as _; - -use re_log::trace; -use re_log_types::{ - DataCell, EntityPath, EntityPathHash, ResolvedTimeRange, RowId, TimeInt, TimePoint, Timeline, -}; -use re_types_core::{ComponentName, ComponentNameSet}; - -use crate::{DataStore, IndexedBucket, IndexedBucketInner, IndexedTable}; - -// --- Queries --- - -/// A query at a given time, for a given timeline. -/// -/// Get the latest version of the data available at this time. -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct LatestAtQuery { - timeline: Timeline, - at: TimeInt, -} - -impl std::fmt::Debug for LatestAtQuery { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_fmt(format_args!( - "", - self.timeline.typ().format_utc(self.at), - self.timeline.name(), - )) - } -} - -impl LatestAtQuery { - /// The returned query is guaranteed to never include [`TimeInt::STATIC`]. - #[inline] - pub fn new(timeline: Timeline, at: impl TryInto) -> Self { - let at = at.try_into().unwrap_or(TimeInt::MIN); - Self { timeline, at } - } - - #[inline] - pub const fn latest(timeline: Timeline) -> Self { - Self { - timeline, - at: TimeInt::MAX, - } - } - - #[inline] - pub fn timeline(&self) -> Timeline { - self.timeline - } - - #[inline] - pub fn at(&self) -> TimeInt { - self.at - } -} - -/// A query over a time range, for a given timeline. -/// -/// Get all the data within this time interval, plus the latest one before the start of the -/// interval. 
-/// -/// Motivation: all data is considered alive until the next logging to the same component path. -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct RangeQuery { - pub timeline: Timeline, - pub range: ResolvedTimeRange, -} - -impl std::fmt::Debug for RangeQuery { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_fmt(format_args!( - " Self { - Self { timeline, range } - } - - #[inline] - pub const fn everything(timeline: Timeline) -> Self { - Self { - timeline, - range: ResolvedTimeRange::EVERYTHING, - } - } - - #[inline] - pub fn timeline(&self) -> Timeline { - self.timeline - } - - #[inline] - pub fn range(&self) -> ResolvedTimeRange { - self.range - } -} - -// --- Data store --- - -impl DataStore { - /// Retrieve all the [`ComponentName`]s that have been written to for a given [`EntityPath`] on - /// the specified [`Timeline`]. - /// - /// Static components are always included in the results. - /// - /// Returns `None` if the entity doesn't exist at all on this `timeline`. - pub fn all_components( - &self, - timeline: &Timeline, - entity_path: &EntityPath, - ) -> Option { - re_tracing::profile_function!(); - - // TODO(cmc): kind & query_id need to somehow propagate through the span system. - self.query_id.fetch_add(1, Ordering::Relaxed); - - let entity_path_hash = entity_path.hash(); - - let static_components: Option = self - .static_tables - .get(&entity_path_hash) - .map(|static_table| static_table.cells.keys().copied().collect()); - - let temporal_components: Option = self - .tables - .get(&(entity_path_hash, *timeline)) - .map(|table| table.all_components.clone()); - - match (static_components, temporal_components) { - (None, None) => None, - (None, comps @ Some(_)) | (comps @ Some(_), None) => comps, - (Some(static_comps), Some(temporal_comps)) => { - Some(static_comps.into_iter().chain(temporal_comps).collect()) - } - } - } - - /// Check whether a given entity has a specific [`ComponentName`] either on the specified - /// timeline, or in its static data. - #[inline] - pub fn entity_has_component( - &self, - timeline: &Timeline, - entity_path: &EntityPath, - component_name: &ComponentName, - ) -> bool { - re_tracing::profile_function!(); - self.all_components(timeline, entity_path) - .map_or(false, |components| components.contains(component_name)) - } - - /// Find the earliest time at which something was logged for a given entity on the specified - /// timeline. - /// - /// Ignores static data. - #[inline] - pub fn entity_min_time( - &self, - timeline: &Timeline, - entity_path: &EntityPath, - ) -> Option { - let entity_path_hash = entity_path.hash(); - - let min_time = self - .tables - .get(&(entity_path_hash, *timeline))? - .buckets - .first_key_value()? - .1 - .inner - .read() - .time_range - .min(); - - // handle case where no data was logged - if min_time == TimeInt::MIN { - None - } else { - Some(min_time) - } - } - - /// Queries the datastore for the cells of the specified `component_names`, as seen from the point - /// of view of the so-called `primary` component. - /// - /// Returns an array of [`DataCell`]s (as well as the associated _data_ time and [`RowId`], if - /// the data is temporal) on success. - /// - /// Success is defined by one thing and one thing only: whether a cell could be found for the - /// `primary` component. - /// The presence or absence of secondary components has no effect on the success criteria. 
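// --- Illustrative aside, not part of the original `re_data_store` sources ---
// A sketch of a latest-at lookup against the store, assuming a `store: DataStore`
// and an `entity_path: EntityPath` in scope. The timeline and component names are
// arbitrary examples.
use re_types_core::{Component as _, Loggable as _};

let timeline = Timeline::new_sequence("frame");
let query = LatestAtQuery::new(timeline, 123_i64);
let primary = re_types::components::Position3D::name();
let components = [primary, re_types::components::Color::name()];
if let Some((_data_time, _row_id, cells)) = store.latest_at(&query, &entity_path, primary, &components) {
    // `cells[0]` (the primary) is guaranteed to be `Some`; `cells[1]` may be `None`
    // if no color was logged at or before the query time.
}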
- /// - /// If the entity has static component data associated with it, it will unconditionally - /// override any temporal component data. - pub fn latest_at( - &self, - query: &LatestAtQuery, - entity_path: &EntityPath, - primary: ComponentName, - component_names: &[ComponentName; N], - ) -> Option<(TimeInt, RowId, [Option; N])> { - // TODO(cmc): kind & query_id need to somehow propagate through the span system. - self.query_id.fetch_add(1, Ordering::Relaxed); - - let entity_path_hash = entity_path.hash(); - let primary_comp_pos = component_names - .iter() - .find_position(|component_name| **component_name == primary) - .map(|(pos, _)| pos)?; - - let static_table = self.static_tables.get(&entity_path_hash); - - // Check which components have static data associated with them, and if so don't bother - // querying for their temporal data. - let mut component_names_opt = [(); N].map(|_| None); - for (i, component_name) in component_names.iter().copied().enumerate() { - let has_static_data = static_table.map_or(false, |static_table| { - static_table.cells.contains_key(&component_name) - }); - component_names_opt[i] = (!has_static_data).then_some(component_name); - } - - // Grab the temporal results. - let (mut data_time, mut max_row_id, mut results) = self - .tables - .get(&(entity_path_hash, query.timeline)) - .and_then(|table| table.latest_at(query.at, primary, &component_names_opt)) - .map_or_else( - || (TimeInt::STATIC, RowId::ZERO, [(); N].map(|_| None)), - |(data_time, row_id, cells)| (data_time, row_id, cells), - ); - - // Overwrite results with static data, where applicable. - if let Some(static_table) = self.static_tables.get(&entity_path_hash) { - for (i, component_name) in component_names.iter().enumerate() { - if let Some(static_cell) = static_table.cells.get(component_name).cloned() { - results[i] = Some(static_cell.cell.clone()); - - // If and only if the primary is static, overwrite the returned index. - if *component_name == primary { - data_time = TimeInt::STATIC; - max_row_id = RowId::max(max_row_id, static_cell.row_id); - } - } - } - } - - results[primary_comp_pos] - .is_some() - .then_some((data_time, max_row_id, results)) - } - - /// Iterates the datastore in order to return the cells of the specified `component_names` for - /// the given time range. - /// - /// For each and every relevant row that is found, the returned iterator will yield an array - /// that is filled with the cells of each and every component in `component_names`, or `None` if - /// said component is not available in that row. - /// - /// This method cannot fail! If there's no data to return, an empty iterator is returned. - /// - /// ⚠ Contrary to latest-at queries, range queries can and will yield multiple rows for a - /// single timestamp if it happens to hold multiple entries. - /// - /// If the entity has static component data associated with it, it will unconditionally - /// override any temporal component data. - pub fn range<'a, const N: usize>( - &'a self, - query: &RangeQuery, - entity_path: &EntityPath, - component_names: [ComponentName; N], - ) -> impl Iterator; N])> + 'a { - // Beware! This merely measures the time it takes to gather all the necessary metadata - // for building the returned iterator. - re_tracing::profile_function!(); - - // TODO(cmc): kind & query_id need to somehow propagate through the span system. 
- self.query_id.fetch_add(1, Ordering::Relaxed); - - let entity_path_hash = entity_path.hash(); - - let static_table = self.static_tables.get(&entity_path_hash); - - // Check which components have static data associated with them, and if so don't bother - // querying for their temporal data. - let mut component_names_opt = [(); N].map(|_| None); - for (i, component_name) in component_names.iter().copied().enumerate() { - let has_static_data = static_table.map_or(false, |static_table| { - static_table.cells.contains_key(&component_name) - }); - component_names_opt[i] = (!has_static_data).then_some(component_name); - } - - // Yield the static data that's available first. - let static_data = if let Some(static_table) = self.static_tables.get(&entity_path_hash) { - let mut max_row_id = RowId::ZERO; - let mut results = [(); N].map(|_| None); - - for (i, component_name) in component_names.iter().enumerate() { - if let Some(static_cell) = static_table.cells.get(component_name).cloned() { - results[i] = Some(static_cell.cell.clone()); - - // There's no concept of a primary in low-level range queries, so we just give - // priority to whichever component has the most recent rowid when it comes to - // the returned index. - if static_cell.row_id > max_row_id { - max_row_id = RowId::max(max_row_id, static_cell.row_id); - } - } - } - - if results.iter().any(Option::is_some) { - itertools::Either::Left(std::iter::once((TimeInt::STATIC, max_row_id, results))) - } else { - itertools::Either::Right(std::iter::empty()) - } - } else { - itertools::Either::Right(std::iter::empty()) - }; - - static_data.chain( - self.tables - .get(&(entity_path_hash, query.timeline)) - .map(|index| index.range(query.range, component_names_opt)) - .into_iter() - .flatten(), - ) - } - - #[inline] - pub fn row_metadata(&self, row_id: &RowId) -> Option<&(TimePoint, EntityPathHash)> { - self.metadata_registry.get(row_id) - } - - /// Sort all unsorted indices in the store. - pub fn sort_indices_if_needed(&self) { - re_tracing::profile_function!(); - for index in self.tables.values() { - index.sort_indices_if_needed(); - } - } -} - -// --- Temporal --- - -impl IndexedTable { - /// Queries the table for the cells of the specified `component_names`, as seen from the point - /// of view of the so-called `primary` component. - /// - /// Returns an array of [`DataCell`]s (as well as the associated _data_ time and `RowId`) on - /// success, or `None` iff no cell could be found for the `primary` component. - pub fn latest_at( - &self, - query_time: TimeInt, - primary: ComponentName, - component_names: &[Option; N], - ) -> Option<(TimeInt, RowId, [Option; N])> { - // Early-exit if this entire table is unaware of this component. - if !self.all_components.contains(&primary) { - return None; - } - - let timeline = self.timeline; - - // The time we're looking for gives us an upper bound: all components must be indexed - // in either this bucket _or any of those that come before_! - // - // That is because secondary columns allow for null values, which forces us to not only - // walk backwards within an indexed bucket, but sometimes even walk backwards across - // multiple indexed buckets within the same table! 
- - let buckets = self - .range_buckets_rev(..=query_time) - .map(|(_, bucket)| bucket) - .enumerate(); - for (attempt, bucket) in buckets { - trace!( - kind = "latest_at", - timeline = %timeline.name(), - time = timeline.typ().format_utc(query_time), - %primary, - ?component_names, - attempt, - bucket_time_range = timeline.typ().format_range_utc(bucket.inner.read().time_range), - "found candidate bucket" - ); - if let ret @ Some(_) = bucket.latest_at(query_time, primary, component_names) { - return ret; // found at least the primary component! - } - } - - None // primary component not found - } - - /// Iterates the table in order to return the cells of the specified `component_names` for the - /// given time range. - /// - /// For each and every relevant row that is found, the returned iterator will yield an array - /// that is filled with the cells of each and every component in `component_names`, or `None` if - /// said component is not available in that row. - /// - /// This method cannot fail! If there's no data to return, an empty iterator is returned. - pub fn range( - &self, - time_range: ResolvedTimeRange, - component_names: [Option; N], - ) -> impl Iterator; N])> + '_ { - // Beware! This merely measures the time it takes to gather all the necessary metadata - // for building the returned iterator. - re_tracing::profile_function!(); - - let timeline = self.timeline; - - // We need to find the _indexing time_ that corresponds to this time range's minimum bound! - let (time_range_min, _) = self.find_bucket(time_range.min()); - - self.range_buckets(time_range_min..=time_range.max()) - .map(|(_, bucket)| bucket) - .enumerate() - .flat_map(move |(bucket_nr, bucket)| { - trace!( - kind = "range", - bucket_nr, - bucket_time_range = - timeline.typ().format_range_utc(bucket.inner.read().time_range), - timeline = %timeline.name(), - ?time_range, - ?component_names, - "found bucket in range" - ); - - bucket.range(time_range, component_names) - }) - } - - /// Returns the indexed bucket whose time range covers the given `time`. - /// - /// In addition to returning a reference to the `IndexedBucket` itself, this also returns its - /// _indexing time_, which is different from its minimum time range bound! - /// - /// See [`IndexedTable::buckets`] for more information. - pub fn find_bucket(&self, time: TimeInt) -> (TimeInt, &IndexedBucket) { - // This cannot fail, `iter_bucket` is guaranteed to always yield at least one bucket, - // since indexed tables always spawn with a default bucket that covers [-∞;+∞]. - #[allow(clippy::unwrap_used)] - self.range_buckets_rev(..=time).next().unwrap() - } - - /// Returns the indexed bucket whose time range covers the given `time`. - /// - /// In addition to returning a reference to the `IndexedBucket` itself, this also returns its - /// _indexing time_, which is different from its minimum time range bound! - /// - /// See [`IndexedTable::buckets`] for more information. - pub fn find_bucket_mut(&mut self, time: TimeInt) -> (TimeInt, &mut IndexedBucket) { - // This cannot fail, `iter_bucket_mut` is guaranteed to always yield at least one bucket, - // since indexed tables always spawn with a default bucket that covers [-∞;+∞]. - #[allow(clippy::unwrap_used)] - self.range_bucket_rev_mut(..=time).next().unwrap() - } - - /// Returns an iterator that is guaranteed to yield at least one bucket, which is the bucket - /// whose time range covers the start bound of the given `time_range`. 
- /// - /// It then continues yielding buckets until it runs out, in increasing time range order. - /// - /// In addition to yielding references to the `IndexedBucket`s themselves, this also returns - /// their _indexing times_, which are different from their minimum time range bounds! - /// - /// See [`IndexedTable::buckets`] for more information. - pub fn range_buckets( - &self, - time_range: impl RangeBounds, - ) -> impl Iterator { - // Beware! This merely measures the time it takes to gather all the necessary metadata - // for building the returned iterator. - re_tracing::profile_function!(); - - self.buckets - .range(time_range) - .map(|(time, bucket)| (*time, bucket)) - } - - /// Returns an iterator that is guaranteed to yield at least one bucket, which is the bucket - /// whose time range covers the end bound of the given `time_range`. - /// - /// It then continues yielding buckets until it runs out, in decreasing time range order. - /// - /// In addition to yielding references to the `IndexedBucket`s themselves, this also returns - /// their _indexing times_, which are different from their minimum time range bounds! - /// - /// See [`IndexedTable::buckets`] for more information. - pub fn range_buckets_rev( - &self, - time_range: impl RangeBounds, - ) -> impl Iterator { - // Beware! This merely measures the time it takes to gather all the necessary metadata - // for building the returned iterator. - re_tracing::profile_function!(); - - self.buckets - .range(time_range) - .rev() - .map(|(time, bucket)| (*time, bucket)) - } - - /// Returns an iterator that is guaranteed to yield at least one bucket, which is the bucket - /// whose time range covers the end bound of the given `time_range`. - /// - /// It then continues yielding buckets until it runs out, in decreasing time range order. - /// - /// In addition to yielding references to the `IndexedBucket`s themselves, this also returns - /// their _indexing times_, which are different from their minimum time range bounds! - /// - /// See [`IndexedTable::buckets`] for more information. - pub fn range_bucket_rev_mut( - &mut self, - time_range: impl RangeBounds, - ) -> impl Iterator { - self.buckets - .range_mut(time_range) - .rev() - .map(|(time, bucket)| (*time, bucket)) - } - - /// Sort all unsorted indexed buckets in this table. - pub fn sort_indices_if_needed(&self) { - for bucket in self.buckets.values() { - bucket.sort_indices_if_needed(); - } - } -} - -impl IndexedBucket { - /// Sort all component indices by time and [`RowId`], provided that's not already the case. - #[inline] - pub fn sort_indices_if_needed(&self) { - if self.inner.read().is_sorted { - return; // early read-only exit - } - - re_tracing::profile_scope!("sort"); - self.inner.write().sort(); - } - - /// Queries the bucket for the cells of the specified `component_names`, as seen from the point - /// of view of the so-called `primary` component. - /// - /// Returns an array of [`DataCell`]s (as well as the associated _data_ time and `RowId`) on - /// success, or `None` iff no cell could be found for the `primary` component. 
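// --- Illustrative aside, not part of the original `re_data_store` sources ---
// The core of the latest-at walk below, sketched over plain slices: find the
// last row whose time is <= the query time, then step backwards until the
// column actually holds a value for that row.
fn latest_non_null(col_time: &[i64], column: &[Option<u32>], query_time: i64) -> Option<usize> {
    let time_row_nr = col_time.partition_point(|&t| t <= query_time);
    if time_row_nr == 0 {
        return None; // everything in this bucket is newer than the query
    }
    let mut row_nr = time_row_nr - 1;
    while column[row_nr].is_none() {
        if row_nr == 0 {
            return None; // no non-null value at or before the query time
        }
        row_nr -= 1;
    }
    Some(row_nr)
}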
- pub fn latest_at( - &self, - query_time: TimeInt, - primary: ComponentName, - component_names: &[Option; N], - ) -> Option<(TimeInt, RowId, [Option; N])> { - self.sort_indices_if_needed(); - - let IndexedBucketInner { - is_sorted, - time_range: _, - col_time, - col_insert_id: _, - col_row_id, - max_row_id: _, - columns, - size_bytes: _, - } = &*self.inner.read(); - debug_assert!(is_sorted); - - // Early-exit if this bucket is unaware of this component. - let column = columns.get(&primary)?; - - trace!( - kind = "latest_at", - %primary, - ?component_names, - timeline = %self.timeline.name(), - query_time = self.timeline.typ().format_utc(query_time), - "searching for primary & secondary cells…" - ); - - let time_row_nr = - col_time.partition_point(|data_time| *data_time <= query_time.as_i64()) as i64; - - // The partition point is always _beyond_ the index that we're looking for. - // A partition point of 0 thus means that we're trying to query for data that lives - // _before_ the beginning of time… there's nothing to be found there. - if time_row_nr == 0 { - return None; - } - - // The partition point is always _beyond_ the index that we're looking for; we need - // to step back to find what we came for. - let primary_row_nr = time_row_nr - 1; - trace!( - kind = "latest_at", - %primary, - ?component_names, - timeline = %self.timeline.name(), - query_time = self.timeline.typ().format_utc(query_time), - %primary_row_nr, - "found primary row number", - ); - - // find the secondary row number, and the associated cells. - let mut secondary_row_nr = primary_row_nr; - while column[secondary_row_nr as usize].is_none() { - if secondary_row_nr == 0 { - trace!( - kind = "latest_at", - %primary, - ?component_names, - timeline = %self.timeline.name(), - query_time = self.timeline.typ().format_utc(query_time), - %primary_row_nr, - "no secondary row number found", - ); - return None; - } - secondary_row_nr -= 1; - } - - trace!( - kind = "latest_at", - %primary, - ?component_names, - timeline = %self.timeline.name(), - query_time = self.timeline.typ().format_utc(query_time), - %primary_row_nr, %secondary_row_nr, - "found secondary row number", - ); - debug_assert!(column[secondary_row_nr as usize].is_some()); - - let mut cells = [(); N].map(|_| None); - for (i, component_name) in component_names.iter().enumerate() { - let Some(component_name) = component_name else { - // That component has static data. - continue; - }; - - if let Some(column) = columns.get(component_name) { - if let Some(cell) = &column[secondary_row_nr as usize] { - trace!( - kind = "latest_at", - %primary, - %component_name, - timeline = %self.timeline.name(), - query_time = self.timeline.typ().format_utc(query_time), - %primary_row_nr, %secondary_row_nr, - "found cell", - ); - cells[i] = Some(cell.clone() /* shallow */); - } - } - } - - Some(( - col_time[secondary_row_nr as usize] - .try_into() - .unwrap_or(TimeInt::MIN), - col_row_id[secondary_row_nr as usize], - cells, - )) - } - - /// Iterates the bucket in order to return the cells of the specified `component_names` for - /// the given time range. - /// - /// For each and every relevant row that is found, the returned iterator will yield an array - /// that is filled with the cells of each and every component in `component_names`, or `None` if - /// said component is not available in that row. - /// - /// This method cannot fail! If there's no data to return, an empty iterator is returned. 
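// --- Illustrative aside, not part of the original `re_data_store` sources ---
// The same yield semantics apply to the store-level `DataStore::range` above.
// A sketch of a typical call, assuming `store`, `entity_path` and `timeline`
// in scope as in the previous aside:
use re_types_core::{Component as _, Loggable as _};

let query = RangeQuery::new(
    timeline,
    ResolvedTimeRange::new(TimeInt::new_temporal(10), TimeInt::new_temporal(20)),
);
let components = [re_types::components::Position3D::name()];
for (_data_time, _row_id, [_cell]) in store.range(&query, &entity_path, components) {
    // Rows arrive in time order; if static data exists for the component it is
    // yielded first, with a data time of `TimeInt::STATIC`.
}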
- pub fn range( - &self, - time_range: ResolvedTimeRange, - component_names: [Option; N], - ) -> impl Iterator; N])> + '_ { - self.sort_indices_if_needed(); - - let IndexedBucketInner { - is_sorted, - time_range: bucket_time_range, - col_time, - col_insert_id: _, - col_row_id, - max_row_id: _, - columns, - size_bytes: _, - } = &*self.inner.read(); - debug_assert!(is_sorted); - - let bucket_time_range = *bucket_time_range; - - // Early-exit if this bucket is unaware of any of our components of interest. - if component_names - .iter() - .filter_map(|c| *c) - .all(|component| columns.get(&component).is_none()) - { - return itertools::Either::Right(std::iter::empty()); - } - - // Beware! This merely measures the time it takes to gather all the necessary metadata - // for building the returned iterator. - re_tracing::profile_function!(); - - trace!( - kind = "range", - bucket_time_range = self.timeline.typ().format_range_utc(bucket_time_range), - ?component_names, - timeline = %self.timeline.name(), - time_range = self.timeline.typ().format_range_utc(time_range), - "searching for time & component cell numbers…" - ); - - let time_row_nr = col_time.partition_point(|t| *t < time_range.min().as_i64()) as u64; - - trace!( - kind = "range", - bucket_time_range = self.timeline.typ().format_range_utc(bucket_time_range), - ?component_names, - timeline = %self.timeline.name(), - time_range = self.timeline.typ().format_range_utc(time_range), - %time_row_nr, - "found time row number", - ); - - // TODO(cmc): Cloning these is obviously not great and will need to be addressed at - // some point. - // But, really, it's not _that_ bad either: these are either integers or erased pointers, - // and e.g. with the default configuration there are only 1024 of them (times the number - // of components). - let col_time = col_time.clone(); - let col_row_id = col_row_id.clone(); - let mut columns = columns.clone(); // shallow - - // We have found the index of the first row that possibly contains data for any single one - // of the components we're interested in. - // - // Now we need to iterate through every remaining rows in the bucket and yield any that - // contains data for these components and is still within the time range. - let cells = col_time - .into_iter() - .skip(time_row_nr as usize) - // don't go beyond the time range we're interested in! - .filter(move |&data_time| time_range.contains(TimeInt::new_temporal(data_time))) - .enumerate() - .filter_map(move |(time_row_offset, data_time)| { - let row_nr = time_row_nr + time_row_offset as u64; - - let mut cells = [(); N].map(|_| None); - for (i, component_name) in component_names.iter().enumerate() { - let Some(component_name) = component_name else { - // That component has static data. - continue; - }; - - if let Some(column) = columns.get_mut(component_name) { - cells[i] = column[row_nr as usize].take(); - } - } - - // We only yield rows that contain data for at least one of the components of - // interest. 
- if cells.iter().all(Option::is_none) { - return None; - } - - let row_id = col_row_id[row_nr as usize]; - - trace!( - kind = "range", - bucket_time_range = - self.timeline.typ().format_range_utc(bucket_time_range), - ?component_names, - timeline = %self.timeline.name(), - time_range = self.timeline.typ().format_range_utc(time_range), - %row_nr, - %row_id, - ?cells, - "yielding cells", - ); - - Some((TimeInt::new_temporal(data_time), row_id, cells)) - }); - - itertools::Either::Left(cells) - } - - /// Whether the indices in this `IndexedBucket` are sorted - pub fn is_sorted(&self) -> bool { - self.inner.read().is_sorted - } -} - -impl IndexedBucketInner { - pub fn sort(&mut self) { - let Self { - is_sorted, - time_range: _, - col_time, - col_insert_id, - col_row_id, - max_row_id: _, - columns, - size_bytes: _, - } = self; - - if *is_sorted { - return; - } - - re_tracing::profile_function!(); - - let swaps = { - re_tracing::profile_scope!("swaps"); - let mut swaps = (0..col_time.len()).collect::>(); - // NOTE: Within a single timestamp, we must use the Row ID as tie-breaker! - // The Row ID is how we define ordering within a client's thread, and our public APIs - // guarantee that logging order is respected within a single thread! - swaps.sort_by_key(|&i| (&col_time[i], &col_row_id[i])); - swaps - .iter() - .copied() - .enumerate() - .map(|(to, from)| (from, to)) - .collect::>() - }; - - // Yep, the reshuffle implementation is very dumb and very slow :) - // TODO(#442): re_datastore: implement efficient shuffling on the read path. - - { - re_tracing::profile_scope!("control"); - - fn reshuffle_control_column( - column: &mut VecDeque, - swaps: &[(usize, usize)], - ) { - let source = { - re_tracing::profile_scope!("clone"); - column.clone() - }; - { - re_tracing::profile_scope!("rotate"); - for (from, to) in swaps.iter().copied() { - column[to] = source[from]; - } - } - } - - reshuffle_control_column(col_time, &swaps); - if !col_insert_id.is_empty() { - reshuffle_control_column(col_insert_id, &swaps); - } - reshuffle_control_column(col_row_id, &swaps); - } - - { - re_tracing::profile_scope!("data"); - // shuffle component columns back into a sorted state - for column in columns.values_mut() { - let mut source = column.clone(); - { - for (from, to) in swaps.iter().copied() { - column[to] = source[from].take(); - } - } - } - } - - *is_sorted = true; - } -} diff --git a/crates/re_data_store/src/store_sanity.rs b/crates/re_data_store/src/store_sanity.rs deleted file mode 100644 index 74699489db9d..000000000000 --- a/crates/re_data_store/src/store_sanity.rs +++ /dev/null @@ -1,246 +0,0 @@ -use re_log_types::{ResolvedTimeRange, RowId, TimeInt, VecDequeSortingExt as _}; -use re_types_core::{ComponentName, Loggable, SizeBytes as _}; - -use crate::{DataStore, IndexedBucket, IndexedBucketInner, IndexedTable}; - -// --- - -/// Returned by the `sanity_check` family of function when an invariant violation has been detected -/// in the `DataStore`'s internal datastructures. -/// These violations can only stem from a bug in the store's implementation itself. 
-#[derive(thiserror::Error, Debug)] -pub enum SanityError { - #[error( - "Reported time range for indexed bucket is out of sync: got {got:?}, expected {expected:?}" - )] - TimeRangeOutOfSync { - expected: ResolvedTimeRange, - got: ResolvedTimeRange, - }, - - #[error( - "Reported max RowId for indexed bucket is out of sync: got {got}, expected {expected}" - )] - MaxRowIdOutOfSync { expected: RowId, got: RowId }, - - #[error("Reported size for {origin} is out of sync: got {got}, expected {expected}")] - SizeOutOfSync { - origin: &'static str, - expected: String, - got: String, - }, - - #[error("Reported number of rows for {origin} is out of sync: got {got}, expected {expected}")] - RowsOutOfSync { - origin: &'static str, - expected: String, - got: String, - }, - - #[error("Column '{component}' has too few/many rows: got {got} instead of {expected}")] - ColumnLengthMismatch { - component: ComponentName, - expected: u64, - got: u64, - }, - - #[error("Found overlapping indexed buckets: {t1_max_formatted} ({t1_max}) <-> {t2_max_formatted} ({t2_max})")] - OverlappingBuckets { - t1_max: i64, - t1_max_formatted: String, - t2_max: i64, - t2_max_formatted: String, - }, -} - -pub type SanityResult = ::std::result::Result; - -// --- Data store --- - -impl DataStore { - /// Runs the sanity check suite for the entire datastore. - /// - /// Returns an error if anything looks wrong. - pub fn sanity_check(&self) -> SanityResult<()> { - re_tracing::profile_function!(); - - for table in self.tables.values() { - table.sanity_check()?; - } - - Ok(()) - } -} - -// --- Temporal --- - -impl IndexedTable { - /// Runs the sanity check suite for the entire table. - /// - /// Returns an error if anything looks wrong. - pub fn sanity_check(&self) -> SanityResult<()> { - re_tracing::profile_function!(); - - // No two buckets should ever overlap time-range-wise. - { - let time_ranges = self - .buckets - .values() - .map(|bucket| bucket.inner.read().time_range) - .collect::>(); - for time_ranges in time_ranges.windows(2) { - let &[t1, t2] = time_ranges else { - unreachable!() - }; - if t1.max().as_i64() >= t2.min().as_i64() { - return Err(SanityError::OverlappingBuckets { - t1_max: t1.max().as_i64(), - t1_max_formatted: self.timeline.typ().format_utc(t1.max()), - t2_max: t2.max().as_i64(), - t2_max_formatted: self.timeline.typ().format_utc(t2.max()), - }); - } - } - } - - // Make sure row numbers aren't out of sync - { - let num_rows = self.num_rows(); - let num_rows_uncached = self.num_rows_uncached(); - if num_rows != num_rows_uncached { - return Err(SanityError::RowsOutOfSync { - origin: std::any::type_name::(), - expected: re_format::format_uint(num_rows_uncached), - got: re_format::format_uint(num_rows), - }); - } - } - - // Run individual bucket sanity check suites too. - for bucket in self.buckets.values() { - bucket.sanity_check()?; - } - - // Make sure size values aren't out of sync - { - let total_size_bytes = self.total_size_bytes(); - let total_size_bytes_uncached = self.size_bytes_uncached(); - if total_size_bytes != total_size_bytes_uncached { - return Err(SanityError::SizeOutOfSync { - origin: std::any::type_name::(), - expected: re_format::format_bytes(total_size_bytes_uncached as _), - got: re_format::format_bytes(total_size_bytes as _), - }); - } - } - - Ok(()) - } -} - -impl IndexedBucket { - /// Runs the sanity check suite for the entire bucket. - /// - /// Returns an error if anything looks wrong. 
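// --- Illustrative aside, not part of the original `re_data_store` sources ---
// Sketch: how the sanity suite is typically exercised, mirroring what the GC
// path above does in debug builds (a `store: DataStore` is assumed in scope).
if cfg!(debug_assertions) {
    store.sanity_check().expect("datastore invariant violated");
}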
- pub fn sanity_check(&self) -> SanityResult<()> { - re_tracing::profile_function!(); - - let Self { timeline: _, inner } = self; - - { - let IndexedBucketInner { - is_sorted: _, - time_range, - col_time, - col_insert_id, - col_row_id, - max_row_id, - columns, - size_bytes: _, - } = &*inner.read(); - - // Time ranges are eagerly maintained. - { - let mut times = col_time.clone(); - times.sort(); - - let expected_min = times - .front() - .copied() - .and_then(|t| TimeInt::try_from(t).ok()) - .unwrap_or(TimeInt::MAX); - let expected_max = times - .back() - .copied() - .and_then(|t| TimeInt::try_from(t).ok()) - .unwrap_or(TimeInt::MIN); - let expected_time_range = ResolvedTimeRange::new(expected_min, expected_max); - - if expected_time_range != *time_range { - return Err(SanityError::TimeRangeOutOfSync { - expected: expected_time_range, - got: *time_range, - }); - } - } - - // Make sure `max_row_id` isn't out of sync - { - let expected = col_row_id.iter().max().copied().unwrap_or(RowId::ZERO); - if expected != *max_row_id { - return Err(SanityError::MaxRowIdOutOfSync { - expected, - got: *max_row_id, - }); - } - } - - // All columns should be `Self::num_rows` long. - { - const COLUMN_TIMEPOINT: &str = "rerun.controls.TimePoint"; - - let num_rows = self.num_rows(); - - let column_lengths = [ - (!col_insert_id.is_empty()) - .then(|| (DataStore::insert_id_component_name(), col_insert_id.len())), // - Some((COLUMN_TIMEPOINT.into(), col_time.len())), - Some((RowId::name(), col_row_id.len())), - ] - .into_iter() - .flatten() - .chain( - columns - .iter() - .map(|(component, column)| (*component, column.len())), - ) - .map(|(component, len)| (component, len as u64)); - - for (component, len) in column_lengths { - if len != num_rows { - return Err(SanityError::ColumnLengthMismatch { - component, - expected: num_rows, - got: len, - }); - } - } - } - } - - // Make sure size values aren't out of sync - { - let size_bytes = inner.read().size_bytes; - let size_bytes_uncached = inner.write().compute_size_bytes(); - if size_bytes != size_bytes_uncached { - return Err(SanityError::SizeOutOfSync { - origin: std::any::type_name::(), - expected: re_format::format_bytes(size_bytes_uncached as _), - got: re_format::format_bytes(size_bytes as _), - }); - } - } - - Ok(()) - } -} diff --git a/crates/re_data_store/src/store_stats.rs b/crates/re_data_store/src/store_stats.rs deleted file mode 100644 index d70846a14d3f..000000000000 --- a/crates/re_data_store/src/store_stats.rs +++ /dev/null @@ -1,386 +0,0 @@ -use re_log_types::{EntityPathHash, ResolvedTimeRange, TimePoint}; -use re_types_core::SizeBytes; - -use crate::{store::IndexedBucketInner, DataStore, IndexedBucket, IndexedTable, MetadataRegistry}; - -// --- - -// NOTE: Not implemented as a StoreSubscriber because it also measures implementation details of the -// store (buckets etc), while StoreEvents work at a data-model level. 
- -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd)] -pub struct DataStoreRowStats { - pub num_rows: u64, - pub num_bytes: u64, -} - -impl std::ops::Sub for DataStoreRowStats { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self { - num_rows: self.num_rows - rhs.num_rows, - num_bytes: self.num_bytes - rhs.num_bytes, - } - } -} - -impl std::ops::Add for DataStoreRowStats { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self { - num_rows: self.num_rows + rhs.num_rows, - num_bytes: self.num_bytes + rhs.num_bytes, - } - } -} - -#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd)] -pub struct DataStoreStats { - pub type_registry: DataStoreRowStats, - pub metadata_registry: DataStoreRowStats, - - /// `num_rows` is really `num_cells` in this case. - pub static_tables: DataStoreRowStats, - - pub temporal: DataStoreRowStats, - pub temporal_buckets: u64, - - pub total: DataStoreRowStats, -} - -impl std::ops::Sub for DataStoreStats { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self { - type_registry: self.type_registry - rhs.type_registry, - metadata_registry: self.metadata_registry - rhs.metadata_registry, - static_tables: self.static_tables - rhs.static_tables, - temporal: self.temporal - rhs.temporal, - temporal_buckets: self.temporal_buckets - rhs.temporal_buckets, - total: self.total - rhs.total, - } - } -} - -impl std::ops::Add for DataStoreStats { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self { - type_registry: self.type_registry + rhs.type_registry, - metadata_registry: self.metadata_registry + rhs.metadata_registry, - static_tables: self.static_tables + rhs.static_tables, - temporal: self.temporal + rhs.temporal, - temporal_buckets: self.temporal_buckets + rhs.temporal_buckets, - total: self.total + rhs.total, - } - } -} - -impl DataStoreStats { - pub fn from_store(store: &DataStore) -> Self { - re_tracing::profile_function!(); - - let type_registry = { - re_tracing::profile_scope!("type_registry"); - DataStoreRowStats { - num_rows: store.type_registry.len() as _, - num_bytes: store.type_registry.total_size_bytes(), - } - }; - - let metadata_registry = { - re_tracing::profile_scope!("metadata_registry"); - DataStoreRowStats { - num_rows: store.metadata_registry.len() as _, - num_bytes: store.metadata_registry.total_size_bytes(), - } - }; - - let static_tables = { - re_tracing::profile_scope!("static data"); - DataStoreRowStats { - num_rows: store.num_static_rows(), - num_bytes: store.static_size_bytes(), - } - }; - - let (temporal, temporal_buckets) = { - re_tracing::profile_scope!("temporal"); - ( - DataStoreRowStats { - num_rows: store.num_temporal_rows(), - num_bytes: store.temporal_size_bytes(), - }, - store.num_temporal_buckets(), - ) - }; - - let total = DataStoreRowStats { - num_rows: static_tables.num_rows + temporal.num_rows, - num_bytes: type_registry.num_bytes - + metadata_registry.num_bytes - + static_tables.num_bytes - + temporal.num_bytes, - }; - - Self { - type_registry, - metadata_registry, - static_tables, - temporal, - temporal_buckets, - total, - } - } - - /// Both static & temporal data. 
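// --- Illustrative aside, not part of the original `re_data_store` sources ---
// Sketch: snapshotting store statistics around a mutation, the same pattern the
// GC entry point above uses to compute its `stats_diff` (assumes a `store:
// DataStore` in scope).
let before = DataStoreStats::from_store(&store);
// … insert rows or run a GC pass here …
let after = DataStoreStats::from_store(&store);
let (_num_rows, _num_bytes) = after.total_rows_and_bytes();
let _reclaimed = before - after; // field-wise subtraction, as implemented above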
- pub fn total_rows_and_bytes(&self) -> (u64, f64) { - let mut num_rows = self.temporal.num_rows + self.metadata_registry.num_rows; - let mut num_bytes = (self.temporal.num_bytes + self.metadata_registry.num_bytes) as f64; - - num_rows += self.static_tables.num_rows; - num_bytes += self.static_tables.num_bytes as f64; - - (num_rows, num_bytes) - } -} - -// --- Data store --- - -impl SizeBytes for MetadataRegistry<(TimePoint, EntityPathHash)> { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.heap_size_bytes - } -} - -impl SizeBytes for DataStore { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.static_size_bytes() + self.temporal_size_bytes() // approximate - } -} - -impl DataStore { - /// Returns the number of static rows stored across this entire store. - #[inline] - pub fn num_static_rows(&self) -> u64 { - // A static table only ever contains a single row. - self.static_tables.len() as _ - } - - /// Returns the size of the static data stored across this entire store. - #[inline] - pub fn static_size_bytes(&self) -> u64 { - re_tracing::profile_function!(); - self.static_tables - .values() - .map(|static_table| { - static_table - .cells - .values() - .map(|static_cell| static_cell.cell.total_size_bytes()) - .sum::() - }) - .sum() - } - - /// Returns the number of temporal index rows stored across this entire store, i.e. the sum of - /// the number of rows across all of its temporal indexed tables. - #[inline] - pub fn num_temporal_rows(&self) -> u64 { - re_tracing::profile_function!(); - self.tables.values().map(|table| table.num_rows()).sum() - } - - /// Returns the size of the temporal index data stored across this entire store, i.e. the sum - /// of the size of the data stored across all of its temporal indexed tables, in bytes. - #[inline] - pub fn temporal_size_bytes(&self) -> u64 { - re_tracing::profile_function!(); - self.tables - .values() - .map(|table| table.total_size_bytes()) - .sum() - } - - /// Returns the number of temporal indexed buckets stored across this entire store. - #[inline] - pub fn num_temporal_buckets(&self) -> u64 { - re_tracing::profile_function!(); - self.tables.values().map(|table| table.num_buckets()).sum() - } - - /// Stats for a specific entity path on a specific timeline - pub fn entity_stats( - &self, - timeline: re_log_types::Timeline, - entity_path_hash: re_log_types::EntityPathHash, - ) -> EntityStats { - let mut entity_stats = self.tables.get(&(entity_path_hash, timeline)).map_or( - EntityStats::default(), - |table| EntityStats { - num_rows: table.buckets_num_rows, - size_bytes: table.buckets_size_bytes, - time_range: table.time_range(), - num_static_cells: 0, - static_size_bytes: 0, - }, - ); - - if let Some(static_table) = self.static_tables.get(&entity_path_hash) { - entity_stats.num_static_cells = static_table.cells.len() as _; - entity_stats.static_size_bytes = static_table - .cells - .values() - .map(|static_cell| static_cell.cell.total_size_bytes()) - .sum(); - } - - entity_stats - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct EntityStats { - /// Number of rows in the table. - pub num_rows: u64, - - /// Approximate number of bytes used. - pub size_bytes: u64, - - /// The time covered by the entity. - pub time_range: re_log_types::ResolvedTimeRange, - - /// Number of static cells. - pub num_static_cells: u64, - - /// Approximate number of bytes used for static data. 
- pub static_size_bytes: u64, -} - -impl Default for EntityStats { - fn default() -> Self { - Self { - num_rows: 0, - size_bytes: 0, - time_range: re_log_types::ResolvedTimeRange::EMPTY, - num_static_cells: 0, - static_size_bytes: 0, - } - } -} - -// --- Temporal --- - -impl IndexedTable { - /// Returns the number of rows stored across this entire table, i.e. the sum of the number - /// of rows stored across all of its buckets. - #[inline] - pub fn num_rows(&self) -> u64 { - self.buckets_num_rows - } - - /// Returns the number of rows stored across this entire table, i.e. the sum of the number - /// of rows stored across all of its buckets. - /// - /// Recomputed from scratch, for sanity checking. - #[inline] - pub(crate) fn num_rows_uncached(&self) -> u64 { - re_tracing::profile_function!(); - self.buckets.values().map(|bucket| bucket.num_rows()).sum() - } - - #[inline] - pub(crate) fn size_bytes_uncached(&self) -> u64 { - re_tracing::profile_function!(); - self.stack_size_bytes() - + self - .buckets - .values() - .map(|bucket| bucket.total_size_bytes()) - .sum::() - } - - /// Returns the number of buckets stored across this entire table. - #[inline] - pub fn num_buckets(&self) -> u64 { - self.buckets.len() as _ - } - - /// The time range covered by this table. - pub fn time_range(&self) -> ResolvedTimeRange { - if let (Some((_, first)), Some((_, last))) = ( - self.buckets.first_key_value(), - self.buckets.last_key_value(), - ) { - first - .inner - .read() - .time_range - .union(last.inner.read().time_range) - } else { - ResolvedTimeRange::EMPTY - } - } -} - -impl SizeBytes for IndexedTable { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.buckets_size_bytes - } -} - -impl IndexedBucket { - /// Returns the number of rows stored across this bucket. - #[inline] - pub fn num_rows(&self) -> u64 { - self.inner.read().col_time.len() as u64 - } -} - -impl SizeBytes for IndexedBucket { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.inner.read().size_bytes - } -} - -impl IndexedBucketInner { - /// Computes and caches the size of both the control & component data stored in this bucket, - /// stack and heap included, in bytes. - /// - /// This is a best-effort approximation, adequate for most purposes (stats, - /// triggering GCs, …). 
- #[inline] - pub fn compute_size_bytes(&mut self) -> u64 { - re_tracing::profile_function!(); - - let Self { - is_sorted, - time_range, - col_time, - col_insert_id, - col_row_id, - max_row_id, - columns, - size_bytes, - } = self; - - *size_bytes = is_sorted.total_size_bytes() - + time_range.total_size_bytes() - + col_time.total_size_bytes() - + col_insert_id.total_size_bytes() - + col_row_id.total_size_bytes() - + max_row_id.total_size_bytes() - + columns.total_size_bytes() - + size_bytes.total_size_bytes(); - - *size_bytes - } -} diff --git a/crates/re_data_store/src/store_write.rs b/crates/re_data_store/src/store_write.rs deleted file mode 100644 index 1df6d2fd04fd..000000000000 --- a/crates/re_data_store/src/store_write.rs +++ /dev/null @@ -1,674 +0,0 @@ -use arrow2::datatypes::DataType; -use itertools::Itertools as _; -use nohash_hasher::IntMap; -use parking_lot::RwLock; - -use re_log::{debug, trace}; -use re_log_types::{ - DataCell, DataCellColumn, DataCellError, DataRow, EntityPathHash, ResolvedTimeRange, RowId, - TimeInt, TimePoint, VecDequeRemovalExt as _, -}; -use re_types_core::{ComponentName, ComponentNameSet, SizeBytes as _}; - -use crate::{ - DataStore, DataStoreConfig, IndexedBucket, IndexedBucketInner, IndexedTable, MetadataRegistry, - StaticCell, StaticTable, StoreDiff, StoreDiffKind, StoreEvent, -}; - -// --- Data store --- - -#[derive(thiserror::Error, Debug)] -pub enum WriteError { - #[error("The incoming data was inconsistent: {0}")] - DataRead(#[from] re_log_types::DataReadError), - - #[error("Error with one or more the underlying data cells")] - DataCell(#[from] DataCellError), - - #[error("The inserted data must contain at least one cell")] - Empty, - - #[error( - "Component '{component}' failed to typecheck: expected {expected:#?} but got {got:#?}" - )] - TypeCheck { - component: ComponentName, - expected: DataType, - got: DataType, - }, - - #[error("Attempted to re-use already taken RowId:{0}")] - ReusedRowId(RowId), -} - -pub type WriteResult = ::std::result::Result; - -impl DataStore { - /// Inserts a [`DataRow`]'s worth of components into the datastore. - pub fn insert_row(&mut self, row: &DataRow) -> WriteResult { - // TODO(cmc): kind & insert_id need to somehow propagate through the span system. - self.insert_id += 1; - - if row.num_cells() == 0 { - return Err(WriteError::Empty); - } - - let DataRow { - row_id, - timepoint, - entity_path, - cells, - } = row; - - self.metadata_registry - .upsert(*row_id, (timepoint.clone(), entity_path.hash()))?; - - re_tracing::profile_function!(); - - // Update type registry. - // TODO(#1809): not only this should be replaced by a central arrow runtime registry, it should - // also be implemented as a changelog subscriber. 
- for cell in row.cells().iter() { - self.type_registry - .insert(cell.component_name(), cell.datatype().clone()); - } - - let entity_path_hash = entity_path.hash(); - - trace!( - kind = "insert", - id = self.insert_id, - timelines = ?timepoint.iter() - .map(|(timeline, time)| (timeline.name(), timeline.typ().format_utc(*time))) - .collect::>(), - %entity_path, - components = ?cells.iter().map(|cell| cell.component_name()).collect_vec(), - "insertion started…" - ); - - let insert_id = self.config.store_insert_ids.then_some(self.insert_id); - - let diff = if timepoint.is_static() { - let static_table = self - .static_tables - .entry(entity_path_hash) - .or_insert_with(|| StaticTable::new(entity_path.clone())); - - let cells = row - .cells() - .iter() - .filter(|cell| { - static_table - .cells - .get(&cell.component_name()) - // Last-write-wins semantics, where ordering is defined by RowId. - .map_or(true, |static_cell| static_cell.row_id < *row_id) - }) - .collect_vec(); - - for cell in &cells { - static_table.cells.insert( - cell.component_name(), - StaticCell { - insert_id, - row_id: *row_id, - cell: (*cell).clone(), - }, - ); - } - - let mut diff = StoreDiff::addition(*row_id, entity_path.clone()); - diff.with_cells(cells.into_iter().cloned()); - diff - } else { - for (timeline, time) in timepoint.iter() { - let entity_path = entity_path.clone(); // shallow - let index = self - .tables - .entry((entity_path_hash, *timeline)) - .or_insert_with(|| IndexedTable::new(*timeline, entity_path)); - - index.insert_row(&self.config, insert_id, *time, row); - } - - let mut diff = StoreDiff::addition(*row_id, entity_path.clone()); - diff.at_timepoint(timepoint.clone()) - .with_cells(cells.iter().cloned()); - diff - }; - - let event = StoreEvent { - store_id: self.id.clone(), - store_generation: self.generation(), - event_id: self - .event_id - .fetch_add(1, std::sync::atomic::Ordering::Relaxed), - diff, - }; - - { - let events = &[event.clone()]; - - if cfg!(debug_assertions) { - let any_event_other_than_addition = - events.iter().any(|e| e.kind != StoreDiffKind::Addition); - assert!(!any_event_other_than_addition); - } - - Self::on_events(events); - } - - Ok(event) - } -} - -impl MetadataRegistry<(TimePoint, EntityPathHash)> { - fn upsert(&mut self, row_id: RowId, data: (TimePoint, EntityPathHash)) -> WriteResult<()> { - match self.entry(row_id) { - std::collections::btree_map::Entry::Occupied(_) => Err(WriteError::ReusedRowId(row_id)), - std::collections::btree_map::Entry::Vacant(entry) => { - // NOTE: In a map, thus on the heap! - let added_size_bytes = row_id.total_size_bytes() + data.total_size_bytes(); - - // This is valuable information even for a timeless timepoint! 
- entry.insert(data); - - self.heap_size_bytes += added_size_bytes; - - Ok(()) - } - } - } -} - -// --- Temporal --- - -impl IndexedTable { - pub fn insert_row( - &mut self, - config: &DataStoreConfig, - insert_id: Option, - time: TimeInt, - row: &DataRow, - ) { - re_tracing::profile_function!(); - - let components: ComponentNameSet = row.component_names().collect(); - - // borrowck workaround - let timeline = self.timeline; - let entity_path = self.entity_path.clone(); // shallow - - let (_, bucket) = self.find_bucket_mut(time); - - let len = bucket.num_rows(); - let len_overflow = len > config.indexed_bucket_num_rows; - - if len_overflow { - let bucket_size_before = bucket.total_size_bytes(); - if let Some((min, second_half)) = bucket.split() { - trace!( - kind = "insert", - timeline = %timeline.name(), - time = timeline.typ().format_utc(time), - %entity_path, - len_limit = config.indexed_bucket_num_rows, - len, len_overflow, - new_time_bound = timeline.typ().format_utc(min), - "splitting off indexed bucket following overflow" - ); - - self.buckets_size_bytes += - bucket.total_size_bytes() + second_half.total_size_bytes(); - self.buckets_size_bytes -= bucket_size_before; - self.buckets.insert(min, second_half); - - return self.insert_row(config, insert_id, time, row); - } - - // We couldn't split the bucket, either because it's already too small, or because it - // contains a unique timepoint value that's repeated multiple times. - // - // * If the bucket is that small, then there really is no better thing to do than - // letting it grow some more by appending to it. - // - // * If the timepoint we're trying to insert is smaller or equal to the current upper - // bound of the bucket, then at this point we have no choice but to insert it here - // (by definition, it is impossible that any previous bucket in the chain covers a - // time range that includes this timepoint: buckets are non-overlapping!). - // - // * Otherwise, if the timepoint we're trying to insert is greater than the upper bound - // of the current bucket, then it means that there currently exist no bucket that - // covers a time range which includes this timepoint (if such a bucket existed, then - // we would have stumbled upon it before ever finding the current one!). - // This gives us an opportunity to create a new bucket that starts at the upper - // bound of the current one _excluded_ and that ranges all the way up to the - // timepoint that we're inserting. - // Not only is this a great opportunity to naturally split things up, it's actually - // mandatory to avoid a nasty edge case where one keeps inserting into a full, - // unsplittable bucket and indefinitely creates new single-entry buckets, leading - // to the worst-possible case of fragmentation. 
- - let (bucket_upper_bound, bucket_len) = { - let guard = bucket.inner.read(); - (guard.col_time.back().copied(), guard.col_time.len()) - }; - - if let Some(upper_bound) = bucket_upper_bound { - if bucket_len > 2 && time.as_i64() > upper_bound { - let new_time_bound = upper_bound + 1; - debug!( - kind = "insert", - timeline = %timeline.name(), - time = timeline.typ().format_utc(time), - %entity_path, - len_limit = config.indexed_bucket_num_rows, - len, len_overflow, - new_time_bound = timeline.typ().format_utc(TimeInt::new_temporal(new_time_bound)), - "creating brand new indexed bucket following overflow" - ); - - let (inner, inner_size_bytes) = { - let mut inner = IndexedBucketInner { - time_range: ResolvedTimeRange::new(time, time), - ..Default::default() - }; - let size_bytes = inner.compute_size_bytes(); - (inner, size_bytes) - }; - self.buckets.insert( - TimeInt::new_temporal(new_time_bound), - IndexedBucket { - timeline, - inner: RwLock::new(inner), - }, - ); - - self.buckets_size_bytes += inner_size_bytes; - return self.insert_row(config, insert_id, time, row); - } - } - - if 0 < config.indexed_bucket_num_rows { - let bucket_time_range = bucket.inner.read().time_range; - - re_log::debug_once!("Failed to split bucket on timeline {}", timeline.name()); - - if 1 < config.indexed_bucket_num_rows - && bucket_time_range.min() == bucket_time_range.max() - { - re_log::warn_once!( - "Found over {} rows with the same timepoint {:?}={} - perhaps you forgot to update or remove the timeline?", - config.indexed_bucket_num_rows, - bucket.timeline.name(), - bucket.timeline.typ().format_utc(bucket_time_range.min()) - ); - } - } - } - - trace!( - kind = "insert", - timeline = %timeline.name(), - time = timeline.typ().format_utc(time), - %entity_path, - ?components, - "inserted into indexed tables" - ); - - let size_bytes = bucket.insert_row(insert_id, time, row, &components); - self.buckets_size_bytes += size_bytes; - self.buckets_num_rows += 1; - - // Insert components last, only if bucket-insert succeeded. - self.all_components.extend(components); - } -} - -impl IndexedBucket { - /// Returns the size in bytes of the inserted arrow data. 
- fn insert_row( - &mut self, - insert_id: Option, - time: TimeInt, - row: &DataRow, - components: &ComponentNameSet, - ) -> u64 { - re_tracing::profile_function!(); - - let mut size_bytes_added = 0u64; - let _num_rows = self.num_rows() as usize; - - let mut inner = self.inner.write(); - let IndexedBucketInner { - is_sorted, - time_range, - col_time, - col_insert_id, - col_row_id, - max_row_id, - columns, - size_bytes, - } = &mut *inner; - - // append time to primary column and update time range appropriately - - if let (Some(last_time), Some(last_row_id)) = (col_time.back(), col_row_id.back()) { - // NOTE: Within a single timestamp, we use the Row ID as tie-breaker - *is_sorted &= (*last_time, *last_row_id) <= (time.as_i64(), row.row_id()); - } - - col_time.push_back(time.as_i64()); - *time_range = - ResolvedTimeRange::new(time_range.min().min(time), time_range.max().max(time)); - size_bytes_added += time.as_i64().total_size_bytes(); - - // update all control columns - if let Some(insert_id) = insert_id { - col_insert_id.push_back(insert_id); - size_bytes_added += insert_id.total_size_bytes(); - } - col_row_id.push_back(row.row_id()); - *max_row_id = RowId::max(*max_row_id, row.row_id()); - size_bytes_added += row.row_id().total_size_bytes(); - - // append components to their respective columns (2-way merge) - - // 2-way merge, step 1: left-to-right - for cell in row.cells().iter() { - let component_name = cell.component_name(); - let column = columns.entry(component_name).or_insert_with(|| { - let column = DataCellColumn::empty(col_time.len().saturating_sub(1)); - size_bytes_added += component_name.total_size_bytes(); - size_bytes_added += column.total_size_bytes(); - column - }); - size_bytes_added += cell.total_size_bytes(); - column.0.push_back(Some(cell.clone() /* shallow */)); - } - - // 2-way merge, step 2: right-to-left - // - // fill unimpacted columns with null values - for (component_name, column) in &mut *columns { - if !components.contains(component_name) { - let none_cell: Option = None; - size_bytes_added += none_cell.total_size_bytes(); - column.0.push_back(none_cell); - } - } - - *size_bytes += size_bytes_added; - - #[cfg(debug_assertions)] - #[allow(clippy::unwrap_used)] - { - drop(inner); - self.sanity_check().unwrap(); - } - - size_bytes_added - } - - /// Splits the bucket into two, potentially uneven parts. - /// - /// On success..: - /// - the first part is split in place (i.e. modifies `self`), - /// - the second part is returned as a new bucket, - /// - and the minimal bound of that new bucket is returned as a `TimeInt`, for indexing. - /// - /// Returns `None` on failure, i.e. if the bucket cannot be split any further, which can - /// happen either because the bucket is too small to begin with, or because it only contains - /// a single timepoint. - /// - /// # Unsplittable buckets - /// - /// The datastore and query path operate under the general assumption that _all of the data_ - /// for a given timepoint will reside in _one and only one_ bucket. - /// This function makes sure to uphold that restriction, which sometimes means splitting the - /// bucket into two uneven parts, or even not splitting it at all. 
- /// - /// Run the following command to display a visualization of the store's internal - /// datastructures and better understand how everything fits together: - /// ```text - /// cargo test -p re_data_store -- --nocapture datastore_internal_repr - /// ``` - fn split(&self) -> Option<(TimeInt, Self)> { - let Self { timeline, inner } = self; - - let mut inner1 = inner.write(); - - if inner1.col_time.len() < 2 { - return None; // early exit: can't split the unsplittable - } - - if inner1.time_range.abs_length() == 0 { - // The entire bucket contains only one timepoint, thus it's impossible to find - // a split index to begin with. - return None; - } - - re_tracing::profile_function!(); - - inner1.sort(); - - let IndexedBucketInner { - is_sorted: _, - time_range: time_range1, - col_time: col_time1, - col_insert_id: col_insert_id1, - col_row_id: col_row_id1, - max_row_id: max_row_id1, - columns: columns1, - size_bytes: _, // NOTE: recomputed below - } = &mut *inner1; - - let timeline = *timeline; - - // Used in debug builds to assert that we've left everything in a sane state. - let _num_rows = col_time1.len(); - - let (min2, bucket2) = { - col_time1.make_contiguous(); - let (times1, &[]) = col_time1.as_slices() else { - unreachable!(); - }; - let split_idx = find_split_index(times1).expect("must be splittable at this point"); - - let (time_range2, col_time2, col_insert_id2, col_row_id2) = { - re_tracing::profile_scope!("control"); - // update everything _in place_! - ( - split_time_range_off(split_idx, times1, time_range1), - col_time1.split_off_or_default(split_idx), - col_insert_id1.split_off_or_default(split_idx), - col_row_id1.split_off_or_default(split_idx), - ) - }; - // NOTE: We _have_ to fullscan here: the bucket is sorted by `(Time, RowId)`, there - // could very well be a greater lurking in a lesser entry. - *max_row_id1 = col_row_id1.iter().max().copied().unwrap_or(RowId::ZERO); - - // this updates `columns1` in-place! - let columns2: IntMap<_, _> = { - re_tracing::profile_scope!("data"); - columns1 - .iter_mut() - .map(|(name, column1)| { - if split_idx >= column1.len() { - return (*name, DataCellColumn(Default::default())); - } - - // this updates `column1` in-place! - let column2 = DataCellColumn(column1.split_off(split_idx)); - (*name, column2) - }) - .collect() - }; - - let inner2 = { - // NOTE: We _have_ to fullscan here: the bucket is sorted by `(Time, RowId)`, there - // could very well be a greater lurking in a lesser entry. - let max_row_id2 = col_row_id2.iter().max().copied().unwrap_or(RowId::ZERO); - let mut inner2 = IndexedBucketInner { - is_sorted: true, - time_range: time_range2, - col_time: col_time2, - col_insert_id: col_insert_id2, - col_row_id: col_row_id2, - max_row_id: max_row_id2, - columns: columns2, - size_bytes: 0, // NOTE: computed below - }; - inner2.compute_size_bytes(); - inner2 - }; - let bucket2 = Self { - timeline, - inner: RwLock::new(inner2), - }; - - (time_range2.min(), bucket2) - }; - - inner1.compute_size_bytes(); - - // sanity checks - #[cfg(debug_assertions)] - #[allow(clippy::unwrap_used)] - { - drop(inner1); // sanity checking will grab the lock! 
- self.sanity_check().unwrap(); - bucket2.sanity_check().unwrap(); - - let num_rows1 = self.num_rows() as i64; - let num_rows2 = bucket2.num_rows() as i64; - debug_assert_eq!( - _num_rows as i64, - num_rows1 + num_rows2, - "expected both buckets to sum up to the length of the original bucket" - ); - } - - Some((min2, bucket2)) - } -} - -/// Finds an optimal split point for the given time index, or `None` if all entries in the index -/// are identical, making it unsplittable. -/// -/// The returned index is _exclusive_: `[0, split_idx)` + `[split_idx; len)`. -/// -/// # Panics -/// -/// This function expects `times` to be sorted! -/// In debug builds, it will panic if that's not the case. -fn find_split_index(times: &[i64]) -> Option { - debug_assert!( - times.windows(2).all(|t| t[0] <= t[1]), - "time index must be sorted before splitting!" - ); - - if times.first() == times.last() { - return None; // early exit: unsplittable - } - - re_tracing::profile_function!(); - - // This can never be lesser than 1 as we never split buckets smaller than 2 entries. - let halfway_idx = times.len() / 2; - let target = times[halfway_idx]; - - // Are we about to split in the middle of a continuous run? Hop backwards to figure it out. - let split_idx1 = Some(times[..halfway_idx].partition_point(|&t| t < target)).filter(|&i| i > 0); - - // Are we about to split in the middle of a continuous run? Hop forwards to figure it out. - let split_idx2 = Some(times[halfway_idx..].partition_point(|&t| t <= target)) - .map(|t| t + halfway_idx) // we skipped that many entries! - .filter(|&t| t < times.len()); - - // Are we in the middle of a backwards continuous run? a forwards continuous run? both? - match (split_idx1, split_idx2) { - // Unsplittable, which cannot happen as we already early-exit earlier. - #[cfg(not(debug_assertions))] - (None, None) => None, - #[cfg(debug_assertions)] - (None, None) => unreachable!(), - - // Backwards run, let's use the first split index. - (Some(split_idx1), None) => Some(split_idx1), - - // Forwards run, let's use the second split index. - (None, Some(split_idx2)) => Some(split_idx2), - - // The run goes both backwards and forwards from the half point: use the split index - // that's the closest to halfway. - (Some(split_idx1), Some(split_idx2)) => { - if halfway_idx.abs_diff(split_idx1) < halfway_idx.abs_diff(split_idx2) { - split_idx1 - } else { - split_idx2 - } - .into() - } - } -} - -#[test] -fn test_find_split_index() { - let test_cases = [ - (vec![1, 1], None), - // - (vec![1, 1, 1], None), - (vec![1, 1, 2], Some(2)), - (vec![0, 1, 1], Some(1)), - // - (vec![1, 1, 1, 1], None), - (vec![1, 1, 1, 2], Some(3)), - (vec![0, 1, 1, 1], Some(1)), - // - (vec![1, 1, 1, 1, 1], None), - (vec![1, 1, 1, 1, 2], Some(4)), - (vec![0, 1, 1, 1, 1], Some(1)), - (vec![0, 1, 1, 1, 2], Some(1)), // first one wins when equal distances - (vec![0, 1, 1, 2, 2], Some(3)), // second one is closer - (vec![0, 0, 1, 2, 2], Some(2)), // first one wins when equal distances - (vec![0, 0, 2, 2, 2], Some(2)), // second one is closer - (vec![0, 0, 0, 2, 2], Some(3)), // first one is closer - ]; - - for (times, expected) in test_cases { - let got = find_split_index(×); - assert_eq!(expected, got); - } -} - -/// Given a time index and a desired split index, splits off the given time range in place, -/// and returns a new time range corresponding to the second part. -/// -/// The split index is exclusive: everything up to `split_idx` (excluded) will end up in the -/// first split. 
-/// -/// The two resulting time range halves are guaranteed to never overlap. -fn split_time_range_off( - split_idx: usize, - times1: &[i64], - time_range1: &mut ResolvedTimeRange, -) -> ResolvedTimeRange { - let time_range2 = - ResolvedTimeRange::new(TimeInt::new_temporal(times1[split_idx]), time_range1.max()); - - // This can never fail (underflow or OOB) because we never split buckets smaller than 2 - // entries. - time_range1.set_max(times1[split_idx - 1]); - - debug_assert!( - time_range1.max().as_i64() < time_range2.min().as_i64(), - "split resulted in overlapping time ranges: {} <-> {}\n{:#?}", - time_range1.max().as_i64(), - time_range2.min().as_i64(), - (&time_range1, &time_range2), - ); - - time_range2 -} diff --git a/crates/re_data_store/src/test_util.rs b/crates/re_data_store/src/test_util.rs deleted file mode 100644 index 498d7132bd33..000000000000 --- a/crates/re_data_store/src/test_util.rs +++ /dev/null @@ -1,89 +0,0 @@ -use re_log_types::DataTable; - -use crate::{DataStore, DataStoreConfig, WriteError}; - -// --- - -#[doc(hidden)] -#[macro_export] -macro_rules! test_row { - ($entity:ident => [$c0:expr $(,)*]) => {{ - ::re_log_types::DataRow::from_cells1_sized( - ::re_log_types::RowId::new(), - $entity.clone(), - ::re_log_types::TimePoint::default(), - $c0, - ) - .unwrap() - }}; - ($entity:ident @ $frames:tt => [$c0:expr $(,)*]) => {{ - ::re_log_types::DataRow::from_cells1_sized( - ::re_log_types::RowId::new(), - $entity.clone(), - $frames, - $c0, - ) - .unwrap() - }}; - ($entity:ident @ $frames:tt => [$c0:expr, $c1:expr $(,)*]) => {{ - ::re_log_types::DataRow::from_cells2_sized( - ::re_log_types::RowId::new(), - $entity.clone(), - $frames, - ($c0, $c1), - ) - .unwrap() - }}; -} - -pub fn all_configs() -> impl Iterator { - const INDEX_CONFIGS: &[DataStoreConfig] = &[ - DataStoreConfig::DEFAULT, - DataStoreConfig { - indexed_bucket_num_rows: 0, - ..DataStoreConfig::DEFAULT - }, - DataStoreConfig { - indexed_bucket_num_rows: 1, - ..DataStoreConfig::DEFAULT - }, - DataStoreConfig { - indexed_bucket_num_rows: 2, - ..DataStoreConfig::DEFAULT - }, - DataStoreConfig { - indexed_bucket_num_rows: 3, - ..DataStoreConfig::DEFAULT - }, - ]; - INDEX_CONFIGS.iter().map(|idx| DataStoreConfig { - indexed_bucket_num_rows: idx.indexed_bucket_num_rows, - store_insert_ids: idx.store_insert_ids, - }) -} - -pub fn sanity_unwrap(store: &DataStore) { - if let err @ Err(_) = store.sanity_check() { - store.sort_indices_if_needed(); - eprintln!("{store}"); - #[allow(clippy::unwrap_used)] // we want to panic here - err.unwrap(); - } -} - -// We very often re-use RowIds when generating test data. -pub fn insert_table_with_retries(store: &mut DataStore, table: &DataTable) { - #[allow(clippy::unwrap_used)] // ok for tests - for row in table.to_rows() { - let mut row = row.unwrap(); - loop { - match store.insert_row(&row) { - Ok(_) => break, - Err(WriteError::ReusedRowId(_)) => { - row.row_id = row.row_id.next(); - } - err @ Err(_) => err.map(|_| ()).unwrap(), - } - } - } -} diff --git a/crates/re_data_store/tests/correctness.rs b/crates/re_data_store/tests/correctness.rs deleted file mode 100644 index ca913485130d..000000000000 --- a/crates/re_data_store/tests/correctness.rs +++ /dev/null @@ -1,581 +0,0 @@ -//! Correctness tests. -//! -//! Bending and twisting the datastore APIs in all kinds of weird ways to try and break them. 
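The deleted `test_util.rs` above relies on one idiom worth calling out: since test data frequently re-uses `RowId`s, `insert_table_with_retries` keeps bumping the id with `row_id.next()` until the store accepts the row instead of failing with `WriteError::ReusedRowId`. Here is a small, dependency-free sketch of that retry loop, with a toy store standing in for `DataStore`; all names below are illustrative, not part of the original crate.

```rust
use std::collections::BTreeSet;

/// Toy stand-in for a store that rejects an already-used row id,
/// mirroring `WriteError::ReusedRowId` in the deleted crate.
#[derive(Default)]
struct ToyStore {
    seen_row_ids: BTreeSet<u64>,
}

enum WriteError {
    ReusedRowId(u64),
}

impl ToyStore {
    fn insert_row(&mut self, row_id: u64) -> Result<(), WriteError> {
        if !self.seen_row_ids.insert(row_id) {
            return Err(WriteError::ReusedRowId(row_id));
        }
        Ok(())
    }
}

/// Same shape as `insert_table_with_retries`: bump the id until the insertion succeeds.
fn insert_with_retries(store: &mut ToyStore, mut row_id: u64) -> u64 {
    loop {
        match store.insert_row(row_id) {
            Ok(()) => return row_id,
            Err(WriteError::ReusedRowId(_)) => row_id += 1,
        }
    }
}

fn main() {
    let mut store = ToyStore::default();
    assert_eq!(insert_with_retries(&mut store, 1), 1);
    assert_eq!(insert_with_retries(&mut store, 1), 2); // same id again: retried with the next one
}
```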
- -// https://github.com/rust-lang/rust-clippy/issues/10011 -#![cfg(test)] - -use rand::Rng; - -use re_data_store::{ - test_row, test_util::sanity_unwrap, DataStore, DataStoreConfig, DataStoreStats, - GarbageCollectionOptions, LatestAtQuery, WriteError, -}; -use re_log_types::example_components::{MyColor, MyIndex, MyPoint}; -use re_log_types::{ - build_frame_nr, build_log_time, DataRow, Duration, EntityPath, RowId, Time, TimeInt, TimePoint, - TimeType, Timeline, -}; -use re_types_core::Loggable as _; - -// --- - -fn query_latest_component( - store: &DataStore, - entity_path: &EntityPath, - query: &LatestAtQuery, -) -> Option<(TimeInt, RowId, C)> { - re_tracing::profile_function!(); - - let (data_time, row_id, cells) = - store.latest_at(query, entity_path, C::name(), &[C::name()])?; - let cell = cells.first()?.as_ref()?; - - cell.try_to_native_mono::() - .ok()? - .map(|c| (data_time, row_id, c)) -} - -// --- - -#[test] -fn row_id_ordering_semantics() -> anyhow::Result<()> { - let entity_path: EntityPath = "some_entity".into(); - - let timeline_frame = Timeline::new_sequence("frame"); - let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); - - let point1 = MyPoint::new(1.0, 1.0); - let point2 = MyPoint::new(2.0, 2.0); - - // * Insert `point1` at frame #10 with a random `RowId`. - // * Insert `point2` at frame #10 with a random `RowId`. - // * Query at frame #11 and make sure we get `point2` because random `RowId`s are monotonically - // increasing. - { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - Default::default(), - ); - - let row_id = RowId::new(); - let row = DataRow::from_component_batches( - row_id, - timepoint.clone(), - entity_path.clone(), - [&[point1] as _], - )?; - store.insert_row(&row)?; - - let row_id = RowId::new(); - let row = DataRow::from_component_batches( - row_id, - timepoint.clone(), - entity_path.clone(), - [&[point2] as _], - )?; - store.insert_row(&row)?; - - { - let query = LatestAtQuery::new(timeline_frame, 11); - let (_, _, got_point) = - query_latest_component::(&store, &entity_path, &query).unwrap(); - similar_asserts::assert_eq!(point2, got_point); - } - } - - // * Insert `point1` at frame #10 with a random `RowId`. - // * Fail to insert `point2` at frame #10 using `point1`s `RowId` because it is illegal. - { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - Default::default(), - ); - - let row_id = RowId::new(); - - let row = DataRow::from_component_batches( - row_id, - timepoint.clone(), - entity_path.clone(), - [&[point1] as _], - )?; - store.insert_row(&row)?; - - let row = DataRow::from_component_batches( - row_id, - timepoint.clone(), - entity_path.clone(), - [&[point2] as _], - )?; - - let res = store.insert_row(&row); - assert!(matches!(res, Err(WriteError::ReusedRowId(_)),)); - } - - // * Insert `point1` at frame #10 with a random `RowId`. - // * Insert `point2` at frame #10 using `point1`'s `RowId`, decremented by one. - // * Query at frame #11 and make sure we get `point1` because of intra-timestamp tie-breaks. 
- { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - Default::default(), - ); - - let row_id1 = RowId::new(); - let row_id2 = row_id1.next(); - - let row = DataRow::from_component_batches( - row_id2, - timepoint.clone(), - entity_path.clone(), - [&[point1] as _], - )?; - store.insert_row(&row)?; - - let row = DataRow::from_component_batches( - row_id1, - timepoint.clone(), - entity_path.clone(), - [&[point2] as _], - )?; - store.insert_row(&row)?; - - { - let query = LatestAtQuery::new(timeline_frame, 11); - let (_, _, got_point) = - query_latest_component::(&store, &entity_path, &query).unwrap(); - similar_asserts::assert_eq!(point1, got_point); - } - } - - // Static data has last-write-wins semantics, as defined by RowId-ordering. - // Timeless is RowId-ordered too! - // - // * Insert static `point1` with a random `RowId`. - // * Insert static `point2` using `point1`'s `RowId`, decremented by one. - // * Query and make sure we get `point1` because of last-write-wins semantics. - { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - Default::default(), - ); - - let row_id1 = RowId::new(); - let row_id2 = row_id1.next(); - - let row = DataRow::from_component_batches( - row_id2, - TimePoint::default(), - entity_path.clone(), - [&[point1] as _], - )?; - store.insert_row(&row)?; - - let row = DataRow::from_component_batches( - row_id1, - TimePoint::default(), - entity_path.clone(), - [&[point2] as _], - )?; - store.insert_row(&row)?; - - { - let query = LatestAtQuery::new(Timeline::new_temporal("doesnt_matter"), TimeInt::MAX); - let (_, _, got_point) = - query_latest_component::(&store, &entity_path, &query).unwrap(); - similar_asserts::assert_eq!(point1, got_point); - } - } - - Ok(()) -} - -// --- - -#[test] -fn write_errors() { - re_log::setup_logging(); - - let entity_path = EntityPath::from("this/that"); - - { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - Default::default(), - ); - - let mut row = test_row!(entity_path @ [ - build_frame_nr(1), - build_log_time(Time::now()), - ] => [ MyPoint::from_iter(0..1) ]); - - row.row_id = re_log_types::RowId::new(); - store.insert_row(&row).unwrap(); - - row.row_id = row.row_id.next(); - store.insert_row(&row).unwrap(); - - assert!(matches!( - store.insert_row(&row), - Err(WriteError::ReusedRowId(_)), - )); - - let err = store.insert_row(&row).unwrap_err(); - let WriteError::ReusedRowId(err_row_id) = err else { - unreachable!(); - }; - assert_eq!(row.row_id(), err_row_id); - } -} - -// --- - -#[test] -fn latest_at_emptiness_edge_cases() { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - latest_at_emptiness_edge_cases_impl(&mut store); - } -} - -fn latest_at_emptiness_edge_cases_impl(store: &mut DataStore) { - let entity_path = EntityPath::from("this/that"); - let now = Time::now(); - let now_minus_1s = now - Duration::from_secs(1.0); - let now_minus_1s_nanos = now_minus_1s.nanos_since_epoch(); - let frame39 = 39; - let frame40 = 40; - let num_instances = 3; - - store - .insert_row(&test_row!(entity_path @ [ - build_log_time(now), build_frame_nr(frame40), - ] => [MyIndex::from_iter(0..num_instances as _)])) - .unwrap(); - - sanity_unwrap(store); - - let timeline_wrong_name = Timeline::new("lag_time", TimeType::Time); - let 
timeline_wrong_kind = Timeline::new("log_time", TimeType::Sequence); - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - let timeline_log_time = Timeline::log_time(); - - // empty frame_nr - { - let cells = store.latest_at( - &LatestAtQuery::new(timeline_frame_nr, frame39), - &entity_path, - MyIndex::name(), - &[MyIndex::name()], - ); - assert!(cells.is_none()); - } - - // empty log_time - { - let cells = store.latest_at( - &LatestAtQuery::new(timeline_log_time, now_minus_1s_nanos), - &entity_path, - MyIndex::name(), - &[MyIndex::name()], - ); - assert!(cells.is_none()); - } - - // wrong entity path - { - let cells = store.latest_at( - &LatestAtQuery::new(timeline_frame_nr, frame40), - &EntityPath::from("does/not/exist"), - MyIndex::name(), - &[MyIndex::name()], - ); - assert!(cells.is_none()); - } - - // bunch of non-existing components - { - let components = &["does".into(), "not".into(), "exist".into()]; - let cells = store.latest_at( - &LatestAtQuery::new(timeline_frame_nr, frame40), - &entity_path, - MyIndex::name(), - components, - ); - assert!(cells.is_none()); - } - - // empty component list - { - let cells = store.latest_at( - &LatestAtQuery::new(timeline_frame_nr, frame40), - &entity_path, - MyIndex::name(), - &[], - ); - assert!(cells.is_none()); - } - - // wrong timeline name - { - let cells = store.latest_at( - &LatestAtQuery::new(timeline_wrong_name, frame40), - &EntityPath::from("does/not/exist"), - MyIndex::name(), - &[MyIndex::name()], - ); - assert!(cells.is_none()); - } - - // wrong timeline kind - { - let cells = store.latest_at( - &LatestAtQuery::new(timeline_wrong_kind, frame40), - &EntityPath::from("does/not/exist"), - MyIndex::name(), - &[MyIndex::name()], - ); - assert!(cells.is_none()); - } -} - -// --- - -#[test] -fn gc_correct() { - re_log::setup_logging(); - - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig::default(), - ); - - let stats_empty = DataStoreStats::from_store(&store); - - let mut rng = rand::thread_rng(); - - let num_frames = rng.gen_range(0..=100); - let frames = (0..num_frames).filter(|_| rand::thread_rng().gen()); - for frame_nr in frames { - let num_ents = 10; - for i in 0..num_ents { - let entity_path = EntityPath::from(format!("this/that/{i}")); - let num_instances = rng.gen_range(0..=1_000); - let row = test_row!(entity_path @ [ - build_frame_nr(frame_nr), - ] => [ - MyColor::from_iter(0..num_instances), - ]); - store.insert_row(&row).unwrap(); - } - } - - sanity_unwrap(&store); - check_still_readable(&store); - - let stats = DataStoreStats::from_store(&store); - - let (store_events, stats_diff) = store.gc(&GarbageCollectionOptions::gc_everything()); - let stats_diff = stats_diff + stats_empty; // account for fixed overhead - - assert_eq!( - stats.metadata_registry.num_rows, - stats_diff.metadata_registry.num_rows - ); - assert_eq!( - stats.metadata_registry.num_bytes, - stats_diff.metadata_registry.num_bytes - ); - assert_eq!(stats.temporal.num_rows, stats_diff.temporal.num_rows); - - sanity_unwrap(&store); - check_still_readable(&store); - for event in store_events { - assert!(store.row_metadata(&event.row_id).is_none()); - } - - let (store_events, stats_diff) = store.gc(&GarbageCollectionOptions::gc_everything()); - assert!(store_events.is_empty()); - assert_eq!(DataStoreStats::default(), stats_diff); - - sanity_unwrap(&store); - check_still_readable(&store); -} - -fn check_still_readable(store: &DataStore) { - store.to_data_table().unwrap(); // 
simple way of checking that everything is still readable -} - -// This used to panic because the GC will decrement the metadata_registry size trackers before -// getting the confirmation that the row was really removed. -#[test] -fn gc_metadata_size() -> anyhow::Result<()> { - for enable_batching in [false, true] { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - Default::default(), - ); - - let point = MyPoint::new(1.0, 1.0); - - for _ in 0..3 { - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::default(), - "xxx".into(), - [&[point] as _], - )?; - store.insert_row(&row).unwrap(); - } - - for _ in 0..2 { - _ = store.gc(&GarbageCollectionOptions { - target: re_data_store::GarbageCollectionTarget::DropAtLeastFraction(1.0), - protect_latest: 1, - purge_empty_tables: false, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching, - time_budget: std::time::Duration::MAX, - }); - _ = store.gc(&GarbageCollectionOptions { - target: re_data_store::GarbageCollectionTarget::DropAtLeastFraction(1.0), - protect_latest: 1, - purge_empty_tables: false, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching, - time_budget: std::time::Duration::MAX, - }); - } - } - - Ok(()) -} - -// --- - -#[test] -fn entity_min_time_correct() -> anyhow::Result<()> { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - entity_min_time_correct_impl(&mut store)?; - } - - Ok(()) -} - -fn entity_min_time_correct_impl(store: &mut DataStore) -> anyhow::Result<()> { - let entity_path = EntityPath::from("this/that"); - let wrong_entity_path = EntityPath::from("this/that/other"); - - let point = MyPoint::new(1.0, 1.0); - let timeline_wrong_name = Timeline::new("lag_time", TimeType::Time); - let timeline_wrong_kind = Timeline::new("log_time", TimeType::Sequence); - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - let timeline_log_time = Timeline::log_time(); - - let now = Time::now(); - let now_plus_one = now + Duration::from_secs(1.0); - let now_minus_one = now - Duration::from_secs(1.0); - - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::default() - .with(timeline_log_time, now) - .with(timeline_frame_nr, 42), - entity_path.clone(), - [&[point] as _], - )?; - - store.insert_row(&row).unwrap(); - - assert!(store - .entity_min_time(&timeline_wrong_name, &entity_path) - .is_none()); - assert!(store - .entity_min_time(&timeline_wrong_kind, &entity_path) - .is_none()); - assert_eq!( - store.entity_min_time(&timeline_frame_nr, &entity_path), - Some(TimeInt::new_temporal(42)) - ); - assert_eq!( - store.entity_min_time(&timeline_log_time, &entity_path), - Some(TimeInt::try_from(now).unwrap()) - ); - assert!(store - .entity_min_time(&timeline_frame_nr, &wrong_entity_path) - .is_none()); - - // insert row in the future, these shouldn't be visible - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::default() - .with(timeline_log_time, now_plus_one) - .with(timeline_frame_nr, 54), - entity_path.clone(), - [&[point] as _], - )?; - store.insert_row(&row).unwrap(); - - assert!(store - .entity_min_time(&timeline_wrong_name, &entity_path) - .is_none()); - assert!(store - .entity_min_time(&timeline_wrong_kind, &entity_path) - 
.is_none()); - assert_eq!( - store.entity_min_time(&timeline_frame_nr, &entity_path), - Some(TimeInt::new_temporal(42)) - ); - assert_eq!( - store.entity_min_time(&timeline_log_time, &entity_path), - Some(TimeInt::try_from(now).unwrap()) - ); - assert!(store - .entity_min_time(&timeline_frame_nr, &wrong_entity_path) - .is_none()); - - // insert row in the past, these should be visible - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::default() - .with(timeline_log_time, now_minus_one) - .with(timeline_frame_nr, 32), - entity_path.clone(), - [&[point] as _], - )?; - store.insert_row(&row).unwrap(); - - assert!(store - .entity_min_time(&timeline_wrong_name, &entity_path) - .is_none()); - assert!(store - .entity_min_time(&timeline_wrong_kind, &entity_path) - .is_none()); - assert_eq!( - store.entity_min_time(&timeline_frame_nr, &entity_path), - Some(TimeInt::new_temporal(32)) - ); - assert_eq!( - store.entity_min_time(&timeline_log_time, &entity_path), - Some(TimeInt::try_from(now_minus_one).unwrap()) - ); - assert!(store - .entity_min_time(&timeline_frame_nr, &wrong_entity_path) - .is_none()); - - Ok(()) -} diff --git a/crates/re_data_store/tests/data_store.rs b/crates/re_data_store/tests/data_store.rs deleted file mode 100644 index 999202c6072b..000000000000 --- a/crates/re_data_store/tests/data_store.rs +++ /dev/null @@ -1,831 +0,0 @@ -//! Straightforward high-level API tests. -//! -//! Testing & demonstrating expected usage of the datastore APIs, no funny stuff. - -// https://github.com/rust-lang/rust-clippy/issues/10011 -#![cfg(test)] - -use itertools::Itertools; -use rand::Rng; -use re_data_store::{ - test_row, - test_util::{insert_table_with_retries, sanity_unwrap}, - DataStore, DataStoreConfig, DataStoreStats, GarbageCollectionOptions, GarbageCollectionTarget, - LatestAtQuery, RangeQuery, ResolvedTimeRange, TimeInt, -}; -use re_log_types::{ - build_frame_nr, - example_components::{MyColor, MyIndex, MyPoint}, - DataRow, DataTable, EntityPath, TableId, TimeType, Timeline, -}; -use re_types::testing::{build_some_large_structs, LargeStruct}; -use re_types::ComponentNameSet; -use re_types_core::{ComponentName, Loggable as _}; - -// --- LatestComponentsAt --- - -#[test] -fn all_components() { - re_log::setup_logging(); - - let entity_path = EntityPath::from("this/that"); - - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - - let assert_latest_components_at = - |store: &mut DataStore, entity_path: &EntityPath, expected: Option<&[ComponentName]>| { - // Stress test save-to-disk & load-from-disk - let mut store2 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - store.config().clone(), - ); - for table in store.to_data_tables(None) { - insert_table_with_retries(&mut store2, &table); - } - - // Stress test GC - store2.gc(&GarbageCollectionOptions::gc_everything()); - for table in store.to_data_tables(None) { - insert_table_with_retries(&mut store2, &table); - } - - let store = store2; - let timeline = Timeline::new("frame_nr", TimeType::Sequence); - - let component_names = store.all_components(&timeline, entity_path); - - let expected_component_names = expected.map(|expected| { - let expected: ComponentNameSet = expected.iter().copied().collect(); - expected - }); - - store.sort_indices_if_needed(); - assert_eq!( - expected_component_names, component_names, - "expected to find {expected_component_names:?}, found 
{component_names:?} instead\n{store}", - ); - }; - - // One big bucket, demonstrating the easier-to-reason-about cases. - { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig { - indexed_bucket_num_rows: u64::MAX, - ..Default::default() - }, - ); - - let components_a = &[ - MyColor::name(), // added by test, static - LargeStruct::name(), // added by test - ]; - - let components_b = &[ - MyColor::name(), // added by test, static - MyPoint::name(), // added by test - LargeStruct::name(), // added by test - ]; - - let row = test_row!(entity_path => [MyColor::from_iter(0..2)]); - store.insert_row(&row).unwrap(); - - let row = - test_row!(entity_path @ [build_frame_nr(frame1)] => [build_some_large_structs(2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_a)); - - let row = test_row!(entity_path @ [ - build_frame_nr(frame2), - ] => [build_some_large_structs(2), MyPoint::from_iter(0..2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_b)); - - sanity_unwrap(&store); - } - - // Tiny buckets, demonstrating the harder-to-reason-about cases. - { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig { - indexed_bucket_num_rows: 0, - ..Default::default() - }, - ); - - // ┌──────────┬─────────────┬────────┬───────────┬──────────┐ - // │ frame_nr ┆ LargeStruct ┆ row_id ┆ insert_id ┆ instance │ - // ╞══════════╪═════════════╪════════╪═══════════╪══════════╡ - // │ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - // └──────────┴─────────────┴────────┴───────────┴──────────┘ - // ┌──────────┬─────────────┬─────────┬────────┬───────────┬──────────┐ - // │ frame_nr ┆ LargeStruct ┆ point2d ┆ row_id ┆ insert_id ┆ instance │ - // ╞══════════╪═════════════╪═════════╪════════╪═══════════╪══════════╡ - // │ 2 ┆ - ┆ - ┆ 2 ┆ 2 ┆ 2 │ - // ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤ - // │ 3 ┆ - ┆ 1 ┆ 3 ┆ 3 ┆ 1 │ - // └──────────┴─────────────┴─────────┴────────┴───────────┴──────────┘ - - let components_a = &[ - MyColor::name(), // added by test, static - LargeStruct::name(), // added by test - ]; - - let components_b = &[ - MyColor::name(), // added by test, static - LargeStruct::name(), // ⚠ inherited before the buckets got split apart! - MyPoint::name(), // added by test - ]; - - let row = test_row!(entity_path => [MyColor::from_iter(0..2)]); - store.insert_row(&row).unwrap(); - - let row = - test_row!(entity_path @ [build_frame_nr(frame1)] => [build_some_large_structs(2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_a)); - - let row = test_row!(entity_path @ [build_frame_nr(frame3)] => [MyPoint::from_iter(0..2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_b)); - - sanity_unwrap(&store); - } - - // Tiny buckets and tricky splits, demonstrating a case that is not only extremely hard to - // reason about, it is technically incorrect. 
- { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig { - indexed_bucket_num_rows: 0, - ..Default::default() - }, - ); - - // ┌──────────┬─────────────┬─────────┬────────┬───────────┬──────────┐ - // │ frame_nr ┆ LargeStruct ┆ point2d ┆ row_id ┆ insert_id ┆ instance │ - // ╞══════════╪═════════════╪═════════╪════════╪═══════════╪══════════╡ - // │ 1 ┆ - ┆ 1 ┆ 4 ┆ 4 ┆ 1 │ - // ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤ - // │ 2 ┆ 1 ┆ - ┆ 1 ┆ 1 ┆ 1 │ - // └──────────┴─────────────┴─────────┴────────┴───────────┴──────────┘ - // ┌──────────┬─────────────┬────────┬───────────┬──────────┐ - // │ frame_nr ┆ LargeStruct ┆ row_id ┆ insert_id ┆ instance │ - // ╞══════════╪═════════════╪════════╪═══════════╪══════════╡ - // │ 3 ┆ 2 ┆ 2 ┆ 2 ┆ 1 │ - // ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤ - // │ 4 ┆ 3 ┆ 3 ┆ 3 ┆ 1 │ - // └──────────┴─────────────┴────────┴───────────┴──────────┘ - - let components_a = &[ - MyColor::name(), // added by test, static - LargeStruct::name(), // added by test - ]; - - let components_b = &[ - MyColor::name(), // added by test, static - MyPoint::name(), // added by test but not contained in the second bucket - LargeStruct::name(), // added by test - ]; - - let row = test_row!(entity_path => [MyColor::from_iter(0..2)]); - store.insert_row(&row).unwrap(); - - let row = - test_row!(entity_path @ [build_frame_nr(frame2)] => [build_some_large_structs(2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_a)); - - let row = - test_row!(entity_path @ [build_frame_nr(frame3)] => [build_some_large_structs(2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_a)); - - let row = - test_row!(entity_path @ [build_frame_nr(frame4)] => [build_some_large_structs(2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_a)); - - let row = test_row!(entity_path @ [build_frame_nr(frame1)] => [MyPoint::from_iter(0..2)]); - store.insert_row(&row).unwrap(); - - assert_latest_components_at(&mut store, &entity_path, Some(components_b)); - - sanity_unwrap(&store); - } -} - -// --- LatestAt --- - -#[test] -fn latest_at() { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - latest_at_impl(&mut store); - } -} - -fn latest_at_impl(store: &mut DataStore) { - re_log::setup_logging(); - - let entity_path = EntityPath::from("this/that"); - - let frame0 = TimeInt::new_temporal(0); - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - - let (instances1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); - let row1 = test_row!(entity_path @ [build_frame_nr(frame1)] => [instances1.clone(), colors1]); - - let positions2 = MyPoint::from_iter(0..3); - let row2 = test_row!(entity_path @ [build_frame_nr(frame2)] => [instances1, positions2]); - - let points3 = MyPoint::from_iter(0..10); - let row3 = test_row!(entity_path @ [build_frame_nr(frame3)] => [points3]); - - let colors4 = MyColor::from_iter(0..5); - let row4 = test_row!(entity_path @ [build_frame_nr(frame4)] => [colors4]); - - // injecting some static colors - let colors5 = 
MyColor::from_iter(0..3); - let row5 = test_row!(entity_path => [colors5]); - - insert_table_with_retries( - store, - &DataTable::from_rows( - TableId::new(), - [ - row1.clone(), - row2.clone(), - row3.clone(), - row4.clone(), - row5.clone(), - ], - ), - ); - - // Stress test save-to-disk & load-from-disk - let mut store2 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - store.config().clone(), - ); - for table in store.to_data_tables(None) { - insert_table_with_retries(&mut store2, &table); - } - // Stress test GC - store2.gc(&GarbageCollectionOptions::gc_everything()); - for table in store.to_data_tables(None) { - insert_table_with_retries(&mut store2, &table); - } - let store = store2; - - sanity_unwrap(&store); - - let assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, &DataRow)]| { - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - - for (component_name, expected) in rows { - let (_, _, cells) = store - .latest_at::<1>( - &LatestAtQuery::new(timeline_frame_nr, frame_nr), - &entity_path, - *component_name, - &[*component_name], - ) - .unwrap(); - - let expected = expected - .cells - .iter() - .filter(|cell| cell.component_name() == *component_name) - .collect_vec(); - let actual = cells.iter().flatten().collect_vec(); - assert_eq!(expected, actual); - } - }; - - assert_latest_components( - frame0, - &[ - (MyColor::name(), &row5), // static - ], - ); - assert_latest_components( - frame1, - &[ - (MyColor::name(), &row5), // static - ], - ); - assert_latest_components( - frame2, - &[(MyColor::name(), &row5), (MyPoint::name(), &row2)], - ); - assert_latest_components( - frame3, - &[(MyColor::name(), &row5), (MyPoint::name(), &row3)], - ); - assert_latest_components( - frame4, - &[(MyColor::name(), &row5), (MyPoint::name(), &row3)], - ); -} - -// --- Range --- - -#[test] -fn range() { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - range_impl(&mut store); - } -} - -fn range_impl(store: &mut DataStore) { - re_log::setup_logging(); - - let entity_path = EntityPath::from("this/that"); - - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - let frame5 = TimeInt::new_temporal(5); - - let insts1 = MyIndex::from_iter(0..3); - let colors1 = MyColor::from_iter(0..3); - let row1 = test_row!(entity_path @ [build_frame_nr(frame1)] => [insts1.clone(), colors1]); - - let positions2 = MyPoint::from_iter(0..3); - let row2 = test_row!(entity_path @ [build_frame_nr(frame2)] => [insts1, positions2]); - - let points3 = MyPoint::from_iter(0..10); - let row3 = test_row!(entity_path @ [build_frame_nr(frame3)] => [points3]); - - let insts4_1 = MyIndex::from_iter(20..25); - let colors4_1 = MyColor::from_iter(0..5); - let row4_1 = test_row!(entity_path @ [build_frame_nr(frame4)] => [insts4_1, colors4_1]); - - let insts4_2 = MyIndex::from_iter(25..30); - let colors4_2 = MyColor::from_iter(0..5); - let row4_2 = test_row!(entity_path @ [build_frame_nr(frame4)] => [insts4_2.clone(), colors4_2]); - - let points4_25 = MyPoint::from_iter(0..5); - let row4_25 = test_row!(entity_path @ [build_frame_nr(frame4)] => [insts4_2, points4_25]); - - let insts4_3 = MyIndex::from_iter(30..35); - let colors4_3 = MyColor::from_iter(0..5); - let row4_3 = test_row!(entity_path @ 
[build_frame_nr(frame4)] => [insts4_3.clone(), colors4_3]); - - let points4_4 = MyPoint::from_iter(0..5); - let row4_4 = test_row!(entity_path @ [build_frame_nr(frame4)] => [insts4_3, points4_4]); - - // injecting some static colors - let colors5 = MyColor::from_iter(0..8); - let row5 = test_row!(entity_path => [colors5]); - - insert_table_with_retries( - store, - &DataTable::from_rows( - TableId::new(), - [ - row1.clone(), - row2.clone(), - row3.clone(), - row4_1.clone(), - row4_2.clone(), - row4_25.clone(), - row4_3.clone(), - row4_4.clone(), - row5.clone(), - ], - ), - ); - - sanity_unwrap(store); - - // Each entry in `rows_at_times` corresponds to a dataframe that's expected to be returned - // by the range query. - // A single timepoint might have several of those! That's one of the behaviors specific to - // range queries. - #[allow(clippy::type_complexity)] - let assert_range_components = - |time_range: ResolvedTimeRange, - components: [ComponentName; 2], - rows_at_times: &[(TimeInt, &[(ComponentName, &DataRow)])]| { - // Stress test save-to-disk & load-from-disk - let mut store2 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - store.config().clone(), - ); - for table in store.to_data_tables(None) { - insert_table_with_retries(&mut store2, &table); - } - store2.gc(&GarbageCollectionOptions::gc_everything()); - for table in store.to_data_tables(None) { - insert_table_with_retries(&mut store2, &table); - } - let store = store2; - - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - - store.sort_indices_if_needed(); // for assertions below - - let components = [components[0], components[1]]; - let query = RangeQuery::new(timeline_frame_nr, time_range); - let results = store.range(&query, &entity_path, components).collect_vec(); - - let mut results_processed = 0usize; - for (i, (time, _, cells)) in results.into_iter().enumerate() { - let (expected_time, expected_rows) = rows_at_times[i]; - assert_eq!(expected_time, time); - - for (component_name, expected) in expected_rows { - let expected = expected - .cells - .iter() - .filter(|cell| cell.component_name() == *component_name) - .collect_vec(); - let actual = cells.iter().flatten().collect_vec(); - assert_eq!(expected, actual); - - results_processed += 1; - } - } - - let results_processed_expected = rows_at_times.len(); - assert_eq!(results_processed_expected, results_processed); - }; - - // TODO(cmc): bring back some log_time scenarios - - // Unit ranges (multi-PoV) - - assert_range_components( - ResolvedTimeRange::new(frame1, frame1), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), &row5)]), // - ], - ); - assert_range_components( - ResolvedTimeRange::new(frame2, frame2), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), &row5)]), // - (frame2, &[(MyPoint::name(), &row2)]), // - ], - ); - assert_range_components( - ResolvedTimeRange::new(frame3, frame3), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), &row5)]), // - (frame3, &[(MyPoint::name(), &row3)]), // - ], - ); - assert_range_components( - ResolvedTimeRange::new(frame4, frame4), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), &row5)]), // - (frame4, &[(MyPoint::name(), &row4_25)]), - (frame4, &[(MyPoint::name(), &row4_4)]), - ], - ); - assert_range_components( - ResolvedTimeRange::new(frame5, frame5), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), 
&row5)]), // - ], - ); - - // Full range (multi-PoV) - - assert_range_components( - ResolvedTimeRange::new(frame1, frame5), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), &row5)]), // - (frame2, &[(MyPoint::name(), &row2)]), // - (frame3, &[(MyPoint::name(), &row3)]), // - (frame4, &[(MyPoint::name(), &row4_25)]), - (frame4, &[(MyPoint::name(), &row4_4)]), - ], - ); - - // Infinite range (multi-PoV) - - assert_range_components( - ResolvedTimeRange::new(TimeInt::MIN, TimeInt::MAX), - [MyColor::name(), MyPoint::name()], - &[ - (TimeInt::STATIC, &[(MyColor::name(), &row5)]), // - (frame2, &[(MyPoint::name(), &row2)]), // - (frame3, &[(MyPoint::name(), &row3)]), // - (frame4, &[(MyPoint::name(), &row4_25)]), - (frame4, &[(MyPoint::name(), &row4_4)]), - ], - ); -} - -// --- GC --- - -#[test] -fn gc() { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - gc_impl(&mut store); - } -} - -fn gc_impl(store: &mut DataStore) { - let mut rng = rand::thread_rng(); - - for _ in 0..2 { - let num_ents = 10; - for i in 0..num_ents { - let entity_path = EntityPath::from(format!("this/that/{i}")); - - let num_frames = rng.gen_range(0..=100); - let frames = (0..num_frames).filter(|_| rand::thread_rng().gen()); - for frame_nr in frames { - let num_instances = rng.gen_range(0..=1_000); - let row = test_row!(entity_path @ [ - build_frame_nr(frame_nr) - ] => [ - build_some_large_structs(num_instances as _), - ]); - store.insert_row(&row).unwrap(); - } - } - - sanity_unwrap(store); - _ = store.to_data_table(); // simple way of checking that everything is still readable - - let stats = DataStoreStats::from_store(store); - - let (store_events, stats_diff) = store.gc(&GarbageCollectionOptions { - target: GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0), - protect_latest: 0, - purge_empty_tables: false, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }); - for event in store_events { - assert!(store.row_metadata(&event.row_id).is_none()); - } - - // NOTE: only temporal data and row metadata get purged! 
- let num_bytes_dropped = - (stats_diff.temporal.num_bytes + stats_diff.metadata_registry.num_bytes) as f64; - let num_bytes_dropped_expected_min = - (stats.temporal.num_bytes + stats.metadata_registry.num_bytes) as f64 * 0.95 / 3.0; - let num_bytes_dropped_expected_max = - (stats.temporal.num_bytes + stats.metadata_registry.num_bytes) as f64 * 1.05 / 3.0; - assert!( - num_bytes_dropped_expected_min <= num_bytes_dropped - && num_bytes_dropped <= num_bytes_dropped_expected_max, - "{} <= {} <= {}", - re_format::format_bytes(num_bytes_dropped_expected_min), - re_format::format_bytes(num_bytes_dropped), - re_format::format_bytes(num_bytes_dropped_expected_max), - ); - } -} - -#[test] -fn protected_gc() { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - protected_gc_impl(&mut store); - } -} - -fn protected_gc_impl(store: &mut DataStore) { - re_log::setup_logging(); - - let entity_path = EntityPath::from("this/that"); - - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - - let (instances1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); - let row1 = test_row!(entity_path @ [build_frame_nr(frame1)] => [instances1.clone(), colors1]); - - let positions2 = MyPoint::from_iter(0..3); - let row2 = test_row!(entity_path @ [build_frame_nr(frame2)] => [instances1, positions2]); - - let points3 = MyPoint::from_iter(0..10); - let row3 = test_row!(entity_path @ [build_frame_nr(frame3)] => [points3]); - - let colors4 = MyColor::from_iter(0..5); - let row4 = test_row!(entity_path @ [build_frame_nr(frame4)] => [colors4]); - - store.insert_row(&row1).unwrap(); - store.insert_row(&row2).unwrap(); - store.insert_row(&row3).unwrap(); - store.insert_row(&row4).unwrap(); - - // Re-insert row1 and row2 as static data as well - let mut static_table = - DataTable::from_rows(TableId::new(), [row1.clone().next(), row2.clone().next()]); - static_table.col_timelines = Default::default(); - insert_table_with_retries(store, &static_table); - - store.gc(&GarbageCollectionOptions { - target: GarbageCollectionTarget::Everything, - protect_latest: 1, - purge_empty_tables: true, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }); - - let assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, &DataRow)]| { - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - - for (component_name, expected) in rows { - let (_, _, cells) = store - .latest_at::<1>( - &LatestAtQuery::new(timeline_frame_nr, frame_nr), - &entity_path, - *component_name, - &[*component_name], - ) - .unwrap(); - - let expected = expected - .cells - .iter() - .filter(|cell| cell.component_name() == *component_name) - .collect_vec(); - let actual = cells.iter().flatten().collect_vec(); - assert_eq!(expected, actual); - } - }; - - // The static data was preserved - assert_latest_components( - TimeInt::STATIC, - &[(MyColor::name(), &row1), (MyPoint::name(), &row2)], // static - ); - - assert_latest_components( - frame3, - &[ - (MyColor::name(), &row1), // static - (MyPoint::name(), &row2), // static - ], - ); - - assert_latest_components( - frame4, - &[ - (MyColor::name(), &row1), // static - (MyPoint::name(), &row2), // static - ], - 
); -} - -#[test] -fn protected_gc_clear() { - re_log::setup_logging(); - - for config in re_data_store::test_util::all_configs() { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - protected_gc_clear_impl(&mut store); - } -} - -fn protected_gc_clear_impl(store: &mut DataStore) { - re_log::setup_logging(); - - let entity_path = EntityPath::from("this/that"); - - let frame0 = TimeInt::new_temporal(0); - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - - let (instances1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); - let row1 = test_row!(entity_path @ [build_frame_nr(frame1)] => [instances1.clone(), colors1]); - - let positions2 = MyPoint::from_iter(0..3); - let row2 = test_row!(entity_path @ [build_frame_nr(frame2)] => [instances1, positions2]); - - let colors2 = MyColor::from_iter(0..0); - let row3 = test_row!(entity_path @ [build_frame_nr(frame3)] => [colors2]); - - let points4 = MyPoint::from_iter(0..0); - let row4 = test_row!(entity_path @ [build_frame_nr(frame4)] => [points4]); - - // Insert the 3 rows as static - let mut static_table = - DataTable::from_rows(TableId::new(), [row1.clone(), row2.clone(), row3.clone()]); - static_table.col_timelines = Default::default(); - insert_table_with_retries(store, &static_table); - - store.gc(&GarbageCollectionOptions { - target: GarbageCollectionTarget::Everything, - protect_latest: 1, - purge_empty_tables: true, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }); - - let assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, &DataRow)]| { - let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence); - - for (component_name, expected) in rows { - let (_, _, cells) = store - .latest_at::<1>( - &LatestAtQuery::new(timeline_frame_nr, frame_nr), - &entity_path, - *component_name, - &[*component_name], - ) - .unwrap(); - - let expected = expected - .cells - .iter() - .filter(|cell| cell.component_name() == *component_name) - .collect_vec(); - let actual = cells.iter().flatten().collect_vec(); - assert_eq!(expected, actual); - } - }; - - assert_latest_components( - frame0, - &[(MyColor::name(), &row3), (MyPoint::name(), &row2)], - ); - - // The 3 static cells should still be around. - let stats = DataStoreStats::from_store(store); - assert_eq!(stats.static_tables.num_rows, 1); - - // Now erase points and GC again - let mut static_table = DataTable::from_rows(TableId::new(), [row4]); - static_table.col_timelines = Default::default(); - insert_table_with_retries(store, &static_table); - - store.gc(&GarbageCollectionOptions { - target: GarbageCollectionTarget::Everything, - protect_latest: 1, - purge_empty_tables: true, - dont_protect_components: Default::default(), - dont_protect_timelines: Default::default(), - enable_batching: false, - time_budget: std::time::Duration::MAX, - }); - - let stats = DataStoreStats::from_store(store); - assert_eq!(stats.static_tables.num_rows, 1); -} diff --git a/crates/re_data_store/tests/dump.rs b/crates/re_data_store/tests/dump.rs deleted file mode 100644 index 7b8b91c7fa3a..000000000000 --- a/crates/re_data_store/tests/dump.rs +++ /dev/null @@ -1,369 +0,0 @@ -//! Dumping a datastore to log messages and back. 
- -// https://github.com/rust-lang/rust-clippy/issues/10011 -#![cfg(test)] - -use itertools::Itertools; -use re_data_store::{ - test_row, - test_util::{insert_table_with_retries, sanity_unwrap}, - DataStore, DataStoreStats, GarbageCollectionOptions, ResolvedTimeRange, TimeInt, Timeline, -}; -use re_log_types::{ - build_frame_nr, build_log_time, - example_components::{MyColor, MyIndex, MyPoint}, - DataRow, DataTable, EntityPath, RowId, TableId, TimePoint, -}; - -// --- - -// Panic on RowId clash. -fn insert_table(store: &mut DataStore, table: &DataTable) { - for row in table.to_rows() { - let row = row.unwrap(); - store.insert_row(&row).unwrap(); - } -} - -// --- - -/// Allows adding more data to the same `RowId`. -#[derive(Default)] -struct RowSet(ahash::HashMap); - -impl RowSet { - fn insert_tables(&mut self, tables: impl Iterator) { - for table in tables { - self.insert_table(&table); - } - } - - fn insert_table(&mut self, table: &DataTable) { - for row in table.to_rows() { - self.insert_row(row.unwrap()); - } - } - - fn insert_row(&mut self, row: re_log_types::DataRow) { - match self.0.entry(row.row_id()) { - std::collections::hash_map::Entry::Occupied(mut entry) => { - assert_eq!(entry.get().entity_path(), row.entity_path()); - assert_eq!(entry.get().cells(), row.cells()); - for (timeline, time) in row.timepoint() { - entry.get_mut().timepoint.insert(*timeline, *time); - } - } - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(row); - } - } - } - - fn insert_into(self, store: &mut DataStore) { - let mut rows = self.0.into_values().collect::>(); - rows.sort_by_key(|row| (row.timepoint.clone(), row.row_id)); - for row in rows { - store.insert_row(&row).unwrap(); - } - } -} - -// --- Dump --- - -#[test] -fn data_store_dump() { - re_log::setup_logging(); - - for mut config in re_data_store::test_util::all_configs() { - // NOTE: insert IDs aren't serialized and can be different across runs. - config.store_insert_ids = false; - - let mut store1 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - let mut store2 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - let mut store3 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - - data_store_dump_impl(&mut store1, &mut store2, &mut store3); - - // stress-test GC impl - store1.gc(&GarbageCollectionOptions::gc_everything()); - store2.gc(&GarbageCollectionOptions::gc_everything()); - store3.gc(&GarbageCollectionOptions::gc_everything()); - - data_store_dump_impl(&mut store1, &mut store2, &mut store3); - } -} - -fn data_store_dump_impl(store1: &mut DataStore, store2: &mut DataStore, store3: &mut DataStore) { - let entity_paths = ["this/that", "other", "yet/another/one"]; - let tables = entity_paths - .iter() - .map(|entity_path| create_insert_table(*entity_path)) - .collect_vec(); - - // Fill the first store. - for table in &tables { - // insert temporal - insert_table(store1, table); - - // insert static - let mut table_static = table.clone(); - table_static.col_timelines = Default::default(); - insert_table_with_retries(store1, &table_static); - } - sanity_unwrap(store1); - - // Dump the first store into the second one. - { - // We use a RowSet instead of a DataTable to handle duplicate RowIds. 
- let mut row_set = RowSet::default(); - row_set.insert_tables(store1.to_data_tables(None)); - row_set.insert_into(store2); - sanity_unwrap(store2); - } - - // Dump the second store into the third one. - { - let mut row_set = RowSet::default(); - row_set.insert_tables(store2.to_data_tables(None)); - row_set.insert_into(store3); - sanity_unwrap(store3); - } - - { - let table_id = TableId::new(); // Reuse TableId so == works - let table1 = DataTable::from_rows(table_id, store1.to_rows().unwrap()); - let table2 = DataTable::from_rows(table_id, store2.to_rows().unwrap()); - let table3 = DataTable::from_rows(table_id, store3.to_rows().unwrap()); - assert!( - table1 == table2, - "First & second stores differ:\n{table1}\n{table2}" - ); - assert!( - table1 == table3, - "First & third stores differ:\n{table1}\n{table3}" - ); - } - - let store1_stats = DataStoreStats::from_store(store1); - let store2_stats = DataStoreStats::from_store(store2); - let store3_stats = DataStoreStats::from_store(store3); - assert!( - store1_stats.temporal.num_bytes <= store2_stats.temporal.num_bytes - && store1_stats.static_tables.num_bytes <= store2_stats.static_tables.num_bytes, - "First store should have <= amount of data of second store:\n\ - {store1_stats:#?}\n{store2_stats:#?}" - ); - assert!( - store2_stats.temporal.num_bytes <= store3_stats.temporal.num_bytes - && store2_stats.static_tables.num_bytes <= store3_stats.static_tables.num_bytes, - "Second store should have <= amount of data of third store:\n\ - {store2_stats:#?}\n{store3_stats:#?}" - ); -} - -// --- Time-based filtering --- - -#[test] -fn data_store_dump_filtered() { - re_log::setup_logging(); - - for mut config in re_data_store::test_util::all_configs() { - // NOTE: insert IDs aren't serialized and can be different across runs. - config.store_insert_ids = false; - - let mut store1 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - let mut store2 = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config.clone(), - ); - - data_store_dump_filtered_impl(&mut store1, &mut store2); - - // stress-test GC impl - store1.gc(&GarbageCollectionOptions::gc_everything()); - store2.gc(&GarbageCollectionOptions::gc_everything()); - - data_store_dump_filtered_impl(&mut store1, &mut store2); - } -} - -fn data_store_dump_filtered_impl(store1: &mut DataStore, store2: &mut DataStore) { - let timeline_frame_nr = Timeline::new_sequence("frame_nr"); - let timeline_log_time = Timeline::log_time(); - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - - let entity_paths = ["this/that", "other", "yet/another/one"]; - let tables = entity_paths - .iter() - .map(|entity_path| create_insert_table(*entity_path)) - .collect_vec(); - - // Fill the first store. - for table in &tables { - insert_table(store1, table); - } - sanity_unwrap(store1); - - // We use a RowSet instead of a DataTable to handle duplicate RowIds. - let mut row_set = RowSet::default(); - - // Dump frame1 from the first store. - row_set.insert_tables( - store1.to_data_tables((timeline_frame_nr, ResolvedTimeRange::new(frame1, frame1)).into()), - ); - // Dump frame2 from the first store. - row_set.insert_tables( - store1.to_data_tables((timeline_frame_nr, ResolvedTimeRange::new(frame2, frame2)).into()), - ); - // Dump frame3 from the first store. 
- row_set.insert_tables( - store1.to_data_tables((timeline_frame_nr, ResolvedTimeRange::new(frame3, frame3)).into()), - ); - // Dump frame3 _from the other timeline_, from the first store. - // This will produce the same RowIds again! - row_set.insert_tables( - store1.to_data_tables((timeline_log_time, ResolvedTimeRange::new(frame3, frame3)).into()), - ); - // Dump frame4 from the first store. - row_set.insert_tables( - store1.to_data_tables((timeline_frame_nr, ResolvedTimeRange::new(frame4, frame4)).into()), - ); - - row_set.insert_into(store2); - sanity_unwrap(store2); - - { - let table_id = TableId::new(); // Reuse TableId so == works - let table1 = DataTable::from_rows(table_id, store1.to_rows().unwrap()); - let table2 = DataTable::from_rows(table_id, store2.to_rows().unwrap()); - assert!( - table1 == table2, - "First & second stores differ:\n{table1}\n{table2}" - ); - } - - let store1_stats = DataStoreStats::from_store(store1); - let store2_stats = DataStoreStats::from_store(store2); - assert!( - store1_stats.temporal.num_bytes <= store2_stats.temporal.num_bytes - && store1_stats.static_tables.num_bytes <= store2_stats.static_tables.num_bytes, - "First store should have <= amount of data of second store:\n\ - {store1_stats:#?}\n{store2_stats:#?}" - ); -} - -// --- - -fn create_insert_table(entity_path: impl Into) -> DataTable { - let entity_path = entity_path.into(); - - let timeless = TimePoint::default(); - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - let frame4 = TimeInt::new_temporal(4); - - let (instances1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); - let row1 = test_row!(entity_path @ [ - build_frame_nr(frame1), - ] => [instances1.clone(), colors1]); - - let positions2 = MyPoint::from_iter(0..2); - let row2 = test_row!(entity_path @ [ - build_frame_nr(frame2), - ] => [instances1, positions2.clone()]); - - let positions3 = MyPoint::from_iter(0..10); - let row3 = test_row!(entity_path @ [ - build_log_time(frame3.into()) /* ! */, build_frame_nr(frame3), - ] => [positions3]); - - let colors4 = MyColor::from_iter(0..5); - let row4 = test_row!(entity_path @ [ - build_frame_nr(frame4), - ] => [colors4.clone()]); - - let row0 = test_row!(entity_path @ timeless => [positions2, colors4]); - - let mut table = DataTable::from_rows(TableId::new(), [row0, row1, row2, row3, row4]); - table.compute_all_size_bytes(); - - table -} - -// See: https://github.com/rerun-io/rerun/pull/2007 -#[test] -fn data_store_dump_empty_column() { - re_log::setup_logging(); - - // Split tables on 1 row - let mut config = re_data_store::DataStoreConfig { - indexed_bucket_num_rows: 1, - ..re_data_store::DataStoreConfig::DEFAULT - }; - config.store_insert_ids = false; - - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - config, - ); - - data_store_dump_empty_column_impl(&mut store); -} - -fn data_store_dump_empty_column_impl(store: &mut DataStore) { - let entity_path: EntityPath = "points".into(); - let frame1 = TimeInt::new_temporal(1); - let frame2 = TimeInt::new_temporal(2); - let frame3 = TimeInt::new_temporal(3); - - // Start by inserting a table with 2 rows, one with colors, and one with points. 
- { - let (instances1, colors1) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); - let row1 = test_row!(entity_path @ [ - build_frame_nr(frame1), - ] => [instances1, colors1]); - - let (instances2, positions2) = (MyIndex::from_iter(0..3), MyPoint::from_iter(0..2)); - let row2 = test_row!(entity_path @ [ - build_frame_nr(frame2), - ] => [instances2, positions2]); - let mut table = DataTable::from_rows(TableId::new(), [row1, row2]); - table.compute_all_size_bytes(); - insert_table_with_retries(store, &table); - } - - // Now insert another table with points only. - { - let (instances3, positions3) = (MyIndex::from_iter(0..3), MyColor::from_iter(0..3)); - let row3 = test_row!(entity_path @ [ - build_frame_nr(frame3), - ] => [instances3, positions3]); - let mut table = DataTable::from_rows(TableId::new(), [row3]); - table.compute_all_size_bytes(); - insert_table_with_retries(store, &table); - } - - let data_msgs: Result, _> = store - .to_data_tables(None) - .map(|table| table.to_arrow_msg()) - .collect(); - - // Should end up with 2 tables - assert_eq!(data_msgs.unwrap().len(), 2); -} diff --git a/crates/re_data_store/tests/internals.rs b/crates/re_data_store/tests/internals.rs deleted file mode 100644 index fbd44333ded9..000000000000 --- a/crates/re_data_store/tests/internals.rs +++ /dev/null @@ -1,142 +0,0 @@ -//! Tests running assertions on the internal state of the datastore. -//! -//! They're awful, but sometimes you just have to… - -// https://github.com/rust-lang/rust-clippy/issues/10011 -#![cfg(test)] - -use re_data_store::{DataStore, DataStoreConfig}; -use re_log_types::{ - build_frame_nr, example_components::MyIndex, DataRow, EntityPath, RowId, TimePoint, -}; - -// --- Internals --- - -// TODO(cmc): One should _never_ run assertions on the internal state of the datastore, this -// is a recipe for disaster. -// -// The contract that needs to be asserted here, from the point of view of the actual user, -// is performance: getting the datastore into a pathological topology should show up in -// integration query benchmarks. 
-// -// In the current state of things, though, it is much easier to test for it that way… so we -// make an exception, for now… -#[test] -fn pathological_bucket_topology() { - re_log::setup_logging(); - - let mut store_forward = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig { - indexed_bucket_num_rows: 10, - ..Default::default() - }, - ); - let mut store_backward = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig { - indexed_bucket_num_rows: 10, - ..Default::default() - }, - ); - - fn store_repeated_frame( - frame_nr: i64, - num: usize, - store_forward: &mut DataStore, - store_backward: &mut DataStore, - ) { - let entity_path = EntityPath::from("this/that"); - let num_instances = 1; - - let timepoint = TimePoint::from([build_frame_nr(frame_nr)]); - for _ in 0..num { - let row = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint.clone(), - MyIndex::from_iter(0..num_instances), - ) - .unwrap(); - store_forward.insert_row(&row).unwrap(); - - let row = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint.clone(), - MyIndex::from_iter(0..num_instances), - ) - .unwrap(); - store_backward.insert_row(&row).unwrap(); - } - } - - fn store_frame_range( - range: core::ops::RangeInclusive, - store_forward: &mut DataStore, - store_backward: &mut DataStore, - ) { - let entity_path = EntityPath::from("this/that"); - let num_instances = 1; - - let rows = range - .map(|frame_nr| { - let timepoint = TimePoint::from([build_frame_nr(frame_nr)]); - DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint, - MyIndex::from_iter(0..num_instances), - ) - .unwrap() - }) - .collect::>(); - - for row in &rows { - store_forward.insert_row(row).unwrap(); - } - - rows.iter().rev().for_each(|row| { - store_backward.insert_row(row).unwrap(); - }); - } - - store_repeated_frame(1000, 10, &mut store_forward, &mut store_backward); - store_frame_range(970..=979, &mut store_forward, &mut store_backward); - store_frame_range(990..=999, &mut store_forward, &mut store_backward); - store_frame_range(980..=989, &mut store_forward, &mut store_backward); - store_repeated_frame(1000, 7, &mut store_forward, &mut store_backward); - store_frame_range(1000..=1009, &mut store_forward, &mut store_backward); - store_repeated_frame(975, 10, &mut store_forward, &mut store_backward); - - { - let num_buckets = store_forward - .iter_indices() - .flat_map(|(_, table)| table.buckets.values()) - .count(); - assert_eq!( - 7usize, - num_buckets, - "pathological topology (forward): {}", - { - store_forward.sort_indices_if_needed(); - store_forward - } - ); - } - { - let num_buckets = store_backward - .iter_indices() - .flat_map(|(_, table)| table.buckets.values()) - .count(); - assert_eq!( - 8usize, - num_buckets, - "pathological topology (backward): {}", - { - store_backward.sort_indices_if_needed(); - store_backward - } - ); - } -} diff --git a/crates/re_data_store/tests/memory_test.rs b/crates/re_data_store/tests/memory_test.rs deleted file mode 100644 index e29a262ac0b5..000000000000 --- a/crates/re_data_store/tests/memory_test.rs +++ /dev/null @@ -1,106 +0,0 @@ -//! Measures the memory overhead of the data store. - -// https://github.com/rust-lang/rust-clippy/issues/10011 -#![cfg(test)] - -use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; - -thread_local! 
{ - static LIVE_BYTES_IN_THREAD: AtomicUsize = AtomicUsize::new(0); -} - -pub struct TrackingAllocator { - allocator: std::alloc::System, -} - -#[global_allocator] -pub static GLOBAL_ALLOCATOR: TrackingAllocator = TrackingAllocator { - allocator: std::alloc::System, -}; - -#[allow(unsafe_code)] -// SAFETY: -// We just do book-keeping and then let another allocator do all the actual work. -unsafe impl std::alloc::GlobalAlloc for TrackingAllocator { - #[allow(clippy::let_and_return)] - unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_add(layout.size(), Relaxed)); - - // SAFETY: - // Just deferring - unsafe { self.allocator.alloc(layout) } - } - - unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_sub(layout.size(), Relaxed)); - - // SAFETY: - // Just deferring - unsafe { self.allocator.dealloc(ptr, layout) }; - } -} - -fn live_bytes() -> usize { - LIVE_BYTES_IN_THREAD.with(|bytes| bytes.load(Relaxed)) -} - -/// Assumes all allocations are on the calling thread. -/// -/// The reason we use thread-local counting is so that -/// the counting won't be confused by any other running threads (e.g. other tests). -fn memory_use(run: impl Fn() -> R) -> usize { - let used_bytes_start = live_bytes(); - let ret = run(); - let bytes_used = live_bytes() - used_bytes_start; - drop(ret); - bytes_used -} - -// ---------------------------------------------------------------------------- - -use re_data_store::{DataStore, DataStoreConfig}; -use re_log_types::{DataRow, RowId, TimePoint, TimeType, Timeline}; -use re_types::components::Scalar; - -/// The memory overhead of storing many scalars in the store. -#[test] -fn scalar_memory_overhead() { - re_log::setup_logging(); - - const NUM_SCALARS: usize = 1024 * 1024; - - let total_mem_use = memory_use(|| { - let mut store = DataStore::new( - re_log_types::StoreId::random(re_log_types::StoreKind::Recording), - DataStoreConfig::default(), - ); - - for i in 0..NUM_SCALARS { - let entity_path = re_log_types::entity_path!("scalar"); - let timepoint = - TimePoint::default().with(Timeline::new("log_time", TimeType::Time), i as i64); - let row = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - timepoint, - vec![Scalar(i as f64)], - ) - .unwrap(); - store.insert_row(&row).unwrap(); - } - - store - }); - - insta::assert_debug_snapshot!( - "scalars_on_one_timeline", - [ - format!("{NUM_SCALARS} scalars"), - format!("{} in total", re_format::format_bytes(total_mem_use as _)), - format!( - "{} per row", - re_format::format_bytes(total_mem_use as f64 / NUM_SCALARS as f64) - ), - ] - ); -} diff --git a/crates/re_data_store/tests/snapshots/memory_test__scalars_on_one_timeline.snap b/crates/re_data_store/tests/snapshots/memory_test__scalars_on_one_timeline.snap deleted file mode 100644 index daab5fba8051..000000000000 --- a/crates/re_data_store/tests/snapshots/memory_test__scalars_on_one_timeline.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: crates/re_data_store/tests/memory_test.rs -expression: "[format!(\"{NUM_SCALARS} scalars\"),\n format!(\"{} in total\", re_format::format_bytes(total_mem_use as _)),\n format!(\"{} per row\",\n re_format::format_bytes(total_mem_use as f64 / NUM_SCALARS as\n f64))]" ---- -[ - "1048576 scalars", - "912 MiB in total", - "912 B per row", -] diff --git a/crates/re_data_ui/Cargo.toml b/crates/re_data_ui/Cargo.toml index 0ea1520dc85a..8b3002d4d0cc 100644 --- a/crates/re_data_ui/Cargo.toml +++ 
b/crates/re_data_ui/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_entity_db.workspace = true re_error.workspace = true re_format.workspace = true diff --git a/crates/re_data_ui/src/annotation_context.rs b/crates/re_data_ui/src/annotation_context.rs index d99e99c39782..e05e15b47081 100644 --- a/crates/re_data_ui/src/annotation_context.rs +++ b/crates/re_data_ui/src/annotation_context.rs @@ -17,7 +17,7 @@ impl crate::EntityDataUi for re_types::components::ClassId { ui: &mut egui::Ui, ui_layout: UiLayout, entity_path: &re_log_types::EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { let annotations = crate::annotations(ctx, query, entity_path); @@ -67,7 +67,7 @@ impl crate::EntityDataUi for re_types::components::KeypointId { ui: &mut egui::Ui, ui_layout: UiLayout, entity_path: &re_log_types::EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { if let Some(info) = annotation_info(ctx, entity_path, query, self.0) { @@ -91,7 +91,7 @@ impl crate::EntityDataUi for re_types::components::KeypointId { fn annotation_info( ctx: &re_viewer_context::ViewerContext<'_>, entity_path: &re_log_types::EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, keypoint_id: KeypointId, ) -> Option { // TODO(#5607): what should happen if the promise is still pending? @@ -113,7 +113,7 @@ impl DataUi for AnnotationContext { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { match ui_layout { diff --git a/crates/re_data_ui/src/app_id.rs b/crates/re_data_ui/src/app_id.rs index b14add0549bf..3489a2022ea6 100644 --- a/crates/re_data_ui/src/app_id.rs +++ b/crates/re_data_ui/src/app_id.rs @@ -12,7 +12,7 @@ impl crate::DataUi for ApplicationId { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { egui::Grid::new("application_id") diff --git a/crates/re_data_ui/src/blueprint_data.rs b/crates/re_data_ui/src/blueprint_data.rs index d010db2a1278..528cf928ae20 100644 --- a/crates/re_data_ui/src/blueprint_data.rs +++ b/crates/re_data_ui/src/blueprint_data.rs @@ -9,7 +9,7 @@ impl DataUi for BlueprintId { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, _ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { entity_path_button_to( diff --git a/crates/re_data_ui/src/blueprint_types.rs b/crates/re_data_ui/src/blueprint_types.rs index 1b2e1d1b0706..e059e72eb42a 100644 --- a/crates/re_data_ui/src/blueprint_types.rs +++ b/crates/re_data_ui/src/blueprint_types.rs @@ -12,7 +12,7 @@ impl DataUi for IncludedSpaceView { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let space_view: SpaceViewId = self.0.into(); @@ -27,7 +27,7 @@ impl DataUi for SpaceViewMaximized { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let space_view: SpaceViewId = self.0.into(); diff --git 
a/crates/re_data_ui/src/component.rs b/crates/re_data_ui/src/component.rs index 8398811528ae..3b7f44ad5e46 100644 --- a/crates/re_data_ui/src/component.rs +++ b/crates/re_data_ui/src/component.rs @@ -20,7 +20,7 @@ impl<'a> DataUi for EntityLatestAtResults<'a> { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let Some(component_name) = self.results.component_name(db.resolver()) else { diff --git a/crates/re_data_ui/src/component_name.rs b/crates/re_data_ui/src/component_name.rs index bc2f51dddebe..7c71ad80b07b 100644 --- a/crates/re_data_ui/src/component_name.rs +++ b/crates/re_data_ui/src/component_name.rs @@ -10,7 +10,7 @@ impl DataUi for ComponentName { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { if ui_layout == UiLayout::List { diff --git a/crates/re_data_ui/src/component_path.rs b/crates/re_data_ui/src/component_path.rs index b59e971c9141..9161e27a09ab 100644 --- a/crates/re_data_ui/src/component_path.rs +++ b/crates/re_data_ui/src/component_path.rs @@ -10,7 +10,7 @@ impl DataUi for ComponentPath { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let Self { diff --git a/crates/re_data_ui/src/component_ui_registry.rs b/crates/re_data_ui/src/component_ui_registry.rs index 7260b09df58a..c3bb679819e1 100644 --- a/crates/re_data_ui/src/component_ui_registry.rs +++ b/crates/re_data_ui/src/component_ui_registry.rs @@ -1,4 +1,4 @@ -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityDb; use re_log_types::{external::arrow2, EntityPath}; use re_types::external::arrow2::array::Utf8Array; diff --git a/crates/re_data_ui/src/data.rs b/crates/re_data_ui/src/data.rs index bba6c54b7fc3..be1f1675ddfd 100644 --- a/crates/re_data_ui/src/data.rs +++ b/crates/re_data_ui/src/data.rs @@ -1,7 +1,7 @@ use egui::Ui; -use re_data_store::LatestAtQuery; -use re_entity_db::EntityDb; +use re_chunk_store::LatestAtQuery; +use re_entity_db::EntityDb; use re_format::format_f32; use re_types::blueprint::components::VisualBounds2D; use re_types::components::{LineStrip2D, LineStrip3D, ViewCoordinates}; @@ -18,7 +18,7 @@ impl DataUi for ViewCoordinates { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { match ui_layout { @@ -42,7 +42,7 @@ impl DataUi for re_types::datatypes::Mat3x3 { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, _ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { egui::Grid::new("mat3").num_columns(3).show(ui, |ui| { @@ -70,7 +70,7 @@ impl DataUi for re_types::datatypes::Vec2D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui_layout.data_label(ui, self.to_string()); @@ -83,7 +83,7 @@ impl DataUi for re_types::datatypes::Vec3D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { 
ui_layout.data_label(ui, self.to_string()); @@ -96,7 +96,7 @@ impl DataUi for re_types::datatypes::Vec4D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui_layout.data_label(ui, self.to_string()); @@ -109,7 +109,7 @@ impl DataUi for re_types::datatypes::UVec2D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui_layout.data_label(ui, self.to_string()); @@ -122,7 +122,7 @@ impl DataUi for re_types::datatypes::UVec3D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui_layout.data_label(ui, self.to_string()); @@ -135,7 +135,7 @@ impl DataUi for re_types::datatypes::UVec4D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui_layout.data_label(ui, self.to_string()); @@ -161,7 +161,7 @@ impl DataUi for LineStrip2D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { match ui_layout { @@ -209,7 +209,7 @@ impl DataUi for LineStrip3D { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { match ui_layout { diff --git a/crates/re_data_ui/src/data_source.rs b/crates/re_data_ui/src/data_source.rs index 8e49e6b183eb..1793f5927fba 100644 --- a/crates/re_data_ui/src/data_source.rs +++ b/crates/re_data_ui/src/data_source.rs @@ -9,7 +9,7 @@ impl crate::DataUi for re_smart_channel::SmartChannelSource { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui.label(self.to_string()); diff --git a/crates/re_data_ui/src/entity_db.rs b/crates/re_data_ui/src/entity_db.rs index c5d0f3490736..acf21d194eef 100644 --- a/crates/re_data_ui/src/entity_db.rs +++ b/crates/re_data_ui/src/entity_db.rs @@ -12,7 +12,7 @@ impl crate::DataUi for EntityDb { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { if ui_layout == UiLayout::List { diff --git a/crates/re_data_ui/src/entity_path.rs b/crates/re_data_ui/src/entity_path.rs index d5f946bc4609..81aa63bf1181 100644 --- a/crates/re_data_ui/src/entity_path.rs +++ b/crates/re_data_ui/src/entity_path.rs @@ -9,7 +9,7 @@ impl DataUi for re_entity_db::EntityPath { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { InstancePath::entity_all(self.clone()).data_ui(ctx, ui, ui_layout, query, db); diff --git a/crates/re_data_ui/src/image.rs b/crates/re_data_ui/src/image.rs index c7288e026d09..6a585686a5b6 100644 --- a/crates/re_data_ui/src/image.rs +++ b/crates/re_data_ui/src/image.rs @@ -1,7 +1,8 @@ use egui::{Color32, Vec2}; use itertools::Itertools as _; -use re_log_types::{EntityPath, RowId}; +use 
re_chunk_store::RowId; +use re_log_types::EntityPath; use re_renderer::renderer::ColormappedTexture; use re_types::components::{ClassId, Colormap, DepthMeter}; use re_types::datatypes::{TensorBuffer, TensorData, TensorDimension}; @@ -54,7 +55,7 @@ impl EntityDataUi for re_types::components::TensorData { ui: &mut egui::Ui, ui_layout: UiLayout, entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { re_tracing::profile_function!(); @@ -94,7 +95,7 @@ impl EntityDataUi for re_types::components::TensorData { #[allow(clippy::too_many_arguments)] pub fn tensor_ui( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, ui_layout: UiLayout, diff --git a/crates/re_data_ui/src/image_meaning.rs b/crates/re_data_ui/src/image_meaning.rs index b92a0ceff54d..4b836b162468 100644 --- a/crates/re_data_ui/src/image_meaning.rs +++ b/crates/re_data_ui/src/image_meaning.rs @@ -7,8 +7,8 @@ use re_types::{ pub fn image_meaning_for_entity( entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, - store: &re_data_store::DataStore, + query: &re_chunk_store::LatestAtQuery, + store: &re_chunk_store::ChunkStore, ) -> TensorDataMeaning { let timeline = &query.timeline(); if store.entity_has_component(timeline, entity_path, &DepthImage::indicator().name()) { diff --git a/crates/re_data_ui/src/instance_path.rs b/crates/re_data_ui/src/instance_path.rs index a9cad38bbc9c..509556267470 100644 --- a/crates/re_data_ui/src/instance_path.rs +++ b/crates/re_data_ui/src/instance_path.rs @@ -11,7 +11,7 @@ impl DataUi for InstancePath { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let Self { diff --git a/crates/re_data_ui/src/item_ui.rs b/crates/re_data_ui/src/item_ui.rs index ef489b518663..1447130f798b 100644 --- a/crates/re_data_ui/src/item_ui.rs +++ b/crates/re_data_ui/src/item_ui.rs @@ -35,7 +35,7 @@ use super::DataUi; /// Show an entity path and make it selectable. pub fn entity_path_button( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -55,7 +55,7 @@ pub fn entity_path_button( /// Show the different parts of an entity path and make them selectable. pub fn entity_path_parts_buttons( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -93,7 +93,7 @@ pub fn entity_path_parts_buttons( /// Show an entity path and make it selectable. pub fn entity_path_button_to( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -114,7 +114,7 @@ pub fn entity_path_button_to( /// Show an instance id and make it selectable. pub fn instance_path_button( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -136,7 +136,7 @@ pub fn instance_path_button( /// The choice of icon is based on whether the instance is "empty" as in hasn't any logged component /// _on the current timeline_. 
pub fn instance_path_icon( - timeline: &re_data_store::Timeline, + timeline: &re_chunk_store::Timeline, db: &re_entity_db::EntityDb, instance_path: &InstancePath, ) -> &'static icons::Icon { @@ -166,7 +166,7 @@ pub fn instance_path_icon( pub fn guess_query_and_db_for_selected_entity<'a>( ctx: &'a ViewerContext<'_>, entity_path: &EntityPath, -) -> (re_data_store::LatestAtQuery, &'a re_entity_db::EntityDb) { +) -> (re_chunk_store::LatestAtQuery, &'a re_entity_db::EntityDb) { if ctx.app_options.inspect_blueprint_timeline && ctx.store_context.blueprint.is_logged_entity(entity_path) { @@ -193,7 +193,7 @@ pub fn guess_instance_path_icon( /// Show an instance id and make it selectable. pub fn instance_path_button_to( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -207,7 +207,7 @@ pub fn instance_path_button_to( #[allow(clippy::too_many_arguments)] fn instance_path_button_to_ex( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -242,7 +242,7 @@ fn instance_path_button_to_ex( /// Show the different parts of an instance path and make them selectable. pub fn instance_path_parts_buttons( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, space_view_id: Option, @@ -431,7 +431,7 @@ pub fn component_path_button_to( pub fn data_blueprint_button_to( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ui: &mut egui::Ui, text: impl Into, @@ -526,7 +526,7 @@ pub fn cursor_interact_with_selectable( pub fn instance_hover_card_ui( ui: &mut egui::Ui, ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, instance_path: &InstancePath, ) { @@ -561,7 +561,7 @@ pub fn instance_hover_card_ui( pub fn entity_hover_card_ui( ui: &mut egui::Ui, ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, entity_path: &EntityPath, ) { diff --git a/crates/re_data_ui/src/lib.rs b/crates/re_data_ui/src/lib.rs index 367999e25bd9..c065dbb6c4db 100644 --- a/crates/re_data_ui/src/lib.rs +++ b/crates/re_data_ui/src/lib.rs @@ -2,9 +2,7 @@ //! //! This crate provides ui elements for Rerun component data for the Rerun Viewer. -use itertools::Itertools; - -use re_log_types::{DataCell, EntityPath, TimePoint}; +use re_log_types::{EntityPath, TimePoint}; use re_types::ComponentName; use re_viewer_context::{UiLayout, ViewerContext}; @@ -53,13 +51,13 @@ pub fn sorted_component_list_for_ui<'a>( /// Types implementing [`DataUi`] can display themselves in an [`egui::Ui`]. pub trait DataUi { - /// If you need to lookup something in the data store, use the given query to do so. + /// If you need to lookup something in the chunk store, use the given query to do so. fn data_ui( &self, ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ); @@ -73,14 +71,14 @@ pub trait DataUi { /// /// This is given the context of the entity it is part of so it can do queries. 
pub trait EntityDataUi { - /// If you need to lookup something in the data store, use the given query to do so. + /// If you need to lookup something in the chunk store, use the given query to do so. fn entity_data_ui( &self, ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ); } @@ -95,7 +93,7 @@ where ui: &mut egui::Ui, ui_layout: UiLayout, entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { // This ensures that UI state is maintained per entity. For example, the collapsed state for @@ -114,7 +112,7 @@ impl DataUi for TimePoint { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, _ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { ui.vertical(|ui| { @@ -130,50 +128,11 @@ impl DataUi for TimePoint { } } -impl DataUi for [DataCell] { - fn data_ui( - &self, - _ctx: &ViewerContext<'_>, - ui: &mut egui::Ui, - ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, - _db: &re_entity_db::EntityDb, - ) { - let mut sorted = self.to_vec(); - sorted.sort_by_key(|cb| cb.component_name()); - - match ui_layout { - UiLayout::List => { - ui.label(sorted.iter().map(format_cell).join(", ")); - } - - UiLayout::SelectionPanelFull - | UiLayout::SelectionPanelLimitHeight - | UiLayout::Tooltip => { - ui.vertical(|ui| { - for component_bundle in &sorted { - ui.label(format_cell(component_bundle)); - } - }); - } - } - } -} - -fn format_cell(cell: &DataCell) -> String { - // TODO(emilk): if there's only once instance, and the byte size is small, then deserialize and show the value. 
- format!( - "{}x {}", - cell.num_instances(), - cell.component_name().short_name() - ) -} - // --------------------------------------------------------------------------- pub fn annotations( ctx: &ViewerContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, entity_path: &re_entity_db::EntityPath, ) -> std::sync::Arc { re_tracing::profile_function!(); diff --git a/crates/re_data_ui/src/log_msg.rs b/crates/re_data_ui/src/log_msg.rs index e4952e038027..644f12e71b07 100644 --- a/crates/re_data_ui/src/log_msg.rs +++ b/crates/re_data_ui/src/log_msg.rs @@ -1,11 +1,8 @@ -use re_log_types::{ - ArrowMsg, BlueprintActivationCommand, DataTable, LogMsg, SetStoreInfo, StoreInfo, -}; -use re_ui::{ContextExt as _, UiExt as _}; +use re_log_types::{BlueprintActivationCommand, LogMsg, SetStoreInfo, StoreInfo}; +use re_ui::UiExt as _; use re_viewer_context::{UiLayout, ViewerContext}; use super::DataUi; -use crate::item_ui; impl DataUi for LogMsg { fn data_ui( @@ -13,12 +10,12 @@ impl DataUi for LogMsg { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { match self { Self::SetStoreInfo(msg) => msg.data_ui(ctx, ui, ui_layout, query, db), - Self::ArrowMsg(_, msg) => msg.data_ui(ctx, ui, ui_layout, query, db), + Self::ArrowMsg(_, _) => {} Self::BlueprintActivationCommand(BlueprintActivationCommand { blueprint_id, make_active, @@ -38,7 +35,7 @@ impl DataUi for SetStoreInfo { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, _ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { let Self { row_id: _, info } = self; @@ -95,49 +92,3 @@ impl DataUi for SetStoreInfo { }); } } - -impl DataUi for ArrowMsg { - fn data_ui( - &self, - ctx: &ViewerContext<'_>, - ui: &mut egui::Ui, - ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, - db: &re_entity_db::EntityDb, - ) { - let table = match DataTable::from_arrow_msg(self) { - Ok(table) => table, - Err(err) => { - ui.label( - ui.ctx() - .error_text(format!("Error parsing ArrowMsg: {err}")), - ); - return; - } - }; - - // TODO(cmc): Come up with something a bit nicer once data tables become a common sight. 
- for row in table.to_rows() { - match row { - Ok(row) => { - egui::Grid::new("fields").num_columns(2).show(ui, |ui| { - ui.monospace("entity_path:"); - item_ui::entity_path_button(ctx, query, db, ui, None, row.entity_path()); - ui.end_row(); - - ui.monospace("time_point:"); - row.timepoint().data_ui(ctx, ui, ui_layout, query, db); - ui.end_row(); - - ui.monospace("components:"); - row.cells().data_ui(ctx, ui, ui_layout, query, db); - ui.end_row(); - }); - } - Err(err) => { - ui.label(ui.ctx().error_text(err.to_string())); - } - } - } - } -} diff --git a/crates/re_data_ui/src/pinhole.rs b/crates/re_data_ui/src/pinhole.rs index 76a08c9e940f..8b070c901558 100644 --- a/crates/re_data_ui/src/pinhole.rs +++ b/crates/re_data_ui/src/pinhole.rs @@ -10,7 +10,7 @@ impl DataUi for PinholeProjection { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { match ui_layout { @@ -44,7 +44,7 @@ impl DataUi for Resolution { _ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - _query: &re_data_store::LatestAtQuery, + _query: &re_chunk_store::LatestAtQuery, _db: &re_entity_db::EntityDb, ) { let [x, y] = self.0 .0; diff --git a/crates/re_data_ui/src/rotation3d.rs b/crates/re_data_ui/src/rotation3d.rs index 55edf2b05829..f90bacf12acb 100644 --- a/crates/re_data_ui/src/rotation3d.rs +++ b/crates/re_data_ui/src/rotation3d.rs @@ -12,7 +12,7 @@ impl DataUi for components::Rotation3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { self.0.data_ui(ctx, ui, ui_layout, query, db); @@ -25,7 +25,7 @@ impl DataUi for datatypes::Rotation3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { match self { diff --git a/crates/re_data_ui/src/store_id.rs b/crates/re_data_ui/src/store_id.rs index e8809dae0e75..39068dc4f1e9 100644 --- a/crates/re_data_ui/src/store_id.rs +++ b/crates/re_data_ui/src/store_id.rs @@ -6,7 +6,7 @@ impl crate::DataUi for re_log_types::StoreId { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { if let Some(entity_db) = ctx.store_context.bundle.get(self) { diff --git a/crates/re_data_ui/src/transform3d.rs b/crates/re_data_ui/src/transform3d.rs index de2a17ef84d8..9380ab260fc0 100644 --- a/crates/re_data_ui/src/transform3d.rs +++ b/crates/re_data_ui/src/transform3d.rs @@ -10,7 +10,7 @@ impl DataUi for re_types::components::Transform3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { match ui_layout { @@ -53,7 +53,7 @@ impl DataUi for re_types::components::OutOfTreeTransform3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { re_types::components::Transform3D(self.0).data_ui(ctx, ui, ui_layout, query, db); @@ -67,7 +67,7 @@ impl DataUi for Transform3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { 
match ui_layout { @@ -97,7 +97,7 @@ impl DataUi for TranslationRotationScale3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let Self { @@ -139,7 +139,7 @@ impl DataUi for Scale3D { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { match self { @@ -159,7 +159,7 @@ impl DataUi for TranslationAndMat3x3 { ctx: &ViewerContext<'_>, ui: &mut egui::Ui, ui_layout: UiLayout, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, db: &re_entity_db::EntityDb, ) { let Self { diff --git a/crates/re_edit_ui/src/color.rs b/crates/re_edit_ui/src/color.rs index bc58344d2934..b4a0b4663071 100644 --- a/crates/re_edit_ui/src/color.rs +++ b/crates/re_edit_ui/src/color.rs @@ -1,7 +1,7 @@ use re_types::{components::Color, external::arrow2, Loggable}; use re_ui::UiExt; use re_viewer_context::{ - external::{re_data_store::LatestAtQuery, re_entity_db::EntityDb, re_log_types::EntityPath}, + external::{re_chunk_store::LatestAtQuery, re_entity_db::EntityDb, re_log_types::EntityPath}, UiLayout, ViewerContext, }; diff --git a/crates/re_edit_ui/src/datatype_editors/singleline_string.rs b/crates/re_edit_ui/src/datatype_editors/singleline_string.rs index ce64d3bf86f4..7556f4a583e9 100644 --- a/crates/re_edit_ui/src/datatype_editors/singleline_string.rs +++ b/crates/re_edit_ui/src/datatype_editors/singleline_string.rs @@ -5,7 +5,7 @@ use re_types::{ }; use re_ui::UiExt as _; use re_viewer_context::{ - external::{re_data_store::LatestAtQuery, re_entity_db::EntityDb, re_log_types::EntityPath}, + external::{re_chunk_store::LatestAtQuery, re_entity_db::EntityDb, re_log_types::EntityPath}, UiLayout, ViewerContext, }; diff --git a/crates/re_entity_db/Cargo.toml b/crates/re_entity_db/Cargo.toml index 41e755c9330c..a094f0aaf9e1 100644 --- a/crates/re_entity_db/Cargo.toml +++ b/crates/re_entity_db/Cargo.toml @@ -28,7 +28,8 @@ serde = ["dep:serde", "re_log_types/serde"] [dependencies] re_build_info.workspace = true -re_data_store.workspace = true +re_chunk = { workspace = true, features = ["serde"] } +re_chunk_store.workspace = true re_format.workspace = true re_int_histogram.workspace = true re_log.workspace = true diff --git a/crates/re_entity_db/examples/memory_usage.rs b/crates/re_entity_db/examples/memory_usage.rs index 36354232eacb..78fab26a30b5 100644 --- a/crates/re_entity_db/examples/memory_usage.rs +++ b/crates/re_entity_db/examples/memory_usage.rs @@ -51,7 +51,8 @@ fn live_bytes() -> usize { // ---------------------------------------------------------------------------- -use re_log_types::{entity_path, example_components::MyPoint, DataRow, RowId, StoreId, StoreKind}; +use re_chunk::{Chunk, RowId}; +use re_log_types::{entity_path, example_components::MyPoint, StoreId, StoreKind}; fn main() { log_messages(); @@ -114,23 +115,21 @@ fn log_messages() { { let used_bytes_start = live_bytes(); - let table = Box::new( - DataRow::from_cells1( + let chunk = Chunk::builder("points".into()) + .with_component_batches( RowId::new(), - entity_path!("points"), [build_frame_nr(TimeInt::ZERO)], - MyPoint::from_iter(0..1), + [&MyPoint::from_iter(0..1) as _], ) - .unwrap() - .into_table(), - ); - let table_bytes = live_bytes() - used_bytes_start; + .build() + .unwrap(); + let chunk_bytes = live_bytes() - used_bytes_start; let log_msg = 
Box::new(LogMsg::ArrowMsg( store_id.clone(), - table.to_arrow_msg().unwrap(), + chunk.to_arrow_msg().unwrap(), )); let log_msg_bytes = live_bytes() - used_bytes_start; - println!("Arrow payload containing a Pos2 uses {table_bytes} bytes in RAM"); + println!("Arrow payload containing a Pos2 uses {chunk_bytes} bytes in RAM"); let encoded = encode_log_msg(&log_msg); println!( "Arrow LogMsg containing a Pos2 uses {}-{log_msg_bytes} bytes in RAM, and {} bytes encoded", @@ -140,20 +139,18 @@ fn log_messages() { { let used_bytes_start = live_bytes(); - let table = Box::new( - DataRow::from_cells1( + let chunk = Chunk::builder("points".into()) + .with_component_batches( RowId::new(), - entity_path!("points"), [build_frame_nr(TimeInt::ZERO)], - MyPoint::from_iter(0..NUM_POINTS as u32), + [&MyPoint::from_iter(0..NUM_POINTS as u32) as _], ) - .unwrap() - .into_table(), - ); - let table_bytes = live_bytes() - used_bytes_start; - let log_msg = Box::new(LogMsg::ArrowMsg(store_id, table.to_arrow_msg().unwrap())); + .build() + .unwrap(); + let chunk_bytes = live_bytes() - used_bytes_start; + let log_msg = Box::new(LogMsg::ArrowMsg(store_id, chunk.to_arrow_msg().unwrap())); let log_msg_bytes = live_bytes() - used_bytes_start; - println!("Arrow payload containing a Pos2 uses {table_bytes} bytes in RAM"); + println!("Arrow payload containing a Pos2 uses {chunk_bytes} bytes in RAM"); let encoded = encode_log_msg(&log_msg); println!( "Arrow LogMsg containing {NUM_POINTS}x Pos2 uses {}-{log_msg_bytes} bytes in RAM, and {} bytes encoded", diff --git a/crates/re_entity_db/src/entity_db.rs b/crates/re_entity_db/src/entity_db.rs index f9991444e872..f98714a52cd7 100644 --- a/crates/re_entity_db/src/entity_db.rs +++ b/crates/re_entity_db/src/entity_db.rs @@ -1,14 +1,17 @@ +use std::sync::Arc; + use itertools::Itertools; use nohash_hasher::IntMap; use parking_lot::Mutex; -use re_data_store::{ - DataStore, DataStoreConfig, GarbageCollectionOptions, StoreEvent, StoreSubscriber, +use re_chunk::{Chunk, ChunkResult, RowId}; +use re_chunk_store::{ + ChunkStore, ChunkStoreConfig, ChunkStoreEvent, ChunkStoreSubscriber, GarbageCollectionOptions, + GarbageCollectionTarget, }; use re_log_types::{ - ApplicationId, ComponentPath, DataRow, DataTable, DataTableResult, EntityPath, EntityPathHash, - LogMsg, ResolvedTimeRange, ResolvedTimeRangeF, RowId, SetStoreInfo, StoreId, StoreInfo, - StoreKind, Timeline, + ApplicationId, ComponentPath, EntityPath, EntityPathHash, LogMsg, ResolvedTimeRange, + ResolvedTimeRangeF, SetStoreInfo, StoreId, StoreInfo, StoreKind, Timeline, }; use re_types_core::{Archetype, Loggable}; @@ -16,62 +19,9 @@ use crate::{Error, TimesPerTimeline}; // ---------------------------------------------------------------------------- -/// See [`insert_row_with_retries`]. -const MAX_INSERT_ROW_ATTEMPTS: usize = 1_000; - -/// See [`insert_row_with_retries`]. -const DEFAULT_INSERT_ROW_STEP_SIZE: u64 = 100; - /// See [`GarbageCollectionOptions::time_budget`]. const DEFAULT_GC_TIME_BUDGET: std::time::Duration = std::time::Duration::from_micros(3500); // empirical -/// Inserts a [`DataRow`] into the [`DataStore`], retrying in case of duplicated `RowId`s. -/// -/// Retries a maximum of `num_attempts` times if the row couldn't be inserted because of a -/// duplicated [`RowId`], bumping the [`RowId`]'s internal counter by a random number -/// (up to `step_size`) between attempts. -/// -/// Returns the actual [`DataRow`] that was successfully inserted, if any. 
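A minimal sketch of the chunk-building pattern the `memory_usage` example above migrates to — assuming only the `re_chunk` builder API shown in this diff (`Chunk::builder`, `with_component_batches`, `build`) and that `build_frame_nr` / `MyPoint` are in scope as in the surrounding file; this is an illustration, not part of the patch:

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId};
use re_log_types::{build_frame_nr, example_components::MyPoint, TimeInt};

fn example_chunk() -> anyhow::Result<Arc<Chunk>> {
    // One row on the `frame_nr` timeline, carrying a batch of three `MyPoint`s.
    let chunk = Chunk::builder("points".into())
        .with_component_batches(
            RowId::new(),
            [build_frame_nr(TimeInt::ZERO)],
            [&MyPoint::from_iter(0..3) as _],
        )
        .build()?;

    // Chunks are handed to an `EntityDb` behind an `Arc` (see `add_chunk` below).
    Ok(Arc::new(chunk))
}
```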
-/// -/// The default value of `num_attempts` (see [`MAX_INSERT_ROW_ATTEMPTS`]) should be (way) more than -/// enough for all valid use cases. -/// -/// When using this function, please add a comment explaining the rationale. -fn insert_row_with_retries( - store: &mut DataStore, - mut row: DataRow, - num_attempts: usize, - step_size: u64, -) -> re_data_store::WriteResult { - fn random_u64() -> u64 { - let mut bytes = [0_u8; 8]; - getrandom::getrandom(&mut bytes).map_or(0, |_| u64::from_le_bytes(bytes)) - } - - for i in 0..num_attempts { - match store.insert_row(&row) { - Ok(event) => return Ok(event), - Err(re_data_store::WriteError::ReusedRowId(_)) => { - // TODO(#1894): currently we produce duplicate row-ids when hitting the "save" button. - // This means we hit this code path when loading an .rrd file that was saved from the viewer. - // In the future a row-id clash should probably either be considered an error (with a loud warning) - // or an ignored idempotent operation (with the assumption that if the RowId is the same, so is the data). - // In any case, we cannot log loudly here. - re_log::trace!( - "Found duplicated RowId ({}) during insert. Incrementing it by random offset (retry {}/{})…", - row.row_id, - i + 1, - num_attempts - ); - row.row_id = row.row_id.incremented_by(random_u64() % step_size + 1); - } - Err(err) => return Err(err), - } - } - - Err(re_data_store::WriteError::ReusedRowId(row.row_id())) -} - // ---------------------------------------------------------------------------- /// An in-memory database built from a stream of [`LogMsg`]es. @@ -109,7 +59,7 @@ pub struct EntityDb { tree: crate::EntityTree, /// Stores all components for all entities for all timelines. - data_store: DataStore, + data_store: ChunkStore, /// The active promise resolver for this DB. resolver: re_query::PromiseResolver, @@ -122,9 +72,9 @@ pub struct EntityDb { impl EntityDb { pub fn new(store_id: StoreId) -> Self { - let data_store = - re_data_store::DataStore::new(store_id.clone(), DataStoreConfig::default()); + let data_store = ChunkStore::new(store_id.clone(), ChunkStoreConfig::default()); let query_caches = re_query::Caches::new(&data_store); + Self { data_source: None, set_store_info: None, @@ -140,34 +90,13 @@ impl EntityDb { } } - /// Helper function to create a recording from a [`StoreInfo`] and some [`DataRow`]s. - /// - /// This is useful to programmatically create recordings from within the viewer, which cannot - /// use the `re_sdk`, which is not Wasm-compatible. 
- pub fn from_info_and_rows( - store_info: StoreInfo, - rows: impl IntoIterator, - ) -> Result { - let mut entity_db = Self::new(store_info.store_id.clone()); - - entity_db.set_store_info(SetStoreInfo { - row_id: RowId::new(), - info: store_info, - }); - for row in rows { - entity_db.add_data_row(row)?; - } - - Ok(entity_db) - } - #[inline] pub fn tree(&self) -> &crate::EntityTree { &self.tree } #[inline] - pub fn data_store(&self) -> &DataStore { + pub fn data_store(&self) -> &ChunkStore { &self.data_store } @@ -201,7 +130,7 @@ impl EntityDb { #[inline] pub fn latest_at( &self, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, entity_path: &EntityPath, component_names: impl IntoIterator, ) -> re_query::LatestAtResults { @@ -221,7 +150,7 @@ impl EntityDb { pub fn latest_at_component( &self, entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, ) -> Option> { self.query_caches().latest_at_component::( self.store(), @@ -243,7 +172,7 @@ impl EntityDb { pub fn latest_at_component_quiet( &self, entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, ) -> Option> { self.query_caches().latest_at_component_quiet::( self.store(), @@ -257,7 +186,7 @@ impl EntityDb { pub fn latest_at_component_at_closest_ancestor( &self, entity_path: &EntityPath, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, ) -> Option<(EntityPath, re_query::LatestAtMonoResult)> { self.query_caches() .latest_at_component_at_closest_ancestor::( @@ -269,7 +198,7 @@ impl EntityDb { } #[inline] - pub fn store(&self) -> &DataStore { + pub fn store(&self) -> &ChunkStore { &self.data_store } @@ -335,14 +264,15 @@ impl EntityDb { } } - pub fn num_rows(&self) -> usize { - self.data_store.num_static_rows() as usize + self.data_store.num_temporal_rows() as usize + #[inline] + pub fn num_rows(&self) -> u64 { + self.data_store.stats().total().total_num_rows } - /// Return the current `StoreGeneration`. This can be used to determine whether the + /// Return the current `ChunkStoreGeneration`. This can be used to determine whether the /// database has been modified since the last time it was queried. #[inline] - pub fn generation(&self) -> re_data_store::StoreGeneration { + pub fn generation(&self) -> re_chunk_store::ChunkStoreGeneration { self.data_store.generation() } @@ -402,8 +332,11 @@ impl EntityDb { LogMsg::SetStoreInfo(msg) => self.set_store_info(msg.clone()), LogMsg::ArrowMsg(_, arrow_msg) => { - let table = DataTable::from_arrow_msg(arrow_msg)?; - self.add_data_table(table)?; + self.last_modified_at = web_time::Instant::now(); + + let mut chunk = re_chunk::Chunk::from_arrow_msg(arrow_msg)?; + chunk.sort_if_unsorted(); + self.add_chunk(&Arc::new(chunk))?; } LogMsg::BlueprintActivationCommand(_) => { @@ -414,59 +347,26 @@ impl EntityDb { Ok(()) } - pub fn add_data_table(&mut self, mut table: DataTable) -> Result<(), Error> { - // TODO(#1760): Compute the size of the datacells in the batching threads on the clients. - table.compute_all_size_bytes(); - - for row in table.to_rows() { - self.add_data_row(row?)?; - } - - self.last_modified_at = web_time::Instant::now(); + pub fn add_chunk(&mut self, chunk: &Arc) -> Result<(), Error> { + let store_event = self.data_store.insert_chunk(chunk)?; - Ok(()) - } + self.register_entity_path(chunk.entity_path()); - /// Inserts a [`DataRow`] into the database. 
- pub fn add_data_row(&mut self, row: DataRow) -> Result<(), Error> { - re_tracing::profile_function!(format!("num_cells={}", row.num_cells())); + if self.latest_row_id < chunk.row_id_range().map(|(_, row_id_max)| row_id_max) { + self.latest_row_id = chunk.row_id_range().map(|(_, row_id_max)| row_id_max); + } - self.register_entity_path(&row.entity_path); + if let Some(store_event) = store_event { + // Update our internal views by notifying them of resulting [`ChunkStoreEvent`]s. + let original_store_events = &[store_event]; + self.times_per_timeline.on_events(original_store_events); + self.query_caches.on_events(original_store_events); + self.tree.on_store_additions(original_store_events); - if self - .latest_row_id - .map_or(true, |latest| latest < row.row_id) - { - self.latest_row_id = Some(row.row_id); + // We inform the stats last, since it measures e2e latency. + self.stats.on_events(original_store_events); } - // ## RowId duplication - // - // We shouldn't be attempting to retry in this instance: a duplicated RowId at this stage - // is likely a user error. - // - // We only do so because, the way our 'save' feature is currently implemented in the - // viewer can result in a single row's worth of data to be split across several insertions - // when loading that data back (because we dump per-bucket, and RowIds get duplicated - // across buckets). - // - // TODO(#1894): Remove this once the save/load process becomes RowId-driven. - let store_event = insert_row_with_retries( - &mut self.data_store, - row, - MAX_INSERT_ROW_ATTEMPTS, - DEFAULT_INSERT_ROW_STEP_SIZE, - )?; - - // Update our internal views by notifying them of resulting [`StoreEvent`]s. - let original_store_events = &[store_event]; - self.times_per_timeline.on_events(original_store_events); - self.query_caches.on_events(original_store_events); - self.tree.on_store_additions(original_store_events); - - // We inform the stats last, since it measures e2e latency. 
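Roughly how a caller drives the new ingestion path that replaces `add_data_table`/`add_data_row` — a sketch only, assuming the `EntityDb::new` and `add_chunk` signatures visible in this hunk:

```rust
use std::sync::Arc;

use re_chunk::Chunk;
use re_entity_db::EntityDb;
use re_log_types::{StoreId, StoreKind};

fn ingest(chunk: Chunk) -> anyhow::Result<()> {
    let mut db = EntityDb::new(StoreId::random(StoreKind::Recording));

    // One call replaces the old per-row loop: the chunk goes into the
    // `ChunkStore`, and the resulting `ChunkStoreEvent` is fanned out to the
    // time histograms, query caches, entity tree, and ingestion stats.
    db.add_chunk(&Arc::new(chunk))?;

    Ok(())
}
```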
- self.stats.on_events(original_store_events); - Ok(()) } @@ -484,9 +384,8 @@ impl EntityDb { re_tracing::profile_function!(); self.gc(&GarbageCollectionOptions { - target: re_data_store::GarbageCollectionTarget::Everything, + target: GarbageCollectionTarget::Everything, protect_latest: 1, // TODO(jleibs): Bump this after we have an undo buffer - purge_empty_tables: true, dont_protect_components: [ re_types_core::components::ClearIsRecursive::name(), re_types_core::archetypes::Clear::indicator().name(), @@ -496,7 +395,6 @@ impl EntityDb { dont_protect_timelines: [Timeline::log_tick(), Timeline::log_time()] .into_iter() .collect(), - enable_batching: false, time_budget: DEFAULT_GC_TIME_BUDGET, }); } @@ -507,14 +405,10 @@ impl EntityDb { assert!((0.0..=1.0).contains(&fraction_to_purge)); self.gc(&GarbageCollectionOptions { - target: re_data_store::GarbageCollectionTarget::DropAtLeastFraction( - fraction_to_purge as _, - ), + target: GarbageCollectionTarget::DropAtLeastFraction(fraction_to_purge as _), protect_latest: 1, - purge_empty_tables: false, dont_protect_components: Default::default(), dont_protect_timelines: Default::default(), - enable_batching: false, time_budget: DEFAULT_GC_TIME_BUDGET, }); } @@ -526,14 +420,14 @@ impl EntityDb { re_log::trace!( num_row_ids_dropped = store_events.len(), - size_bytes_dropped = re_format::format_bytes(stats_diff.total.num_bytes as _), + size_bytes_dropped = re_format::format_bytes(stats_diff.total().total_size_bytes as _), "purged datastore" ); self.on_store_deletions(&store_events); } - fn on_store_deletions(&mut self, store_events: &[StoreEvent]) { + fn on_store_deletions(&mut self, store_events: &[ChunkStoreEvent]) { re_tracing::profile_function!(); let Self { @@ -570,11 +464,9 @@ impl EntityDb { pub fn to_messages( &self, time_selection: Option<(Timeline, ResolvedTimeRangeF)>, - ) -> DataTableResult> { + ) -> ChunkResult> { re_tracing::profile_function!(); - self.store().sort_indices_if_needed(); - let set_store_info_msg = self .store_info_msg() .map(|msg| Ok(LogMsg::SetStoreInfo(msg.clone()))); @@ -586,11 +478,28 @@ impl EntityDb { ) }); - let data_messages = self.store().to_data_tables(time_filter).map(|table| { - table - .to_arrow_msg() - .map(|msg| LogMsg::ArrowMsg(self.store_id().clone(), msg)) - }); + let data_messages = self + .store() + .iter_chunks() + .filter(|chunk| { + let Some((timeline, time_range)) = time_filter else { + return true; + }; + + // TODO(cmc): chunk.slice_time_selection(time_selection) + chunk + .timelines() + .get(&timeline) + .map_or(false, |time_chunk| { + time_range.contains(time_chunk.time_range().min()) + || time_range.contains(time_chunk.time_range().max()) + }) + }) + .map(|chunk| { + chunk + .to_arrow_msg() + .map(|msg| LogMsg::ArrowMsg(self.store_id().clone(), msg)) + }); // If this is a blueprint, make sure to include the `BlueprintActivationCommand` message. // We generally use `to_messages` to export a blueprint via "save". In that @@ -620,8 +529,6 @@ impl EntityDb { pub fn clone_with_new_id(&self, new_id: StoreId) -> Result { re_tracing::profile_function!(); - self.store().sort_indices_if_needed(); - let mut new_db = Self::new(new_id.clone()); new_db.last_modified_at = self.last_modified_at; @@ -641,13 +548,13 @@ impl EntityDb { new_info.cloned_from = Some(self.store_id().clone()); new_db.set_store_info(SetStoreInfo { - row_id: RowId::new(), + row_id: *RowId::new(), info: new_info, }); } - for row in self.store().to_rows()? 
{ - new_db.add_data_row(row)?; + for chunk in self.store().iter_chunks() { + new_db.add_chunk(&Arc::clone(chunk))?; } Ok(new_db) @@ -655,9 +562,10 @@ impl EntityDb { } impl re_types_core::SizeBytes for EntityDb { + #[inline] fn heap_size_bytes(&self) -> u64 { // TODO(emilk): size of entire EntityDb, including secondary indices etc - self.data_store.heap_size_bytes() + self.data_store().stats().total().total_size_bytes } } @@ -668,23 +576,29 @@ pub struct IngestionStatistics { e2e_latency_sec_history: Mutex>, } -impl StoreSubscriber for IngestionStatistics { +impl ChunkStoreSubscriber for IngestionStatistics { + #[inline] fn name(&self) -> String { "rerun.testing.store_subscribers.IngestionStatistics".into() } + #[inline] fn as_any(&self) -> &dyn std::any::Any { self } + #[inline] fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } - fn on_events(&mut self, events: &[StoreEvent]) { + #[inline] + fn on_events(&mut self, events: &[ChunkStoreEvent]) { for event in events { if event.store_id == self.store_id { - self.on_new_row_id(event.row_id); + for row_id in event.diff.chunk.row_ids() { + self.on_new_row_id(row_id); + } } } } diff --git a/crates/re_entity_db/src/entity_tree.rs b/crates/re_entity_db/src/entity_tree.rs index 5640b6f564ea..bd8c071fdad7 100644 --- a/crates/re_entity_db/src/entity_tree.rs +++ b/crates/re_entity_db/src/entity_tree.rs @@ -4,21 +4,20 @@ use ahash::HashSet; use itertools::Itertools; use nohash_hasher::IntMap; -use re_data_store::{StoreDiff, StoreDiffKind, StoreEvent, StoreSubscriber}; -use re_log_types::{ - ComponentPath, EntityPath, EntityPathHash, EntityPathPart, RowId, TimeInt, Timeline, -}; +use re_chunk::RowId; +use re_chunk_store::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent, ChunkStoreSubscriber}; +use re_log_types::{ComponentPath, EntityPath, EntityPathHash, EntityPathPart, TimeInt, Timeline}; use re_types_core::ComponentName; // Used all over in docstrings. #[allow(unused_imports)] -use re_data_store::DataStore; +use re_chunk_store::ChunkStore; use crate::TimeHistogramPerTimeline; // ---------------------------------------------------------------------------- -/// A recursive, manually updated [`re_data_store::StoreSubscriber`] that maintains the entity hierarchy. +/// A recursive, manually updated [`ChunkStoreSubscriber`] that maintains the entity hierarchy. /// /// The tree contains a list of subtrees, and so on recursively. pub struct EntityTree { @@ -35,9 +34,9 @@ pub struct EntityTree { pub subtree: SubtreeInfo, } -// NOTE: This is only to let people know that this is in fact a [`StoreSubscriber`], so they A) don't try +// NOTE: This is only to let people know that this is in fact a [`ChunkStoreSubscriber`], so they A) don't try // to implement it on their own and B) don't try to register it. -impl StoreSubscriber for EntityTree { +impl ChunkStoreSubscriber for EntityTree { fn name(&self) -> String { "rerun.store_subscribers.EntityTree".into() } @@ -51,7 +50,7 @@ impl StoreSubscriber for EntityTree { } #[allow(clippy::unimplemented)] - fn on_events(&mut self, _events: &[StoreEvent]) { + fn on_events(&mut self, _events: &[ChunkStoreEvent]) { unimplemented!( r"EntityTree view is maintained manually, see `EntityTree::on_store_{{additions|deletions}}`" ); @@ -89,38 +88,44 @@ pub struct SubtreeInfo { impl SubtreeInfo { /// Assumes the event has been filtered to be part of this subtree. 
- fn on_event(&mut self, event: &StoreEvent) { + fn on_event(&mut self, event: &ChunkStoreEvent) { use re_types_core::SizeBytes as _; match event.kind { - StoreDiffKind::Addition => { - self.time_histogram - .add(&event.times, event.num_components() as _); - - for cell in event.cells.values() { - self.data_bytes += cell.total_size_bytes(); - } + ChunkStoreDiffKind::Addition => { + let times = event + .chunk + .timelines() + .iter() + .map(|(&timeline, time_chunk)| (timeline, time_chunk.times_raw())) + .collect_vec(); + self.time_histogram.add(×, event.num_components() as _); + + self.data_bytes += event.chunk.total_size_bytes(); } - StoreDiffKind::Deletion => { + ChunkStoreDiffKind::Deletion => { + let times = event + .chunk + .timelines() + .iter() + .map(|(&timeline, time_chunk)| (timeline, time_chunk.times_raw())) + .collect_vec(); self.time_histogram - .remove(&event.timepoint(), event.num_components() as _); - - for cell in event.cells.values() { - let removed_bytes = cell.total_size_bytes(); - self.data_bytes = - self.data_bytes - .checked_sub(removed_bytes) - .unwrap_or_else(|| { - re_log::debug!( - store_id = %event.store_id, - entity_path = %event.diff.entity_path, - current = self.data_bytes, - removed = removed_bytes, - "book keeping underflowed" - ); - u64::MIN - }); - } + .remove(×, event.num_components() as _); + + let removed_bytes = event.chunk.total_size_bytes(); + self.data_bytes + .checked_sub(removed_bytes) + .unwrap_or_else(|| { + re_log::debug!( + store_id = %event.store_id, + entity_path = %event.chunk.entity_path(), + current = self.data_bytes, + removed = removed_bytes, + "book keeping underflowed" + ); + u64::MIN + }); } } } @@ -132,7 +137,7 @@ impl SubtreeInfo { } } -/// Maintains an optimized representation of a batch of [`StoreEvent`]s specifically designed to +/// Maintains an optimized representation of a batch of [`ChunkStoreEvent`]s specifically designed to /// accelerate garbage collection of [`EntityTree`]s. /// /// See [`EntityTree::on_store_deletions`]. 
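The `(timeline, raw times)` flattening above recurs in several of these subscriber ports; as a helper sketch it amounts to the following (same calls as the surrounding hunks, nothing new assumed):

```rust
use itertools::Itertools as _;
use re_chunk::Chunk;
use re_log_types::Timeline;

/// Flatten a chunk's time columns into the `&[(Timeline, &[i64])]` shape that
/// the per-timeline histogram bookkeeping consumes.
fn times_per_timeline(chunk: &Chunk) -> Vec<(Timeline, &[i64])> {
    chunk
        .timelines()
        .iter()
        .map(|(&timeline, time_chunk)| (timeline, time_chunk.times_raw()))
        .collect_vec()
}
```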
@@ -149,26 +154,40 @@ pub struct CompactedStoreEvents { } impl CompactedStoreEvents { - pub fn new(store_events: &[&StoreEvent]) -> Self { + pub fn new(store_events: &[&ChunkStoreEvent]) -> Self { let mut this = Self { - row_ids: store_events.iter().map(|event| event.row_id).collect(), + row_ids: store_events + .iter() + .flat_map(|event| event.chunk.row_ids()) + .collect(), temporal: Default::default(), timeless: Default::default(), }; for event in store_events { if event.is_static() { - let per_component = this.timeless.entry(event.entity_path.hash()).or_default(); - for component_name in event.cells.keys() { - *per_component.entry(*component_name).or_default() += + let per_component = this + .timeless + .entry(event.chunk.entity_path().hash()) + .or_default(); + for component_name in event.chunk.component_names() { + *per_component.entry(component_name).or_default() += event.delta().unsigned_abs(); } } else { - for &(timeline, time) in &event.times { - let per_timeline = this.temporal.entry(event.entity_path.hash()).or_default(); - let per_component = per_timeline.entry(timeline).or_default(); - for component_name in event.cells.keys() { - per_component.entry(*component_name).or_default().push(time); + for (&timeline, time_chunk) in event.chunk.timelines() { + let per_timeline = this + .temporal + .entry(event.chunk.entity_path().hash()) + .or_default(); + for &time in time_chunk.times_raw() { + let per_component = per_timeline.entry(timeline).or_default(); + for component_name in event.chunk.component_names() { + per_component + .entry(component_name) + .or_default() + .push(TimeInt::new_temporal(time)); + } } } } @@ -217,21 +236,24 @@ impl EntityTree { .and_then(|per_timeline| per_timeline.get(timeline)) } - /// Updates the [`EntityTree`] by applying a batch of [`StoreEvent`]s. + /// Updates the [`EntityTree`] by applying a batch of [`ChunkStoreEvent`]s. /// /// Only reacts to additions (`event.kind == StoreDiffKind::Addition`). - pub fn on_store_additions(&mut self, events: &[StoreEvent]) { + pub fn on_store_additions(&mut self, events: &[ChunkStoreEvent]) { re_tracing::profile_function!(); - for event in events.iter().filter(|e| e.kind == StoreDiffKind::Addition) { + for event in events + .iter() + .filter(|e| e.kind == ChunkStoreDiffKind::Addition) + { self.on_store_addition(event); } } - fn on_store_addition(&mut self, event: &StoreEvent) { + fn on_store_addition(&mut self, event: &ChunkStoreEvent) { re_tracing::profile_function!(); - let entity_path = &event.diff.entity_path; + let entity_path = event.chunk.entity_path(); // Book-keeping for each level in the hierarchy: let mut tree = self; @@ -250,24 +272,32 @@ impl EntityTree { } /// Handles the addition of new data into the tree. - fn on_added_data(&mut self, store_diff: &StoreDiff) { - for component_name in store_diff.cells.keys() { + fn on_added_data(&mut self, store_diff: &ChunkStoreDiff) { + for component_name in store_diff.chunk.component_names() { let component_path = - ComponentPath::new(store_diff.entity_path.clone(), *component_name); + ComponentPath::new(store_diff.chunk.entity_path().clone(), component_name); let per_component = self .entity .components .entry(component_path.component_name) .or_default(); - per_component.add(&store_diff.times, 1); + per_component.add( + &store_diff + .chunk + .timelines() + .iter() + .map(|(&timeline, time_chunk)| (timeline, time_chunk.times_raw())) + .collect_vec(), + 1, + ); } } - /// Updates the [`EntityTree`] by applying a batch of [`StoreEvent`]s. 
+ /// Updates the [`EntityTree`] by applying a batch of [`ChunkStoreEvent`]s. /// /// Only reacts to deletions (`event.kind == StoreDiffKind::Deletion`). - pub fn on_store_deletions(&mut self, store_events: &[&StoreEvent]) { + pub fn on_store_deletions(&mut self, store_events: &[&ChunkStoreEvent]) { re_tracing::profile_function!(); let Self { @@ -280,18 +310,29 @@ impl EntityTree { // Only keep events relevant to this branch of the tree. let subtree_events = store_events .iter() - .filter(|e| e.entity_path.starts_with(path)) + .filter(|e| e.diff.chunk.entity_path().starts_with(path)) .copied() // NOTE: not actually copying, just removing the superfluous ref layer .collect_vec(); { re_tracing::profile_scope!("entity"); - for event in subtree_events.iter().filter(|e| &e.entity_path == path) { - for component_name in event.cells.keys() { - if let Some(histo) = entity.components.get_mut(component_name) { - histo.remove(&event.timepoint(), 1); + for event in subtree_events + .iter() + .filter(|e| e.chunk.entity_path() == path) + { + for component_name in event.chunk.component_names() { + if let Some(histo) = entity.components.get_mut(&component_name) { + histo.remove( + &event + .chunk + .timelines() + .iter() + .map(|(timeline, time_chunk)| (*timeline, time_chunk.times_raw())) + .collect_vec(), + 1, + ); if histo.is_empty() { - entity.components.remove(component_name); + entity.components.remove(&component_name); } } } diff --git a/crates/re_entity_db/src/instance_path.rs b/crates/re_entity_db/src/instance_path.rs index 40308e999466..c11e86676187 100644 --- a/crates/re_entity_db/src/instance_path.rs +++ b/crates/re_entity_db/src/instance_path.rs @@ -1,6 +1,7 @@ use std::{hash::Hash, str::FromStr}; -use re_log_types::{DataPath, EntityPath, EntityPathHash, Instance, PathParseError, RowId}; +use re_chunk::RowId; +use re_log_types::{DataPath, EntityPath, EntityPathHash, Instance, PathParseError}; use crate::{EntityDb, VersionedInstancePath, VersionedInstancePathHash}; diff --git a/crates/re_entity_db/src/lib.rs b/crates/re_entity_db/src/lib.rs index f8852d1d9c31..2ef69e2b24c2 100644 --- a/crates/re_entity_db/src/lib.rs +++ b/crates/re_entity_db/src/lib.rs @@ -22,33 +22,26 @@ pub use self::{ versioned_instance_path::{VersionedInstancePath, VersionedInstancePathHash}, }; -use re_log_types::DataTableError; pub use re_log_types::{EntityPath, EntityPathPart, TimeInt, Timeline}; pub mod external { - pub use re_data_store; + pub use re_chunk_store; pub use re_query; } // ---------------------------------------------------------------------------- -/// The errors that can occur when misusing the data store. +/// The errors that can occur when misusing the chunk store. /// /// Most of these indicate a problem with either the logging SDK, /// or how the logging SDK is being used (PEBKAC). 
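The `EntityTree` and `IngestionStatistics` ports above, and the `TimeHistogramPerTimeline` / `TimesPerTimeline` ports below, all go through `ChunkStoreSubscriber`. For illustration only — assuming the four methods visible in these hunks are the whole required surface — a toy subscriber might look like:

```rust
use re_chunk_store::{ChunkStoreDiffKind, ChunkStoreEvent, ChunkStoreSubscriber};

/// Toy subscriber that tallies how many chunk additions and deletions it has seen.
#[derive(Default)]
struct ChunkCounter {
    additions: u64,
    deletions: u64,
}

impl ChunkStoreSubscriber for ChunkCounter {
    fn name(&self) -> String {
        "example.ChunkCounter".into()
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }

    fn on_events(&mut self, events: &[ChunkStoreEvent]) {
        for event in events {
            match event.kind {
                ChunkStoreDiffKind::Addition => self.additions += 1,
                ChunkStoreDiffKind::Deletion => self.deletions += 1,
            }
        }
    }
}
```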
#[derive(thiserror::Error, Debug)] pub enum Error { - #[error("The incoming data was inconsistent: {0}")] - DataRead(#[from] re_log_types::DataReadError), - - #[error("Error with one the underlying data table: {0}")] - DataTable(#[from] DataTableError), - #[error(transparent)] - Write(#[from] re_data_store::WriteError), + Write(#[from] re_chunk_store::ChunkStoreError), #[error(transparent)] - DataRow(#[from] re_log_types::DataRowError), + Chunk(#[from] re_chunk::ChunkError), } pub type Result = std::result::Result; diff --git a/crates/re_entity_db/src/store_bundle.rs b/crates/re_entity_db/src/store_bundle.rs index fd25eee34650..60217655881d 100644 --- a/crates/re_entity_db/src/store_bundle.rs +++ b/crates/re_entity_db/src/store_bundle.rs @@ -10,7 +10,7 @@ pub enum StoreLoadError { Decode(#[from] re_log_encoding::decoder::DecodeError), #[error(transparent)] - DataStore(#[from] crate::Error), + ChunkStore(#[from] crate::Error), } /// Stores many [`EntityDb`]s of recordings and blueprints. @@ -99,7 +99,7 @@ impl StoreBundle { re_log::debug!("Creating a new blueprint {id}"); blueprint_db.set_store_info(re_log_types::SetStoreInfo { - row_id: re_log_types::RowId::new(), + row_id: *re_chunk::RowId::new(), info: re_log_types::StoreInfo { application_id: id.as_str().into(), store_id: id.clone(), diff --git a/crates/re_entity_db/src/time_histogram_per_timeline.rs b/crates/re_entity_db/src/time_histogram_per_timeline.rs index c2452e2bc406..b26e45749854 100644 --- a/crates/re_entity_db/src/time_histogram_per_timeline.rs +++ b/crates/re_entity_db/src/time_histogram_per_timeline.rs @@ -1,7 +1,7 @@ use std::collections::BTreeMap; -use re_data_store::{StoreEvent, StoreSubscriber}; -use re_log_types::{TimeInt, TimePoint, Timeline}; +use re_chunk_store::{ChunkStoreEvent, ChunkStoreSubscriber}; +use re_log_types::Timeline; // --- @@ -61,8 +61,8 @@ impl TimeHistogramPerTimeline { self.times.values().map(|hist| hist.total_count()).sum() } - pub fn add(&mut self, times: &[(Timeline, TimeInt)], n: u32) { - if times.is_empty() { + pub fn add(&mut self, times_per_timeline: &[(Timeline, &[i64])], n: u32) { + if times_per_timeline.is_empty() { self.num_static_messages = self .num_static_messages .checked_add(n as u64) @@ -75,17 +75,17 @@ impl TimeHistogramPerTimeline { u64::MAX }); } else { - for &(timeline, time) in times { - self.times - .entry(timeline) - .or_default() - .increment(time.as_i64(), n); + for &(timeline, times) in times_per_timeline { + let histogram = self.times.entry(timeline).or_default(); + for &time in times { + histogram.increment(time, n); + } } } } - pub fn remove(&mut self, timepoint: &TimePoint, n: u32) { - if timepoint.is_static() { + pub fn remove(&mut self, times_per_timeline: &[(Timeline, &[i64])], n: u32) { + if times_per_timeline.is_empty() { self.num_static_messages = self .num_static_messages .checked_sub(n as u64) @@ -99,20 +99,22 @@ impl TimeHistogramPerTimeline { u64::MIN }); } else { - for (timeline, time_value) in timepoint.iter() { - let hist = self.times.entry(*timeline).or_default(); - hist.decrement(time_value.as_i64(), n); - if hist.is_empty() { - self.times.remove(timeline); + for &(timeline, times) in times_per_timeline { + let histogram = self.times.entry(timeline).or_default(); + for &time in times { + histogram.decrement(time, n); + } + if histogram.is_empty() { + self.times.remove(&timeline); } } } } } -// NOTE: This is only to let people know that this is in fact a [`StoreSubscriber`], so they A) don't try +// NOTE: This is only to let people know that this is in 
fact a [`ChunkStoreSubscriber`], so they A) don't try // to implement it on their own and B) don't try to register it. -impl StoreSubscriber for TimeHistogramPerTimeline { +impl ChunkStoreSubscriber for TimeHistogramPerTimeline { #[inline] fn name(&self) -> String { "rerun.store_subscriber.TimeHistogramPerTimeline".into() @@ -129,7 +131,7 @@ impl StoreSubscriber for TimeHistogramPerTimeline { } #[allow(clippy::unimplemented)] - fn on_events(&mut self, _events: &[StoreEvent]) { + fn on_events(&mut self, _events: &[ChunkStoreEvent]) { unimplemented!( r"TimeHistogramPerTimeline view is maintained as a sub-view of `EntityTree`", ); diff --git a/crates/re_entity_db/src/times_per_timeline.rs b/crates/re_entity_db/src/times_per_timeline.rs index 0687b33ffd49..ef94bb8425c4 100644 --- a/crates/re_entity_db/src/times_per_timeline.rs +++ b/crates/re_entity_db/src/times_per_timeline.rs @@ -1,13 +1,13 @@ use std::collections::BTreeMap; -use re_data_store::{StoreEvent, StoreSubscriber}; +use re_chunk_store::{ChunkStoreEvent, ChunkStoreSubscriber}; use re_log_types::{TimeInt, Timeline}; // --- pub type TimeCounts = BTreeMap; -/// A [`StoreSubscriber`] that keeps track of all unique timestamps on each [`Timeline`]. +/// A [`ChunkStoreSubscriber`] that keeps track of all unique timestamps on each [`Timeline`]. pub struct TimesPerTimeline(BTreeMap); impl std::ops::Deref for TimesPerTimeline { @@ -33,7 +33,7 @@ impl Default for TimesPerTimeline { } } -impl StoreSubscriber for TimesPerTimeline { +impl ChunkStoreSubscriber for TimesPerTimeline { #[inline] fn name(&self) -> String { "rerun.store_subscriber.TimesPerTimeline".into() @@ -50,42 +50,45 @@ impl StoreSubscriber for TimesPerTimeline { } #[inline] - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { re_tracing::profile_function!(format!("num_events={}", events.len())); for event in events { - for &(timeline, time) in &event.times { + for (&timeline, time_chunk) in event.chunk.timelines() { let per_time = self.0.entry(timeline).or_default(); - let count = per_time.entry(time).or_default(); - - let delta = event.delta(); - - if delta < 0 { - *count = count.checked_sub(delta.unsigned_abs()).unwrap_or_else(|| { - re_log::debug!( - store_id = %event.store_id, - entity_path = %event.diff.entity_path, - current = count, - removed = delta.unsigned_abs(), - "book keeping underflowed" - ); - u64::MIN - }); - } else { - *count = count.checked_add(delta.unsigned_abs()).unwrap_or_else(|| { - re_log::debug!( - store_id = %event.store_id, - entity_path = %event.diff.entity_path, - current = count, - removed = delta.unsigned_abs(), - "book keeping overflowed" - ); - u64::MAX - }); - } - if *count == 0 { - per_time.remove(&time); + for time in time_chunk.times() { + let count = per_time.entry(time).or_default(); + + let delta = event.delta(); + + if delta < 0 { + *count = count.checked_sub(delta.unsigned_abs()).unwrap_or_else(|| { + re_log::debug!( + store_id = %event.store_id, + entity_path = %event.chunk.entity_path(), + current = count, + removed = delta.unsigned_abs(), + "book keeping underflowed" + ); + u64::MIN + }); + } else { + *count = count.checked_add(delta.unsigned_abs()).unwrap_or_else(|| { + re_log::debug!( + store_id = %event.store_id, + entity_path = %event.chunk.entity_path(), + current = count, + removed = delta.unsigned_abs(), + "book keeping overflowed" + ); + u64::MAX + }); + } + + if *count == 0 { + per_time.remove(&time); + } } } } diff --git a/crates/re_entity_db/src/versioned_instance_path.rs 
b/crates/re_entity_db/src/versioned_instance_path.rs index e7dc24e4a0a8..e376e626b7fd 100644 --- a/crates/re_entity_db/src/versioned_instance_path.rs +++ b/crates/re_entity_db/src/versioned_instance_path.rs @@ -1,6 +1,6 @@ use std::hash::Hash; -use re_log_types::RowId; +use re_chunk::RowId; use crate::{InstancePath, InstancePathHash}; diff --git a/crates/re_entity_db/tests/clear.rs b/crates/re_entity_db/tests/clear.rs index 9a020084d9c6..5f3e32f45745 100644 --- a/crates/re_entity_db/tests/clear.rs +++ b/crates/re_entity_db/tests/clear.rs @@ -1,11 +1,14 @@ // https://github.com/rust-lang/rust-clippy/issues/10011 #![cfg(test)] -use re_data_store::LatestAtQuery; +use std::sync::Arc; + +use re_chunk::{Chunk, RowId}; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityDb; use re_log_types::{ example_components::{MyColor, MyIndex, MyPoint}, - DataRow, EntityPath, RowId, StoreId, TimeInt, TimePoint, Timeline, + EntityPath, StoreId, TimeInt, TimePoint, Timeline, }; use re_query::PromiseResolver; use re_types_core::{archetypes::Clear, components::ClearIsRecursive, AsComponents}; @@ -53,14 +56,11 @@ fn clears() -> anyhow::Result<()> { let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); let point = MyPoint::new(1.0, 2.0); let color = MyColor::from(0xFF0000FF); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_parent.clone(), - [&[point] as _, &[color] as _], - )?; + let chunk = Chunk::builder(entity_path_parent.clone()) + .with_component_batches(row_id, timepoint, [&[point] as _, &[color] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -80,14 +80,11 @@ fn clears() -> anyhow::Result<()> { let row_id = RowId::new(); let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); let point = MyPoint::new(42.0, 43.0); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_child1.clone(), - [&[point] as _], - )?; + let chunk = Chunk::builder(entity_path_child1.clone()) + .with_component_batches(row_id, timepoint, [&[point] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -104,14 +101,11 @@ fn clears() -> anyhow::Result<()> { let row_id = RowId::new(); let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); let color = MyColor::from(0x00AA00DD); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_child2.clone(), - [&[color] as _], - )?; + let chunk = Chunk::builder(entity_path_child2.clone()) + .with_component_batches(row_id, timepoint, [&[color] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -130,14 +124,15 @@ fn clears() -> anyhow::Result<()> { let row_id = RowId::new(); let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); let clear = Clear::flat(); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_parent.clone(), - clear.as_component_batches().iter().map(|b| b.as_ref()), - )?; + let chunk = Chunk::builder(entity_path_parent.clone()) + .with_component_batches( + row_id, + timepoint, + clear.as_component_batches().iter().map(|b| b.as_ref()), + ) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -167,14 +162,15 @@ fn clears() -> anyhow::Result<()> { let row_id = RowId::new(); let timepoint = 
TimePoint::from_iter([(timeline_frame, 10)]); let clear = Clear::recursive(); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_parent.clone(), - clear.as_component_batches().iter().map(|b| b.as_ref()), - )?; + let chunk = Chunk::builder(entity_path_parent.clone()) + .with_component_batches( + row_id, + timepoint, + clear.as_component_batches().iter().map(|b| b.as_ref()), + ) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -203,14 +199,11 @@ fn clears() -> anyhow::Result<()> { let row_id = RowId::new(); let timepoint = TimePoint::from_iter([(timeline_frame, 9)]); let instance = MyIndex(0); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_parent.clone(), - [&[instance] as _], - )?; + let chunk = Chunk::builder(entity_path_parent.clone()) + .with_component_batches(row_id, timepoint, [&[instance] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 9); @@ -234,14 +227,11 @@ fn clears() -> anyhow::Result<()> { let timepoint = TimePoint::from_iter([(timeline_frame, 9)]); let point = MyPoint::new(42.0, 43.0); let color = MyColor::from(0xBBBBBBBB); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_child1.clone(), - [&[point] as _, &[color] as _], - )?; + let chunk = Chunk::builder(entity_path_child1.clone()) + .with_component_batches(row_id, timepoint, [&[point] as _, &[color] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 9); @@ -270,14 +260,11 @@ fn clears() -> anyhow::Result<()> { let timepoint = TimePoint::from_iter([(timeline_frame, 9)]); let color = MyColor::from(0x00AA00DD); let point = MyPoint::new(66.0, 666.0); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_child2.clone(), - [&[color] as _, &[point] as _], - )?; + let chunk = Chunk::builder(entity_path_child2.clone()) + .with_component_batches(row_id, timepoint, [&[color] as _, &[point] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 9); @@ -304,14 +291,11 @@ fn clears() -> anyhow::Result<()> { let row_id = RowId::new(); let timepoint = TimePoint::from_iter([(timeline_frame, 9)]); let color = MyColor::from(0x00AA00DD); - let row = DataRow::from_component_batches( - row_id, - timepoint, - entity_path_grandchild.clone(), - [&[color] as _], - )?; + let chunk = Chunk::builder(entity_path_grandchild.clone()) + .with_component_batches(row_id, timepoint, [&[color] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 9); @@ -349,14 +333,11 @@ fn clears_respect_index_order() -> anyhow::Result<()> { let timepoint = TimePoint::from_iter([(timeline_frame, 10)]); let point = MyPoint::new(1.0, 2.0); - let row = DataRow::from_component_batches( - row_id2, - timepoint.clone(), - entity_path.clone(), - [&[point] as _], - )?; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batches(row_id2, timepoint.clone(), [&[point] as _]) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -366,14 +347,15 @@ fn clears_respect_index_order() -> anyhow::Result<()> { } let clear = Clear::recursive(); - let row = DataRow::from_component_batches( - row_id1, // 
older row id! - timepoint.clone(), - entity_path.clone(), - clear.as_component_batches().iter().map(|b| b.as_ref()), - )?; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id1, // older row id! + timepoint.clone(), + clear.as_component_batches().iter().map(|b| b.as_ref()), + ) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -389,14 +371,15 @@ fn clears_respect_index_order() -> anyhow::Result<()> { } let clear = Clear::recursive(); - let row = DataRow::from_component_batches( - row_id3, // newer row id! - timepoint, - entity_path.clone(), - clear.as_component_batches().iter().map(|b| b.as_ref()), - )?; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batches( + row_id3, // newer row id! + timepoint.clone(), + clear.as_component_batches().iter().map(|b| b.as_ref()), + ) + .build()?; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; { let query = LatestAtQuery::new(timeline_frame, 11); @@ -414,8 +397,6 @@ fn clears_respect_index_order() -> anyhow::Result<()> { #[test] fn clear_and_gc() -> anyhow::Result<()> { - use re_data_store::DataStoreStats; - if true { // TODO(#6552): Keeping this around for now so we don't forget about it, but this cannot work with // read-time clears. @@ -437,37 +418,36 @@ fn clear_and_gc() -> anyhow::Result<()> { let point = MyPoint::new(1.0, 2.0); - let row = DataRow::from_component_batches( - RowId::new(), - timepoint.clone(), - entity_path.clone(), - [&[point] as _], - )?; - - db.add_data_row(row)?; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(RowId::new(), timepoint.clone(), &[point] as _) + .build()?; + db.add_chunk(&Arc::new(chunk))?; + eprintln!("{}", db.store()); db.gc_everything_but_the_latest_row_on_non_default_timelines(); - let stats = DataStoreStats::from_store(db.store()); - assert_eq!(stats.temporal.num_rows, 1); + let stats = db.store().stats(); + assert_eq!(stats.temporal_chunks.total_num_rows, 1); - let clear = DataRow::from_component_batches( - RowId::new(), - timepoint.clone(), - entity_path.clone(), - Clear::recursive() - .as_component_batches() - .iter() - .map(|b| b.as_ref()), - )?; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batches( + RowId::new(), + timepoint.clone(), + Clear::recursive() + .as_component_batches() + .iter() + .map(|b| b.as_ref()), + ) + .build()?; - db.add_data_row(clear)?; + db.add_chunk(&Arc::new(chunk))?; + eprintln!("{}", db.store()); db.gc_everything_but_the_latest_row_on_non_default_timelines(); // No rows should remain because the table should have been purged - let stats = DataStoreStats::from_store(db.store()); - assert_eq!(stats.temporal.num_rows, 0); + let stats = db.store().stats(); + assert_eq!(stats.temporal_chunks.total_num_rows, 0); // EntityTree should be empty again when we end since everything was GC'd assert_eq!(db.tree().num_children_and_fields(), 0); diff --git a/crates/re_entity_db/tests/time_histograms.rs b/crates/re_entity_db/tests/time_histograms.rs index d7234eeb46db..1712e627a7ed 100644 --- a/crates/re_entity_db/tests/time_histograms.rs +++ b/crates/re_entity_db/tests/time_histograms.rs @@ -1,14 +1,15 @@ // https://github.com/rust-lang/rust-clippy/issues/10011 #![cfg(test)] -use std::collections::BTreeSet; +use std::{collections::BTreeSet, sync::Arc}; -use re_data_store::GarbageCollectionOptions; +use re_chunk::{Chunk, ChunkId, RowId}; +use re_chunk_store::GarbageCollectionOptions; use 
re_entity_db::EntityDb; use re_int_histogram::RangeI64; use re_log_types::{ example_components::{MyColor, MyIndex, MyPoint}, - DataRow, EntityPath, RowId, StoreId, TimeInt, TimePoint, Timeline, + EntityPath, StoreId, TimeInt, TimePoint, Timeline, }; use re_types_core::{components::ClearIsRecursive, ComponentName, Loggable}; @@ -30,18 +31,19 @@ fn time_histograms() -> anyhow::Result<()> { // Single top-level entity, explicitly logged `MyIndex`s. { - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::from_iter([ - (timeline_frame, 42), // - (timeline_other, 666), // - (timeline_yet_another, 1), // - ]), - entity_parent.clone(), - [&MyIndex::from_iter(0..10) as _], - )?; - - db.add_data_row(row)?; + let chunk = Chunk::builder(entity_parent.clone()) + .with_component_batches( + RowId::new(), + TimePoint::from_iter([ + (timeline_frame, 42), // + (timeline_other, 666), // + (timeline_yet_another, 1), // + ]), + [&MyIndex::from_iter(0..10) as _], + ) + .build()?; + + db.add_chunk(&Arc::new(chunk))?; // times per timeline assert_times_per_timeline( @@ -80,24 +82,26 @@ fn time_histograms() -> anyhow::Result<()> { // Grand-child, multiple components, auto-generated `MyIndex`s. { - let row = { + let chunk = { let num_instances = 3; let points: Vec<_> = (0..num_instances) .map(|i| MyPoint::new(0.0, i as f32)) .collect(); let colors = vec![MyColor::from(0xFF0000FF)]; - DataRow::from_component_batches( - RowId::new(), - TimePoint::from_iter([ - (timeline_frame, 42), // - (timeline_yet_another, 1), // - ]), - entity_grandchild.clone(), - [&points as _, &colors as _], - )? + Chunk::builder(entity_grandchild.clone()) + .with_component_batches( + RowId::new(), + TimePoint::from_iter([ + (timeline_frame, 42), // + (timeline_yet_another, 1), // + ]), + [&points as _, &colors as _], + ) + .build()? }; + let chunk = Arc::new(chunk); - db.add_data_row(row.clone())?; + db.add_chunk(&chunk)?; assert_times_per_timeline( &db, @@ -181,7 +185,7 @@ fn time_histograms() -> anyhow::Result<()> { ] as [(_, Option<&[_]>); 2], ); - db.add_data_row(row)?; // same row a second time! + db.add_chunk(&Arc::new(chunk.clone_as(ChunkId::new(), RowId::new())))?; // same chunk a second time! // times per timeline assert_times_per_timeline( @@ -234,21 +238,22 @@ fn time_histograms() -> anyhow::Result<()> { // Grand-child, timeless additions. { - let row = { + let chunk = { let num_instances = 6; let colors = vec![MyColor::from(0x00DD00FF); num_instances]; - DataRow::from_component_batches( - RowId::new(), - TimePoint::default(), - "entity".into(), - [ - &MyIndex::from_iter(0..num_instances as _) as _, - &colors as _, - ], - )? + Chunk::builder("entity".into()) + .with_component_batches( + RowId::new(), + TimePoint::default(), + [ + &MyIndex::from_iter(0..num_instances as _) as _, + &colors as _, + ], + ) + .build()? }; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; // times per timeline assert_times_per_timeline( @@ -321,29 +326,30 @@ fn time_histograms() -> anyhow::Result<()> { // Completely unrelated entity. { - let row = { + let chunk = { let num_instances = 3; let points: Vec<_> = (0..num_instances) .map(|i| MyPoint::new(0.0, i as f32)) .collect(); let colors = vec![MyColor::from(0xFF0000FF)]; - DataRow::from_component_batches( - RowId::new(), - TimePoint::from_iter([ - (timeline_frame, 1234), // - (timeline_other, 1235), // - (timeline_yet_another, 1236), // - ]), - entity_unrelated.clone(), - [ - &MyIndex::from_iter(0..num_instances) as _, - &points as _, - &colors as _, - ], - )? 
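In the `time_histograms` test above, re-logging "the same row a second time" becomes re-logging the same payload as a fresh chunk, stamped with new chunk and row ids via `clone_as`. A minimal sketch of that pattern, assuming `Chunk::clone_as(ChunkId, RowId)` only as it is used in the test:

```rust
use std::sync::Arc;

use re_chunk::{Chunk, ChunkId, RowId};

/// Re-log the same data as a brand-new chunk by giving it fresh ids,
/// mirroring `chunk.clone_as(ChunkId::new(), RowId::new())` in the test above.
fn duplicate_chunk(chunk: &Chunk) -> Arc<Chunk> {
    Arc::new(chunk.clone_as(ChunkId::new(), RowId::new()))
}
```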
+ Chunk::builder(entity_unrelated.clone()) + .with_component_batches( + RowId::new(), + TimePoint::from_iter([ + (timeline_frame, 1234), // + (timeline_other, 1235), // + (timeline_yet_another, 1236), // + ]), + [ + &MyIndex::from_iter(0..num_instances) as _, + &points as _, + &colors as _, + ], + ) + .build()? }; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; assert_times_per_timeline( &db, @@ -452,18 +458,19 @@ fn time_histograms() -> anyhow::Result<()> { // Immediate clear. { - let row = { - DataRow::from_component_batches( - RowId::new(), - TimePoint::from_iter([ - (timeline_frame, 1000), // - ]), - entity_parent.clone(), - [&[ClearIsRecursive(true)] as _], - )? + let chunk = { + Chunk::builder(entity_parent.clone()) + .with_component_batches( + RowId::new(), + TimePoint::from_iter([ + (timeline_frame, 1000), // + ]), + [&[ClearIsRecursive(true)] as _], + ) + .build()? }; - db.add_data_row(row)?; + db.add_chunk(&Arc::new(chunk))?; assert_times_per_timeline( &db, diff --git a/crates/re_log_encoding/Cargo.toml b/crates/re_log_encoding/Cargo.toml index 58ffc30fcddd..721b42462c08 100644 --- a/crates/re_log_encoding/Cargo.toml +++ b/crates/re_log_encoding/Cargo.toml @@ -43,6 +43,7 @@ stream_from_http = [ # Rerun: re_build_info.workspace = true +re_chunk.workspace = true re_log_types.workspace = true re_log.workspace = true re_smart_channel.workspace = true diff --git a/crates/re_log_encoding/benches/msg_encode_benchmark.rs b/crates/re_log_encoding/benches/msg_encode_benchmark.rs index 6a8c768ae705..7ff51a71a494 100644 --- a/crates/re_log_encoding/benches/msg_encode_benchmark.rs +++ b/crates/re_log_encoding/benches/msg_encode_benchmark.rs @@ -7,10 +7,11 @@ compile_error!("msg_encode_benchmark requires 'decoder' and 'encoder' features." #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; +use re_chunk::{Chunk, RowId, TransportChunk}; use re_log_types::{ entity_path, example_components::{MyColor, MyPoint}, - DataRow, DataTable, LogMsg, RowId, StoreId, StoreKind, TableId, TimeInt, TimeType, Timeline, + LogMsg, StoreId, StoreKind, TimeInt, TimeType, Timeline, }; use criterion::{criterion_group, criterion_main, Criterion}; @@ -54,19 +55,23 @@ fn decode_log_msgs(mut bytes: &[u8]) -> Vec { messages } -fn generate_messages(store_id: &StoreId, tables: &[DataTable]) -> Vec { - tables +fn generate_messages(store_id: &StoreId, chunks: &[Chunk]) -> Vec { + chunks .iter() - .map(|table| LogMsg::ArrowMsg(store_id.clone(), table.to_arrow_msg().unwrap())) + .map(|chunk| LogMsg::ArrowMsg(store_id.clone(), chunk.to_arrow_msg().unwrap())) .collect() } -fn decode_tables(messages: &[LogMsg]) -> Vec { +fn decode_chunks(messages: &[LogMsg]) -> Vec { messages .iter() .map(|log_msg| { if let LogMsg::ArrowMsg(_, arrow_msg) = log_msg { - DataTable::from_arrow_msg(arrow_msg).unwrap() + Chunk::from_transport(&TransportChunk { + schema: arrow_msg.schema.clone(), + data: arrow_msg.chunk.clone(), + }) + .unwrap() } else { unreachable!() } @@ -75,19 +80,20 @@ fn decode_tables(messages: &[LogMsg]) -> Vec { } fn mono_points_arrow(c: &mut Criterion) { - fn generate_tables() -> Vec { + fn generate_chunks() -> Vec { (0..NUM_POINTS) .map(|i| { - DataTable::from_rows( - TableId::ZERO, - [DataRow::from_cells2( + Chunk::builder(entity_path!("points", i.to_string())) + .with_component_batches( RowId::ZERO, - entity_path!("points", i.to_string()), [build_frame_nr(TimeInt::ZERO)], - (MyPoint::from_iter(0..1), MyColor::from_iter(0..1)), + [ + &MyPoint::from_iter(0..1) as _, + &MyColor::from_iter(0..1) 
as _, + ], ) - .unwrap()], - ) + .build() + .unwrap() }) .collect() } @@ -97,18 +103,18 @@ fn mono_points_arrow(c: &mut Criterion) { let mut group = c.benchmark_group("mono_points_arrow"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); group.bench_function("generate_message_bundles", |b| { - b.iter(generate_tables); + b.iter(generate_chunks); }); - let tables = generate_tables(); + let chunks = generate_chunks(); group.bench_function("generate_messages", |b| { - b.iter(|| generate_messages(&store_id, &tables)); + b.iter(|| generate_messages(&store_id, &chunks)); }); - let messages = generate_messages(&store_id, &tables); + let messages = generate_messages(&store_id, &chunks); group.bench_function("encode_log_msg", |b| { b.iter(|| encode_log_msgs(&messages)); }); group.bench_function("encode_total", |b| { - b.iter(|| encode_log_msgs(&generate_messages(&store_id, &generate_tables()))); + b.iter(|| encode_log_msgs(&generate_messages(&store_id, &generate_chunks()))); }); let encoded = encode_log_msgs(&messages); @@ -121,31 +127,31 @@ fn mono_points_arrow(c: &mut Criterion) { }); group.bench_function("decode_message_bundles", |b| { b.iter(|| { - let tables = decode_tables(&messages); - assert_eq!(tables.len(), messages.len()); - tables + let chunks = decode_chunks(&messages); + assert_eq!(chunks.len(), messages.len()); + chunks }); }); group.bench_function("decode_total", |b| { - b.iter(|| decode_tables(&decode_log_msgs(&encoded))); + b.iter(|| decode_chunks(&decode_log_msgs(&encoded))); }); } } fn mono_points_arrow_batched(c: &mut Criterion) { - fn generate_table() -> DataTable { - DataTable::from_rows( - TableId::ZERO, - (0..NUM_POINTS).map(|i| { - DataRow::from_cells2( - RowId::ZERO, - entity_path!("points", i.to_string()), - [build_frame_nr(TimeInt::ZERO)], - (MyPoint::from_iter(0..1), MyColor::from_iter(0..1)), - ) - .unwrap() - }), - ) + fn generate_chunk() -> Chunk { + let mut builder = Chunk::builder("points".into()); + for _ in 0..NUM_POINTS { + builder = builder.with_component_batches( + RowId::ZERO, + [build_frame_nr(TimeInt::ZERO)], + [ + &MyPoint::from_iter(0..1) as _, + &MyColor::from_iter(0..1) as _, + ], + ); + } + builder.build().unwrap() } { @@ -153,18 +159,18 @@ fn mono_points_arrow_batched(c: &mut Criterion) { let mut group = c.benchmark_group("mono_points_arrow_batched"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); group.bench_function("generate_message_bundles", |b| { - b.iter(generate_table); + b.iter(generate_chunk); }); - let tables = [generate_table()]; + let chunks = [generate_chunk()]; group.bench_function("generate_messages", |b| { - b.iter(|| generate_messages(&store_id, &tables)); + b.iter(|| generate_messages(&store_id, &chunks)); }); - let messages = generate_messages(&store_id, &tables); + let messages = generate_messages(&store_id, &chunks); group.bench_function("encode_log_msg", |b| { b.iter(|| encode_log_msgs(&messages)); }); group.bench_function("encode_total", |b| { - b.iter(|| encode_log_msgs(&generate_messages(&store_id, &[generate_table()]))); + b.iter(|| encode_log_msgs(&generate_messages(&store_id, &[generate_chunk()]))); }); let encoded = encode_log_msgs(&messages); @@ -177,32 +183,30 @@ fn mono_points_arrow_batched(c: &mut Criterion) { }); group.bench_function("decode_message_bundles", |b| { b.iter(|| { - let bundles = decode_tables(&messages); + let bundles = decode_chunks(&messages); assert_eq!(bundles.len(), messages.len()); bundles }); }); group.bench_function("decode_total", |b| { - b.iter(|| 
decode_tables(&decode_log_msgs(&encoded))); + b.iter(|| decode_chunks(&decode_log_msgs(&encoded))); }); } } fn batch_points_arrow(c: &mut Criterion) { - fn generate_tables() -> Vec { - vec![DataTable::from_rows( - TableId::ZERO, - [DataRow::from_cells2( + fn generate_chunks() -> Vec { + vec![Chunk::builder(entity_path!("points")) + .with_component_batches( RowId::ZERO, - entity_path!("points"), [build_frame_nr(TimeInt::ZERO)], - ( - MyPoint::from_iter(0..NUM_POINTS as u32), - MyColor::from_iter(0..NUM_POINTS as u32), - ), + [ + &MyPoint::from_iter(0..NUM_POINTS as u32) as _, + &MyColor::from_iter(0..NUM_POINTS as u32) as _, + ], ) - .unwrap()], - )] + .build() + .unwrap()] } { @@ -210,18 +214,18 @@ fn batch_points_arrow(c: &mut Criterion) { let mut group = c.benchmark_group("batch_points_arrow"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); group.bench_function("generate_message_bundles", |b| { - b.iter(generate_tables); + b.iter(generate_chunks); }); - let tables = generate_tables(); + let chunks = generate_chunks(); group.bench_function("generate_messages", |b| { - b.iter(|| generate_messages(&store_id, &tables)); + b.iter(|| generate_messages(&store_id, &chunks)); }); - let messages = generate_messages(&store_id, &tables); + let messages = generate_messages(&store_id, &chunks); group.bench_function("encode_log_msg", |b| { b.iter(|| encode_log_msgs(&messages)); }); group.bench_function("encode_total", |b| { - b.iter(|| encode_log_msgs(&generate_messages(&store_id, &generate_tables()))); + b.iter(|| encode_log_msgs(&generate_messages(&store_id, &generate_chunks()))); }); let encoded = encode_log_msgs(&messages); @@ -234,13 +238,13 @@ fn batch_points_arrow(c: &mut Criterion) { }); group.bench_function("decode_message_bundles", |b| { b.iter(|| { - let tables = decode_tables(&messages); - assert_eq!(tables.len(), messages.len()); - tables + let chunks = decode_chunks(&messages); + assert_eq!(chunks.len(), messages.len()); + chunks }); }); group.bench_function("decode_total", |b| { - b.iter(|| decode_tables(&decode_log_msgs(&encoded))); + b.iter(|| decode_chunks(&decode_log_msgs(&encoded))); }); } } diff --git a/crates/re_log_encoding/src/decoder/mod.rs b/crates/re_log_encoding/src/decoder/mod.rs index 8178f8c3e6f6..6f5182110630 100644 --- a/crates/re_log_encoding/src/decoder/mod.rs +++ b/crates/re_log_encoding/src/decoder/mod.rs @@ -237,15 +237,15 @@ impl Iterator for Decoder { #[cfg(all(feature = "decoder", feature = "encoder"))] #[test] fn test_encode_decode() { + use re_chunk::RowId; use re_log_types::{ - ApplicationId, LogMsg, RowId, SetStoreInfo, StoreId, StoreInfo, StoreKind, StoreSource, - Time, + ApplicationId, LogMsg, SetStoreInfo, StoreId, StoreInfo, StoreKind, StoreSource, Time, }; let rrd_version = CrateVersion::LOCAL; let messages = vec![LogMsg::SetStoreInfo(SetStoreInfo { - row_id: RowId::new(), + row_id: *RowId::new(), info: StoreInfo { application_id: ApplicationId("test".to_owned()), store_id: StoreId::random(StoreKind::Recording), diff --git a/crates/re_log_encoding/src/decoder/stream.rs b/crates/re_log_encoding/src/decoder/stream.rs index dfc8b8297006..d2c41a592345 100644 --- a/crates/re_log_encoding/src/decoder/stream.rs +++ b/crates/re_log_encoding/src/decoder/stream.rs @@ -232,14 +232,10 @@ fn is_chunk_empty(chunk: &Chunk) -> bool { #[cfg(test)] mod tests { - use re_log_types::ApplicationId; - use re_log_types::RowId; - use re_log_types::SetStoreInfo; - use re_log_types::StoreId; - use re_log_types::StoreInfo; - use re_log_types::StoreKind; - use 
re_log_types::StoreSource; - use re_log_types::Time; + use re_chunk::RowId; + use re_log_types::{ + ApplicationId, SetStoreInfo, StoreId, StoreInfo, StoreKind, StoreSource, Time, + }; use crate::encoder::Encoder; use crate::EncodingOptions; @@ -248,7 +244,7 @@ mod tests { fn fake_log_msg() -> LogMsg { LogMsg::SetStoreInfo(SetStoreInfo { - row_id: RowId::ZERO, + row_id: *RowId::ZERO, info: StoreInfo { application_id: ApplicationId::unknown(), store_id: StoreId::from_string(StoreKind::Recording, "test".into()), diff --git a/crates/re_log_types/src/arrow_msg.rs b/crates/re_log_types/src/arrow_msg.rs index d428c97b7042..09fdc9eeac7b 100644 --- a/crates/re_log_types/src/arrow_msg.rs +++ b/crates/re_log_types/src/arrow_msg.rs @@ -1,15 +1,17 @@ //! [`ArrowMsg`] is the [`crate::LogMsg`] sub-type containing an Arrow payload. //! //! We have custom implementations of [`serde::Serialize`] and [`serde::Deserialize`] that wraps -//! the inner Arrow serialization of [`Schema`] and [`Chunk`]. +//! the inner Arrow serialization of [`ArrowSchema`] and [`ArrowChunk`]. use std::sync::Arc; -use crate::{TableId, TimePoint}; -use arrow2::{array::Array, chunk::Chunk, datatypes::Schema}; +use crate::TimePoint; +use arrow2::{ + array::Array as ArrowArray, chunk::Chunk as ArrowChunk, datatypes::Schema as ArrowSchema, +}; /// An arbitrary callback to be run when an [`ArrowMsg`], and more specifically the -/// Arrow [`Chunk`] within it, goes out of scope. +/// [`ArrowChunk`] within it, goes out of scope. /// /// If the [`ArrowMsg`] has been cloned in a bunch of places, the callback will run for each and /// every instance. @@ -18,10 +20,10 @@ use arrow2::{array::Array, chunk::Chunk, datatypes::Schema}; // TODO(#6412): probably don't need this anymore. #[allow(clippy::type_complexity)] #[derive(Clone)] -pub struct ArrowChunkReleaseCallback(Arc>) + Send + Sync>); +pub struct ArrowChunkReleaseCallback(Arc>) + Send + Sync>); impl std::ops::Deref for ArrowChunkReleaseCallback { - type Target = dyn Fn(Chunk>) + Send + Sync; + type Target = dyn Fn(ArrowChunk>) + Send + Sync; #[inline] fn deref(&self) -> &Self::Target { @@ -31,7 +33,7 @@ impl std::ops::Deref for ArrowChunkReleaseCallback { impl From for ArrowChunkReleaseCallback where - F: Fn(Chunk>) + Send + Sync + 'static, + F: Fn(ArrowChunk>) + Send + Sync + 'static, { #[inline] fn from(f: F) -> Self { @@ -68,8 +70,8 @@ impl std::fmt::Debug for ArrowChunkReleaseCallback { #[derive(Clone, Debug, PartialEq)] #[must_use] pub struct ArrowMsg { - /// Unique identifier for the [`crate::DataTable`] in this message. - pub table_id: TableId, + /// Unique identifier for the chunk in this message. + pub chunk_id: re_tuid::Tuid, /// The maximum values for all timelines across the entire batch of data. /// @@ -78,10 +80,10 @@ pub struct ArrowMsg { pub timepoint_max: TimePoint, /// Schema for all control & data columns. - pub schema: Schema, + pub schema: ArrowSchema, /// Data for all control & data columns. 
- pub chunk: Chunk>, + pub chunk: ArrowChunk>, // pub on_release: Option>, pub on_release: Option, @@ -119,7 +121,7 @@ impl serde::Serialize for ArrowMsg { .map_err(|err| serde::ser::Error::custom(err.to_string()))?; let mut inner = serializer.serialize_tuple(3)?; - inner.serialize_element(&self.table_id)?; + inner.serialize_element(&self.chunk_id)?; inner.serialize_element(&self.timepoint_max)?; inner.serialize_element(&serde_bytes::ByteBuf::from(buf))?; inner.end() @@ -149,11 +151,11 @@ impl<'de> serde::Deserialize<'de> for ArrowMsg { { re_tracing::profile_scope!("ArrowMsg::deserialize"); - let table_id: Option = seq.next_element()?; + let table_id: Option = seq.next_element()?; let timepoint_max: Option = seq.next_element()?; let buf: Option = seq.next_element()?; - if let (Some(table_id), Some(timepoint_max), Some(buf)) = + if let (Some(chunk_id), Some(timepoint_max), Some(buf)) = (table_id, timepoint_max, buf) { let mut cursor = std::io::Cursor::new(buf); @@ -181,7 +183,7 @@ impl<'de> serde::Deserialize<'de> for ArrowMsg { .map_err(|err| serde::de::Error::custom(format!("Arrow error: {err}")))?; if chunks.is_empty() { - return Err(serde::de::Error::custom("No Chunk found in stream")); + return Err(serde::de::Error::custom("No ArrowChunk found in stream")); } if chunks.len() > 1 { return Err(serde::de::Error::custom(format!( @@ -193,7 +195,7 @@ impl<'de> serde::Deserialize<'de> for ArrowMsg { let chunk = chunks.into_iter().next().unwrap(); Ok(ArrowMsg { - table_id, + chunk_id, timepoint_max, schema, chunk, diff --git a/crates/re_log_types/src/data_cell.rs b/crates/re_log_types/src/data_cell.rs deleted file mode 100644 index 302e5ff902ff..000000000000 --- a/crates/re_log_types/src/data_cell.rs +++ /dev/null @@ -1,686 +0,0 @@ -use std::sync::Arc; - -use arrow2::datatypes::{DataType, Field, Metadata}; - -use re_types_core::{Component, ComponentBatch, ComponentName, DeserializationError, SizeBytes}; - -// --- - -#[derive(thiserror::Error, Debug)] -pub enum DataCellError { - #[error("Unsupported datatype: {0:?}")] - UnsupportedDatatype(arrow2::datatypes::DataType), - - #[error("Could not serialize/deserialize data to/from Arrow: {0}")] - Arrow(#[from] arrow2::error::Error), - - #[error("Could not deserialize data from Arrow: {0}")] - LoggableDeserialize(#[from] re_types_core::DeserializationError), - - #[error("Could not serialize data from Arrow: {0}")] - LoggableSerialize(#[from] re_types_core::SerializationError), - - // Needed to handle TryFrom -> T - #[error("Infallible")] - Unreachable(#[from] std::convert::Infallible), -} - -pub type DataCellResult = ::std::result::Result; - -// --- - -/// A cell's worth of data, i.e. a uniform array of values for a given component type. -/// This is the leaf type in our data model. -/// -/// A `DataCell` can be constructed from either an iterable of native `Component`s or directly -/// from a slice of arrow data. -/// -/// Behind the scenes, a `DataCell` is backed by an erased arrow array living on the heap, which -/// is likely to point into a larger batch of contiguous memory that it shares with its peers. -/// Cloning a `DataCell` is thus cheap (shallow, ref-counted). -/// -/// ## Layout -/// -/// A cell is an array of component instances: `[C, C, C, …]`. 
-/// -/// Consider this example: -/// ```ignore -/// let points: &[MyPoint] = &[[10.0, 10.0].into(), [20.0, 20.0].into(), [30.0, 30.0].into()]; -/// let cell = DataCell::from(points); -/// // Or, alternatively: -/// let cell = DataCell::from_component::([[10.0, 10.0], [20.0, 20.0], [30.0, 30.0]]); -/// ``` -/// -/// The cell's datatype is now a `StructArray`: -/// ```ignore -/// Struct([ -/// Field { name: "x", data_type: Float32, is_nullable: false, metadata: {} }, -/// Field { name: "y", data_type: Float32, is_nullable: false, metadata: {} }, -/// ]) -/// ``` -/// -/// Or, visualized as a cell within a larger table: -/// ```text -/// ┌──────────────────────────────────────────────────┐ -/// │ rerun.components.Point2D │ -/// ╞══════════════════════════════════════════════════╡ -/// │ [{x: 10, y: 10}, {x: 20, y: 20}, {x: 30, y: 30}] │ -/// └──────────────────────────────────────────────────┘ -/// ``` -/// -/// ## Example -/// -/// ```rust -/// # use itertools::Itertools as _; -/// # -/// # use re_log_types::DataCell; -/// # use re_log_types::example_components::MyPoint; -/// # use re_types_core::Loggable as _; -/// # -/// let points: &[MyPoint] = &[ -/// MyPoint { x: 10.0, y: 10.0 }, -/// MyPoint { x: 20.0, y: 20.0 }, -/// MyPoint { x: 30.0, y: 30.0 }, -/// ]; -/// let _cell = DataCell::from(points); -/// -/// // Or, alternatively: -/// let cell = DataCell::from_component::([ -/// MyPoint { x: 10.0, y: 10.0 }, -/// MyPoint { x: 20.0, y: 20.0 }, -/// MyPoint { x: 30.0, y: 30.0 }, -/// ]); -/// -/// eprintln!("{:#?}", cell.datatype()); -/// eprintln!("{cell}"); -/// # -/// # assert_eq!(MyPoint::name(), cell.component_name()); -/// # assert_eq!(3, cell.num_instances()); -/// # assert_eq!(cell.datatype(), &MyPoint::arrow_datatype()); -/// # -/// # assert_eq!(points, cell.to_native().as_slice()); -/// ``` -/// -#[derive(Debug, Clone)] -pub struct DataCell { - /// While the arrow data is already refcounted, the contents of the `DataCell` still have to - /// be wrapped in an `Arc` to work around performance issues in `arrow2`. - /// - /// See [`DataCellInner`] for more information. - pub inner: Arc, -} - -impl DataCell { - #[inline] - pub fn as_ptr(&self) -> *const DataCellInner { - Arc::as_ptr(&self.inner) - } -} - -impl PartialEq for DataCell { - fn eq(&self, rhs: &Self) -> bool { - let Self { inner: lhs_inner } = self; - let Self { inner: rhs_inner } = rhs; - - // NOTE: Compare the inner pointers first, and only if they don't match actually do a full - // contents comparison. - // Arc normally handles this automatically if T implements `Eq`, but in our case - // `DataCellInner` cannot implement `Eq`. - // Still, the optimization is valid, and so here we are. - Arc::as_ptr(lhs_inner) == Arc::as_ptr(rhs_inner) || self.inner == rhs.inner - } -} - -/// The actual contents of a [`DataCell`]. -/// -/// Despite the fact that the arrow data is already refcounted, this has to live separately, behind -/// an `Arc`, to work around performance issues in `arrow2` that stem from its heavy use of nested -/// virtual calls. -/// -/// See #1746 for details. -#[derive(Debug, Clone)] -pub struct DataCellInner { - /// Name of the component type used in this cell. - // - // TODO(#1696): Store this within the datatype itself. - pub(crate) name: ComponentName, - - /// The pre-computed size of the cell (stack + heap) as well as its underlying arrow data, - /// in bytes. - /// - /// This is always zero unless [`Self::compute_size_bytes`] has been called, which is a very - /// costly operation. 
- pub(crate) size_bytes: u64, - - /// A uniformly typed list of values for the given component type: `[C, C, C, …]` - /// - /// Includes the data, its schema and probably soon the component metadata - /// (e.g. the `ComponentName`). - /// - /// Internally this is always stored as an erased arrow array to avoid bad surprises with - /// frequent boxing/unboxing down the line. - /// Internally, this is most likely a slice of another, larger array (batching!). - pub(crate) values: Box, -} - -impl PartialEq for DataCellInner { - #[inline] - fn eq(&self, rhs: &Self) -> bool { - let Self { - name, - size_bytes: _, // we ignore the size (it may be 0 = uncomputed) - values, - } = self; - - name == &rhs.name && values.eq(&rhs.values) - } -} - -// TODO(#1696): We shouldn't have to specify the component name separately, this should be -// part of the metadata by using an extension. -// TODO(#1696): Check that the array is indeed a leaf / component type when building a cell from an -// arrow payload. -impl DataCell { - /// Builds a new `DataCell` from a component batch. - #[inline] - pub fn from_component_batch( - batch: &dyn ComponentBatch, - ) -> re_types_core::SerializationResult { - batch - .to_arrow() - .map(|arrow| Self::from_arrow(batch.name(), arrow)) - } - - /// Builds a new `DataCell` from a uniform iterable of native component values. - /// - /// Fails if the given iterable cannot be serialized to arrow, which should never happen when - /// using Rerun's built-in components. - #[inline] - pub fn try_from_native<'a, C>( - values: impl IntoIterator>>, - ) -> DataCellResult - where - C: Component + Clone + 'a, - { - Ok(Self::from_arrow(C::name(), C::to_arrow(values)?)) - } - - /// Builds a new `DataCell` from a uniform iterable of native component values. - /// - /// Fails if the given iterable cannot be serialized to arrow, which should never happen when - /// using Rerun's built-in components. - #[inline] - pub fn try_from_native_sparse<'a, C>( - values: impl IntoIterator>>>, - ) -> DataCellResult - where - C: Component + Clone + 'a, - { - Ok(Self::from_arrow(C::name(), C::to_arrow_opt(values)?)) - } - - /// Builds a new `DataCell` from a uniform iterable of native component values. - /// - /// Panics if the given iterable cannot be serialized to arrow, which should never happen when - /// using Rerun's built-in components. - /// See [`Self::try_from_native`] for the fallible alternative. - #[inline] - pub fn from_native<'a, C>( - values: impl IntoIterator>>, - ) -> Self - where - C: Component + Clone + 'a, - { - // NOTE: see function description why it's okay here - #[allow(clippy::unwrap_used)] - Self::try_from_native(values).unwrap() - } - - /// Builds a new `DataCell` from a uniform iterable of native component values. - /// - /// Panics if the given iterable cannot be serialized to arrow, which should never happen when - /// using Rerun's built-in components. - /// See [`Self::try_from_native`] for the fallible alternative. - #[inline] - pub fn from_native_sparse<'a, C>( - values: impl IntoIterator>>>, - ) -> Self - where - C: Component + Clone + 'a, - { - // NOTE: see function description why it's okay here - #[allow(clippy::unwrap_used)] - Self::try_from_native_sparse(values).unwrap() - } - - /// Builds a cell from an iterable of items that can be turned into a [`Component`]. 
- #[inline] - pub fn from_component<'a, C>(values: impl IntoIterator>) -> Self - where - C: Component + Clone + 'a, - C: Into<::std::borrow::Cow<'a, C>>, - { - Self::from_native(values.into_iter().map(Into::into)) - } - - /// Builds a cell from an iterable of items that can be turned into a [`Component`]. - #[inline] - pub fn from_component_sparse<'a, C>( - values: impl IntoIterator>>, - ) -> Self - where - C: Component + Clone + 'a, - C: Into<::std::borrow::Cow<'a, C>>, - { - Self::from_native_sparse(values.into_iter().map(|value| value.map(Into::into))) - } - - /// Builds a new `DataCell` from an arrow array. - /// - /// Fails if the array is not a valid list of components. - #[inline] - #[allow(clippy::unnecessary_wraps)] // TODO(cmc): check that it is indeed a component datatype - pub fn try_from_arrow( - name: ComponentName, - values: Box, - ) -> DataCellResult { - Ok(Self { - inner: Arc::new(DataCellInner { - name, - size_bytes: 0, - values, - }), - }) - } - - /// Builds a new `DataCell` from an arrow array. - /// - /// Panics if the array is not a valid list of components. - /// See [`Self::try_from_arrow`] for the fallible alternative. - #[inline] - pub fn from_arrow(name: ComponentName, values: Box) -> Self { - // NOTE: see function description why it's okay here - #[allow(clippy::unwrap_used)] - Self::try_from_arrow(name, values).unwrap() - } - - // --- - - /// Builds an empty `DataCell` from a native component type. - #[inline] - pub fn from_native_empty() -> Self { - Self::from_arrow_empty(C::name(), C::arrow_field().data_type) - } - - /// Builds an empty `DataCell` from an arrow datatype. - /// - /// Fails if the datatype is not a valid component type. - #[inline] - #[allow(clippy::unnecessary_wraps)] // TODO(cmc): check that it is indeed a component datatype - pub fn try_from_arrow_empty( - name: ComponentName, - datatype: arrow2::datatypes::DataType, - ) -> DataCellResult { - let mut inner = DataCellInner { - name, - size_bytes: 0, - values: arrow2::array::new_empty_array(datatype), - }; - inner.compute_size_bytes(); - - Ok(Self { - inner: Arc::new(inner), - }) - } - - /// Builds an empty `DataCell` from an arrow datatype. - /// - /// Panics if the datatype is not a valid component type. - /// See [`Self::try_from_arrow_empty`] for a fallible alternative. - #[inline] - pub fn from_arrow_empty(name: ComponentName, datatype: arrow2::datatypes::DataType) -> Self { - // NOTE: see function description why it's okay here - #[allow(clippy::unwrap_used)] - Self::try_from_arrow_empty(name, datatype).unwrap() - } - - // --- - - /// Returns the contents of the cell as an arrow array (shallow clone). - /// - /// Avoid using raw arrow arrays unless you absolutely have to: prefer working directly with - /// `DataCell`s, `DataRow`s & `DataTable`s instead. - /// If you do use them, try to keep the scope as short as possible: holding on to a raw array - /// might prevent the datastore from releasing memory from garbage collected data. - #[inline] - pub fn to_arrow(&self) -> Box { - self.inner.values.clone() /* shallow */ - } - - /// Returns the contents of the cell as a reference to an arrow array. - /// - /// Avoid using raw arrow arrays unless you absolutely have to: prefer working directly with - /// `DataCell`s, `DataRow`s & `DataTable`s instead. - /// If you do use them, try to keep the scope as short as possible: holding on to a raw array - /// might prevent the datastore from releasing memory from garbage collected data. 
- #[inline] - pub fn as_arrow_ref(&self) -> &dyn arrow2::array::Array { - &*self.inner.values - } - - /// Returns the contents of the cell as an arrow array (shallow clone) wrapped in a unit-length - /// list-array. - /// - /// Useful when dealing with cells of different lengths in context that don't allow for it. - /// - /// * Before: `[C, C, C, …]` - /// * After: `ListArray[ [C, C, C, C] ]` - // - // TODO(#1696): this shouldn't be public, need to make it private once the store has been - // patched to use datacells directly. - // TODO(cmc): effectively, this returns a `DataColumn`… think about that. - #[doc(hidden)] - #[inline] - pub fn to_arrow_monolist(&self) -> Box { - use arrow2::{array::ListArray, offset::Offsets}; - - let values = self.to_arrow(); - let datatype = self.datatype().clone(); - - let datatype = ListArray::::default_datatype(datatype); - let offsets = Offsets::try_from_lengths(std::iter::once(self.num_instances() as usize)) - .unwrap_or_default() - .into(); - let validity = None; - - ListArray::::new(datatype, offsets, values, validity).boxed() - } - - /// Returns the contents of the cell as an iterator of native components. - /// - /// Fails if the underlying arrow data cannot be deserialized into `C`. - #[inline] - pub fn try_to_native<'a, C: Component + 'a>(&'a self) -> DataCellResult> { - // NOTE(#3850): Don't add a profile scope here: the profiler overhead is too big for this fast function. - // re_tracing::profile_function!(C::name().as_str()); - - Ok(C::from_arrow(self.inner.values.as_ref())?) - } - - /// Returns the contents of an expected mono-component as an `Option`. - /// - /// Fails if the underlying arrow data cannot be deserialized into `C`. - #[inline] - pub fn try_to_native_mono<'a, C: Component + 'a>(&'a self) -> DataCellResult> { - // NOTE(#3850): Don't add a profile scope here: the profiler overhead is too big for this fast function. - // re_tracing::profile_function!(C::name().as_str()); - - let mut instances = C::from_arrow_opt(self.inner.values.as_ref())?.into_iter(); - - let result = match instances.next() { - // It's ok to have no result from the iteration: this is what we - // should see for a cleared component (logged as an empty set). - None => Ok(None), - // It's not ok to have a null in a mono-component array. It should - // have been logged as an empty set, so we consider this to be missing - // data. - Some(component) => Ok(Some(component.ok_or_else(|| { - DeserializationError::MissingData { - backtrace: backtrace::Backtrace::new_unresolved(), - } - })?)), - }; - - if instances.next().is_some() { - re_log::warn_once!("Unexpected batch for {}", C::name()); - } - - result - } - - /// Returns the contents of the cell as an iterator of native components. - /// - /// Panics if the underlying arrow data cannot be deserialized into `C`. - /// See [`Self::try_to_native`] for a fallible alternative. - #[inline] - pub fn to_native<'a, C: Component + 'a>(&'a self) -> Vec { - // NOTE: see function description why it's okay here - #[allow(clippy::unwrap_used)] - self.try_to_native().unwrap() - } - - /// Returns the contents of the cell as an iterator of native optional components. - /// - /// Fails if the underlying arrow data cannot be deserialized into `C`. - #[inline] - pub fn try_to_native_opt<'a, C: Component + 'a>(&'a self) -> DataCellResult>> { - // NOTE(#3850): Don't add a profile scope here: the profiler overhead is too big for this fast function. 
- // re_tracing::profile_function!(C::name().as_str()); - - Ok(C::from_arrow_opt(self.inner.values.as_ref())?) - } - - /// Returns the contents of the cell as an iterator of native optional components. - /// - /// Panics if the underlying arrow data cannot be deserialized into `C`. - /// See [`Self::try_to_native_opt`] for a fallible alternative. - #[inline] - pub fn to_native_opt<'a, C: Component + 'a>(&'a self) -> Vec> { - // NOTE: see function description why it's okay here - #[allow(clippy::unwrap_used)] - self.try_to_native_opt().unwrap() - } -} - -impl DataCell { - /// The name of the component type stored in the cell. - #[inline] - pub fn component_name(&self) -> ComponentName { - self.inner.name - } - - /// The type of the component stored in the cell, i.e. the cell is an array of that type. - #[inline] - pub fn datatype(&self) -> &arrow2::datatypes::DataType { - self.inner.values.data_type() - } - - /// The length of the cell's array, i.e. how many component instances are in the cell? - #[inline] - pub fn num_instances(&self) -> u32 { - self.inner.values.len() as _ - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.inner.values.is_empty() - } - - /// Returns `true` if the underlying array is dense (no nulls). - #[inline] - pub fn is_dense(&self) -> bool { - if let Some(validity) = self.as_arrow_ref().validity() { - validity.unset_bits() == 0 - } else { - true - } - } - - /// Returns `true` if the underlying array is both sorted (increasing order) and contains only - /// unique values. - /// - /// The cell must be dense, otherwise the result of this method is undefined. - pub fn is_sorted_and_unique(&self) -> DataCellResult { - use arrow2::{ - array::{Array, PrimitiveArray}, - types::NativeType, - }; - - debug_assert!(self.is_dense()); - - let arr = self.as_arrow_ref(); - - fn is_sorted_and_unique_primitive(arr: &dyn Array) -> bool { - // NOTE: unwrap cannot fail, checked by caller just below - #[allow(clippy::unwrap_used)] - let values = arr.as_any().downcast_ref::>().unwrap(); - values.values().windows(2).all(|v| v[0] < v[1]) - } - - // TODO(cmc): support more datatypes as the need arise. 
- match arr.data_type() { - DataType::Int8 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::Int16 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::Int32 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::Int64 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::UInt8 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::UInt16 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::UInt32 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::UInt64 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::Float32 => Ok(is_sorted_and_unique_primitive::(arr)), - DataType::Float64 => Ok(is_sorted_and_unique_primitive::(arr)), - _ => Err(DataCellError::UnsupportedDatatype(arr.data_type().clone())), - } - } -} - -// --- - -impl<'a, C> From<&'a [C]> for DataCell -where - C: Component + Clone + 'a, - &'a C: Into<::std::borrow::Cow<'a, C>>, -{ - #[inline] - fn from(values: &'a [C]) -> Self { - Self::from_native(values.iter()) - } -} - -impl<'a, C> From<[C; 1]> for DataCell -where - C: Component + Clone + 'a, - C: Into<::std::borrow::Cow<'a, C>>, -{ - #[inline] - fn from(values: [C; 1]) -> Self { - Self::from_native(values) - } -} - -impl<'a, C> From<&'a Vec> for DataCell -where - C: Component + Clone + 'a, - &'a C: Into<::std::borrow::Cow<'a, C>>, -{ - #[inline] - fn from(c: &'a Vec) -> Self { - c.as_slice().into() - } -} - -impl<'a, C> From> for DataCell -where - C: Component + Clone + 'a, - C: Into<::std::borrow::Cow<'a, C>>, -{ - #[inline] - fn from(c: Vec) -> Self { - Self::from_native(c) - } -} - -// --- - -impl std::fmt::Display for DataCell { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_fmt(format_args!( - "DataCell({})", - re_format::format_bytes(self.inner.size_bytes as _) - ))?; - re_format_arrow::format_dataframe( - Metadata::default(), - [Field::new( - self.component_name().to_string(), - self.datatype().clone(), - false, - )], - [self.to_arrow_monolist()], - ) - .fmt(f) - } -} - -// --- - -impl DataCell { - /// Compute and cache the total size (stack + heap) of the inner cell and its underlying arrow - /// array, in bytes. - /// This does nothing if the size has already been computed and cached before. - /// - /// The caller must the sole owner of this cell, as this requires mutating an `Arc` under the - /// hood. Returns false otherwise. - /// - /// Beware: this is _very_ costly! - #[inline] - pub fn compute_size_bytes(&mut self) -> bool { - if let Some(inner) = Arc::get_mut(&mut self.inner) { - inner.compute_size_bytes(); - return true; - } - - if self.inner.size_bytes == 0 { - re_log::error_once!( - "cell size could _not_ be computed (the cell has already been shared)" - ); - return false; - } - - true - } -} - -impl SizeBytes for DataCell { - #[inline] - fn heap_size_bytes(&self) -> u64 { - if 0 < self.inner.size_bytes { - self.inner.size_bytes - } else { - // NOTE: Relying on unsized cells is always a mistake, but it isn't worth crashing - // the viewer when in release mode. - debug_assert!( - false, - "called `DataCell::heap_size_bytes() without computing it first" - ); - re_log::warn_once!("called `DataCell::heap_size_bytes() without computing it first"); - 0 - } - } -} - -impl DataCellInner { - /// Compute and cache the total size (stack + heap) of the cell and its underlying arrow array, - /// in bytes. - /// This does nothing if the size has already been computed and cached before. - /// - /// Beware: this is _very_ costly! 
- #[inline] - pub fn compute_size_bytes(&mut self) { - let Self { - name, - size_bytes, - values, - } = self; - - // NOTE: The computed size cannot ever be zero. - if *size_bytes > 0 { - return; - } - - let values: &dyn arrow2::array::Array = values.as_ref(); - *size_bytes = name.total_size_bytes() - + size_bytes.total_size_bytes() - + values.data_type().total_size_bytes() - + values.total_size_bytes(); - } -} diff --git a/crates/re_log_types/src/data_row.rs b/crates/re_log_types/src/data_row.rs deleted file mode 100644 index 38dabf17134a..000000000000 --- a/crates/re_log_types/src/data_row.rs +++ /dev/null @@ -1,614 +0,0 @@ -use ahash::HashSetExt; -use arrow2::datatypes::{Field, Metadata}; -use nohash_hasher::IntSet; -use smallvec::SmallVec; - -use re_types_core::{AsComponents, ComponentName, SizeBytes}; - -use crate::{DataCell, DataCellError, DataTable, EntityPath, TableId, TimePoint}; - -// --- - -/// An error that can occur because a row in the store has inconsistent columns. -#[derive(thiserror::Error, Debug)] -pub enum DataReadError { - #[error( - "Same component type present multiple times within a single row: \ - '{component}' in '{entity_path}'" - )] - DupedComponent { - entity_path: EntityPath, - component: ComponentName, - }, -} - -pub type DataReadResult = ::std::result::Result; - -/// A problem with a row of data in the store. -#[derive(thiserror::Error, Debug)] -pub enum DataRowError { - #[error(transparent)] - DataRead(#[from] DataReadError), - - #[error("Error with one or more the underlying data cells: {0}")] - DataCell(#[from] DataCellError), - - #[error("Could not serialize/deserialize data to/from Arrow: {0}")] - Arrow(#[from] arrow2::error::Error), - - // Needed to handle TryFrom -> T - #[error("Infallible")] - Unreachable(#[from] std::convert::Infallible), -} - -pub type DataRowResult = ::std::result::Result; - -// --- - -pub type DataCellVec = SmallVec<[DataCell; 4]>; - -/// A row's worth of [`DataCell`]s: a collection of independent [`DataCell`]s with different -/// underlying datatypes and pointing to different parts of the heap. -/// -/// Each cell in the row corresponds to a different column of the same row. -#[derive(Debug, Clone, PartialEq)] -pub struct DataCellRow(pub DataCellVec); - -impl std::ops::Deref for DataCellRow { - type Target = [DataCell]; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::ops::DerefMut for DataCellRow { - #[inline] - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl std::ops::Index for DataCellRow { - type Output = DataCell; - - #[inline] - fn index(&self, index: usize) -> &Self::Output { - &self.0[index] - } -} - -impl std::ops::IndexMut for DataCellRow { - #[inline] - fn index_mut(&mut self, index: usize) -> &mut Self::Output { - &mut self.0[index] - } -} - -impl SizeBytes for DataCellRow { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.0.heap_size_bytes() - } -} - -// --- - -/// A unique ID for a [`DataRow`]. -/// -/// ## Semantics -/// -/// [`RowId`]s play an important role in how we store, query and garbage collect data. -/// -/// ### Storage -/// -/// [`RowId`]s must be unique within a `DataStore`. This is enforced by the store's APIs. -/// -/// This makes it easy to build and maintain secondary indices around `RowId`s with few to no -/// extraneous state tracking. 
-/// -/// ### Query -/// -/// Queries (both latest-at & range semantics) will defer to `RowId` order as a tie-breaker when -/// looking at several rows worth of data that rest at the exact same timestamp. -/// -/// In pseudo-code: -/// ```text -/// rr.set_time_sequence("frame", 10) -/// -/// rr.log("my_entity", point1, row_id=#1) -/// rr.log("my_entity", point2, row_id=#0) -/// -/// rr.query("my_entity", at=("frame", 10)) # returns `point1` -/// ``` -/// -/// Think carefully about your `RowId`s when logging a lot of data at the same timestamp. -/// -/// ### Garbage collection -/// -/// Garbage collection happens in `RowId`-order, which roughly means that it happens in the -/// logger's wall-clock order. -/// -/// This has very important implications where inserting data far into the past or into the future: -/// think carefully about your `RowId`s in these cases. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -pub struct RowId(pub(crate) re_tuid::Tuid); - -impl std::fmt::Display for RowId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } -} - -impl RowId { - pub const ZERO: Self = Self(re_tuid::Tuid::ZERO); - pub const MAX: Self = Self(re_tuid::Tuid::MAX); - - /// Create a new unique [`RowId`] based on the current time. - #[allow(clippy::new_without_default)] - #[inline] - pub fn new() -> Self { - Self(re_tuid::Tuid::new()) - } - - /// Returns the next logical [`RowId`]. - /// - /// Beware: wrong usage can easily lead to conflicts. - /// Prefer [`RowId::new`] when unsure. - #[must_use] - #[inline] - pub fn next(&self) -> Self { - Self(self.0.next()) - } - - /// Returns the `n`-next logical [`RowId`]. - /// - /// This is equivalent to calling [`RowId::next`] `n` times. - /// Wraps the monotonically increasing back to zero on overflow. - /// - /// Beware: wrong usage can easily lead to conflicts. - /// Prefer [`RowId::new`] when unsure. - #[must_use] - #[inline] - pub fn incremented_by(&self, n: u64) -> Self { - Self(self.0.incremented_by(n)) - } - - /// When the `RowId` was created, in nanoseconds since unix epoch. - #[inline] - pub fn nanoseconds_since_epoch(&self) -> u64 { - self.0.nanoseconds_since_epoch() - } -} - -impl SizeBytes for RowId { - #[inline] - fn heap_size_bytes(&self) -> u64 { - 0 - } - - #[inline] - fn is_pod() -> bool { - true - } -} - -impl std::ops::Deref for RowId { - type Target = re_tuid::Tuid; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::ops::DerefMut for RowId { - #[inline] - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -re_types_core::delegate_arrow_tuid!(RowId as "rerun.controls.RowId"); - -/// A row's worth of data, i.e. an event: a list of [`DataCell`]s associated with an auto-generated -/// `RowId`, a user-specified [`TimePoint`] and [`EntityPath`], and an expected number of -/// instances. -/// This is the middle layer in our data model. -/// -/// Behind the scenes, a `DataRow` is backed by a collection of independent [`DataCell`]s which -/// likely refer to unrelated/non-contiguous parts of the heap. -/// Cloning a `DataRow` is not too costly but needs to be avoided on the happy path. -/// -/// ## Field visibility -/// -/// To facilitate destructuring (`let DataRow { .. } = row`), all the fields in `DataRow` are -/// public. -/// -/// Modifying any of these fields from outside this crate is considered undefined behavior. 
-/// Use the appropriate getters and setters instead. -/// -/// ## Layout -/// -/// A row is a collection of cells where each cell can have an arbitrary number of -/// instances: `[[C1, C1, C1], [], [C3], [C4, C4, C4], …]`. -/// -/// Consider this example: -/// ```ignore -/// let points: &[MyPoint] = &[[10.0, 10.0].into(), [20.0, 20.0].into()]; -/// let colors: &[_] = &[MyColor::from_rgb(128, 128, 128)]; -/// let labels: &[MyLabel] = &[]; -/// let row = DataRow::from_cells3(row_id, timepoint, ent_path, (points, colors, labels)); -/// ``` -/// -/// A row has no arrow representation nor datatype of its own, as it is merely a collection of -/// independent cells. -/// -/// Visualized in the context of a larger table, it is simply a row of cells: -/// ```text -/// ┌──────────────────────────────────┬─────────────────┬───────┐ -/// │ Point2D ┆ Color ┆ Text │ -/// ╞══════════════════════════════════╪═════════════════╪═══════╡ -/// │ [{x: 10, y: 10}, {x: 20, y: 20}] ┆ [2155905279] ┆ [] │ -/// └──────────────────────────────────┴─────────────────┴───────┘ -/// ``` -/// -/// ## Example -/// -/// ```rust -/// # use re_log_types::{ -/// # example_components::{MyColor, MyLabel, MyPoint}, -/// # DataRow, RowId, Timeline, -/// # }; -/// # -/// # let row_id = RowId::ZERO; -/// # let timepoint = [ -/// # (Timeline::new_sequence("frame_nr"), 42), // -/// # (Timeline::new_sequence("clock"), 666), // -/// # ]; -/// # -/// let points: &[MyPoint] = &[MyPoint { x: 10.0, y: 10.0}, MyPoint { x: 20.0, y: 20.0 }]; -/// let colors: &[_] = &[MyColor(0xff7f7f7f)]; -/// let labels: &[MyLabel] = &[]; -/// -/// let row = DataRow::from_cells3( -/// row_id, -/// "a/b/c", -/// timepoint, -/// (points, colors, labels), -/// ).unwrap(); -/// eprintln!("{row}"); -/// ``` -#[derive(Debug, Clone)] -pub struct DataRow { - /// Auto-generated `TUID`, uniquely identifying this event and keeping track of the client's - /// wall-clock. - pub row_id: RowId, - - /// User-specified [`TimePoint`] for this event. - pub timepoint: TimePoint, - - /// User-specified [`EntityPath`] for this event. - pub entity_path: EntityPath, - - /// The actual cells (== columns, == components). - pub cells: DataCellRow, -} - -impl DataRow { - /// Builds a new `DataRow` from anything implementing [`AsComponents`]. - pub fn from_archetype( - row_id: RowId, - timepoint: TimePoint, - entity_path: EntityPath, - as_components: &dyn AsComponents, - ) -> anyhow::Result { - re_tracing::profile_function!(); - - let batches = as_components.as_component_batches(); - Self::from_component_batches( - row_id, - timepoint, - entity_path, - batches.iter().map(|batch| batch.as_ref()), - ) - } - - /// Builds a new `DataRow` from anything implementing [`AsComponents`]. - pub fn from_component_batches<'a>( - row_id: RowId, - timepoint: TimePoint, - entity_path: EntityPath, - comp_batches: impl IntoIterator, - ) -> anyhow::Result { - re_tracing::profile_function!(); - - let data_cells = comp_batches - .into_iter() - .map(DataCell::from_component_batch) - .collect::, _>>()?; - - let mut row = Self::from_cells(row_id, timepoint, entity_path, data_cells)?; - row.compute_all_size_bytes(); - Ok(row) - } - - /// Builds a new `DataRow` from an iterable of [`DataCell`]s. - /// - /// Fails if two or more cells share the same component type. 
- pub fn from_cells( - row_id: RowId, - timepoint: impl Into, - entity_path: impl Into, - cells: impl IntoIterator, - ) -> DataReadResult { - let cells = DataCellRow(cells.into_iter().collect()); - - let entity_path = entity_path.into(); - let timepoint = timepoint.into(); - - let mut components = IntSet::with_capacity(cells.len()); - for cell in &*cells { - let component = cell.component_name(); - - if !components.insert(component) { - return Err(DataReadError::DupedComponent { - entity_path, - component, - }); - } - } - - Ok(Self { - row_id, - entity_path, - timepoint, - cells, - }) - } - - /// Consumes the [`DataRow`] and returns a new one with an incremented [`RowId`]. - #[inline] - pub fn next(self) -> Self { - Self { - row_id: self.row_id.next(), - ..self - } - } - - /// Turns the `DataRow` into a single-row [`DataTable`]. - #[inline] - pub fn into_table(self) -> DataTable { - DataTable::from_rows(TableId::new(), [self]) - } -} - -impl SizeBytes for DataRow { - fn heap_size_bytes(&self) -> u64 { - let Self { - row_id, - timepoint, - entity_path, - cells, - } = self; - - row_id.heap_size_bytes() - + timepoint.heap_size_bytes() - + entity_path.heap_size_bytes() - + cells.heap_size_bytes() - } -} - -impl DataRow { - #[inline] - pub fn row_id(&self) -> RowId { - self.row_id - } - - #[inline] - pub fn timepoint(&self) -> &TimePoint { - &self.timepoint - } - - #[inline] - pub fn entity_path(&self) -> &EntityPath { - &self.entity_path - } - - #[inline] - pub fn num_cells(&self) -> usize { - self.cells.len() - } - - #[inline] - pub fn component_names(&self) -> impl ExactSizeIterator + '_ { - self.cells.iter().map(|cell| cell.component_name()) - } - - #[inline] - pub fn cells(&self) -> &DataCellRow { - &self.cells - } - - #[inline] - pub fn into_cells(self) -> DataCellRow { - self.cells - } - - /// Returns the index of the cell with the given component type in the row, if it exists. - /// - /// This is `O(n)`. - #[inline] - pub fn find_cell(&self, component: &ComponentName) -> Option { - self.cells - .iter() - .map(|cell| cell.component_name()) - .position(|name| name == *component) - } - - /// Compute and cache the total (heap) allocated size of each individual underlying - /// [`DataCell`]. - /// This does nothing for cells whose size has already been computed and cached before. - /// - /// Beware: this is _very_ costly! - #[inline] - pub fn compute_all_size_bytes(&mut self) { - for cell in &mut self.cells.0 { - cell.compute_size_bytes(); - } - } -} - -// --- - -impl DataRow { - /// A helper that combines [`Self::from_cells1`] followed by [`Self::compute_all_size_bytes`]. - /// - /// See respective documentations for more information. - /// - /// Beware: this is costly! - pub fn from_cells1_sized( - row_id: RowId, - entity_path: impl Into, - timepoint: impl Into, - into_cells: C0, - ) -> DataReadResult - where - C0: Into, - { - let mut this = Self::from_cells( - row_id, - timepoint.into(), - entity_path.into(), - [into_cells.into()], - )?; - this.compute_all_size_bytes(); - Ok(this) - } - - pub fn from_cells1( - row_id: RowId, - entity_path: impl Into, - timepoint: impl Into, - into_cells: C0, - ) -> DataRowResult - where - C0: TryInto, - DataRowError: From<>::Error>, - { - Ok(Self::from_cells( - row_id, - timepoint.into(), - entity_path.into(), - [into_cells.try_into()?], - )?) - } - - /// A helper that combines [`Self::from_cells2`] followed by [`Self::compute_all_size_bytes`]. - /// - /// See respective documentations for more information. - /// - /// Beware: this is costly! 
- pub fn from_cells2_sized( - row_id: RowId, - entity_path: impl Into, - timepoint: impl Into, - into_cells: (C0, C1), - ) -> DataRowResult - where - C0: Into, - C1: Into, - { - let mut this = Self::from_cells( - row_id, - timepoint.into(), - entity_path.into(), - [ - into_cells.0.into(), // - into_cells.1.into(), // - ], - )?; - this.compute_all_size_bytes(); - Ok(this) - } - - pub fn from_cells2( - row_id: RowId, - entity_path: impl Into, - timepoint: impl Into, - into_cells: (C0, C1), - ) -> DataRowResult - where - C0: TryInto, - C1: TryInto, - DataRowError: From<>::Error>, - DataRowError: From<>::Error>, - { - Ok(Self::from_cells( - row_id, - timepoint.into(), - entity_path.into(), - [ - into_cells.0.try_into()?, // - into_cells.1.try_into()?, // - ], - )?) - } - - pub fn from_cells3( - row_id: RowId, - entity_path: impl Into, - timepoint: impl Into, - into_cells: (C0, C1, C2), - ) -> DataRowResult - where - C0: TryInto, - C1: TryInto, - C2: TryInto, - DataRowError: From<>::Error>, - DataRowError: From<>::Error>, - DataRowError: From<>::Error>, - { - Ok(Self::from_cells( - row_id, - timepoint.into(), - entity_path.into(), - [ - into_cells.0.try_into()?, // - into_cells.1.try_into()?, // - into_cells.2.try_into()?, // - ], - )?) - } -} - -// --- - -impl std::fmt::Display for DataRow { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Row #{} @ '{}'", self.row_id, self.entity_path)?; - for (timeline, time) in &self.timepoint { - writeln!( - f, - "- {}: {}", - timeline.name(), - timeline.typ().format_utc(*time) - )?; - } - - re_format_arrow::format_dataframe( - Metadata::default(), - self.cells.iter().map(|cell| { - Field::new( - cell.component_name().to_string(), - cell.datatype().clone(), - false, - ) - }), - self.cells.iter().map(|cell| cell.to_arrow_monolist()), - ) - .fmt(f) - } -} diff --git a/crates/re_log_types/src/data_table.rs b/crates/re_log_types/src/data_table.rs deleted file mode 100644 index 87a40d7adcf4..000000000000 --- a/crates/re_log_types/src/data_table.rs +++ /dev/null @@ -1,1284 +0,0 @@ -use std::{ - collections::{BTreeMap, VecDeque}, - sync::Arc, -}; - -use ahash::HashMap; -use itertools::{izip, Itertools as _}; -use nohash_hasher::IntSet; - -use re_types_core::{ComponentName, Loggable, SizeBytes}; - -use crate::{ - data_row::DataReadResult, ArrowMsg, DataCell, DataCellError, DataRow, DataRowError, EntityPath, - RowId, TimePoint, Timeline, -}; - -// --- - -#[derive(thiserror::Error, Debug)] -pub enum DataTableError { - #[error("The schema has a column {0:?} that is missing in the data")] - MissingColumn(String), - - #[error( - "Trying to deserialize time column data with invalid datatype: {name:?} ({datatype:#?})" - )] - NotATimeColumn { name: String, datatype: DataType }, - - #[error("Trying to deserialize column data that doesn't contain any ListArrays: {0:?}")] - NotAColumn(String), - - #[error("Error with one or more the underlying data rows: {0}")] - DataRow(#[from] DataRowError), - - #[error("Error with one or more the underlying data cells: {0}")] - DataCell(#[from] DataCellError), - - #[error("Could not serialize/deserialize component instances to/from Arrow: {0}")] - Arrow(#[from] arrow2::error::Error), - - #[error("Could not serialize component instances to/from Arrow: {0}")] - Serialization(#[from] re_types_core::SerializationError), - - #[error("Could not deserialize component instances to/from Arrow: {0}")] - Deserialization(#[from] re_types_core::DeserializationError), - - // Needed to handle TryFrom -> T - 
#[error("Infallible")] - Unreachable(#[from] std::convert::Infallible), -} - -pub type DataTableResult = ::std::result::Result; - -// --- - -pub type RowIdVec = VecDeque; - -pub type TimeOptVec = VecDeque>; - -pub type TimePointVec = VecDeque; - -pub type ErasedTimeVec = VecDeque; - -pub type EntityPathVec = VecDeque; - -pub type DataCellOptVec = VecDeque>; - -/// A column's worth of [`DataCell`]s: a sparse collection of [`DataCell`]s that share the same -/// underlying type and likely point to shared, contiguous memory. -/// -/// Each cell in the column corresponds to a different row of the same column. -#[derive(Default, Debug, Clone, PartialEq)] -pub struct DataCellColumn(pub DataCellOptVec); - -impl std::ops::Deref for DataCellColumn { - type Target = VecDeque>; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::ops::DerefMut for DataCellColumn { - #[inline] - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl std::ops::Index for DataCellColumn { - type Output = Option; - - #[inline] - fn index(&self, index: usize) -> &Self::Output { - &self.0[index] - } -} - -impl std::ops::IndexMut for DataCellColumn { - #[inline] - fn index_mut(&mut self, index: usize) -> &mut Self::Output { - &mut self.0[index] - } -} - -impl DataCellColumn { - #[inline] - pub fn empty(num_rows: usize) -> Self { - Self(vec![None; num_rows].into()) - } - - /// Compute and cache the size of each individual underlying [`DataCell`]. - /// This does nothing for cells whose size has already been computed and cached before. - /// - /// Beware: this is _very_ costly! - #[inline] - pub fn compute_all_size_bytes(&mut self) { - re_tracing::profile_function!(); - for cell in &mut self.0 { - cell.as_mut().map(|cell| cell.compute_size_bytes()); - } - } -} - -impl SizeBytes for DataCellColumn { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.0.heap_size_bytes() - } -} - -// --- - -/// A unique ID for a [`DataTable`]. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -pub struct TableId(pub(crate) re_tuid::Tuid); - -impl std::fmt::Display for TableId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } -} - -impl TableId { - pub const ZERO: Self = Self(re_tuid::Tuid::ZERO); - - /// Create a new unique [`TableId`] based on the current time. - #[allow(clippy::new_without_default)] - #[inline] - pub fn new() -> Self { - Self(re_tuid::Tuid::new()) - } - - /// Returns the next logical [`TableId`]. - /// - /// Beware: wrong usage can easily lead to conflicts. - /// Prefer [`TableId::new`] when unsure. - #[must_use] - #[inline] - pub fn next(&self) -> Self { - Self(self.0.next()) - } - - /// Returns the `n`-next logical [`TableId`]. - /// - /// This is equivalent to calling [`TableId::next`] `n` times. - /// Wraps the monotonically increasing back to zero on overflow. - /// - /// Beware: wrong usage can easily lead to conflicts. - /// Prefer [`TableId::new`] when unsure. 
- #[must_use] - #[inline] - pub fn incremented_by(&self, n: u64) -> Self { - Self(self.0.incremented_by(n)) - } -} - -impl SizeBytes for TableId { - #[inline] - fn heap_size_bytes(&self) -> u64 { - 0 - } - - #[inline] - fn is_pod() -> bool { - true - } -} - -impl std::ops::Deref for TableId { - type Target = re_tuid::Tuid; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::ops::DerefMut for TableId { - #[inline] - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -re_types_core::delegate_arrow_tuid!(TableId as "rerun.controls.TableId"); - -/// A sparse table's worth of data, i.e. a batch of events: a collection of [`DataRow`]s. -/// This is the top-level layer in our data model. -/// -/// Behind the scenes, a `DataTable` is organized in columns, where columns are represented by -/// sparse lists of [`DataCell`]s. -/// Cells within a single list are likely to reference shared, contiguous heap memory. -/// -/// Cloning a `DataTable` can be _very_ costly depending on the contents. -/// -/// ## Field visibility -/// -/// To facilitate destructuring (`let DataTable { .. } = row`), all the fields in `DataTable` are -/// public. -/// -/// Modifying any of these fields from outside this crate is considered undefined behavior. -/// Use the appropriate getters and setters instead. -/// -/// ## Layout -/// -/// A table is a collection of sparse rows, which are themselves collections of cells, where each -/// cell can contain an arbitrary number of instances: -/// ```text -/// [ -/// [[C1, C1, C1], [], [C3], [C4, C4, C4], …], -/// [None, [C2, C2], [], [C4], …], -/// [None, [C2, C2], [], None, …], -/// … -/// ] -/// ``` -/// -/// Consider this example: -/// ```ignore -/// let row0 = { -/// let points: &[MyPoint] = &[[10.0, 10.0].into(), [20.0, 20.0].into()]; -/// let colors: &[_] = &[MyColor::from_rgb(128, 128, 128)]; -/// let labels: &[Label] = &[]; -/// DataRow::from_cells3(RowId::new(), "a", timepoint(1, 1), (points, colors, labels))? -/// }; -/// let row1 = { -/// let colors: &[MyColor] = &[]; -/// DataRow::from_cells1(RowId::new(), "b", timepoint(1, 2), colors)? -/// }; -/// let row2 = { -/// let colors: &[_] = &[MyColor::from_rgb(255, 255, 255)]; -/// let labels: &[_] = &[Label("hey".into())]; -/// DataRow::from_cells2(RowId::new(), "c", timepoint(2, 1), (colors, labels))? -/// }; -/// let table = DataTable::from_rows(table_id, [row0, row1, row2]); -/// ``` -/// -/// A table has no arrow representation nor datatype of its own, as it is merely a collection of -/// independent rows. 
-/// -/// The table above translates to the following, where each column is contiguous in memory: -/// ```text -/// ┌──────────┬───────────────────────────────┬──────────────────────────────────┬───────────────────┬─────────────┬──────────────────────────────────┬─────────────────┐ -/// │ frame_nr ┆ log_time ┆ rerun.row_id ┆ rerun.entity_path ┆ ┆ rerun.components.Point2D ┆ rerun.components.Color │ -/// ╞══════════╪═══════════════════════════════╪══════════════════════════════════╪═══════════════════╪═════════════╪══════════════════════════════════╪═════════════════╡ -/// │ 1 ┆ 2023-04-05 09:36:47.188796402 ┆ 1753004ACBF5D6E651F2983C3DAF260C ┆ a ┆ [] ┆ [{x: 10, y: 10}, {x: 20, y: 20}] ┆ [2155905279] │ -/// ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ -/// │ 1 ┆ 2023-04-05 09:36:47.188852222 ┆ 1753004ACBF5D6E651F2983C3DAF260C ┆ b ┆ - ┆ - ┆ [] │ -/// ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ -/// │ 2 ┆ 2023-04-05 09:36:47.188855872 ┆ 1753004ACBF5D6E651F2983C3DAF260C ┆ c ┆ [hey] ┆ - ┆ [4294967295] │ -/// └──────────┴───────────────────────────────┴──────────────────────────────────┴───────────────────┴─────────────┴──────────────────────────────────┴─────────────────┘ -/// ``` -#[derive(Debug, Clone, PartialEq)] -pub struct DataTable { - /// Auto-generated `TUID`, uniquely identifying this batch of data and keeping track of the - /// client's wall-clock. - pub table_id: TableId, - - /// The entire column of `RowId`s. - /// - /// Keeps track of the unique identifier for each row that was generated by the clients. - pub col_row_id: RowIdVec, - - /// All the rows for all the time columns. - /// - /// The times are optional since not all rows are guaranteed to have a timestamp for every - /// single timeline (though it is highly likely to be the case in practice). - pub col_timelines: BTreeMap, - - /// The entire column of [`EntityPath`]s. - /// - /// The entity each row relates to, respectively. - pub col_entity_path: EntityPathVec, - - /// All the rows for all the component columns. - /// - /// The cells are optional since not all rows will have data for every single component - /// (i.e. the table is sparse). - pub columns: BTreeMap, -} - -impl DataTable { - /// Creates a new empty table with the given ID. - pub fn new(table_id: TableId) -> Self { - Self { - table_id, - col_row_id: Default::default(), - col_timelines: Default::default(), - col_entity_path: Default::default(), - columns: Default::default(), - } - } - - /// Builds a new `DataTable` from an iterable of [`DataRow`]s. - pub fn from_rows(table_id: TableId, rows: impl IntoIterator) -> Self { - re_tracing::profile_function!(); - - let rows = rows.into_iter(); - - // Explode all rows into columns, and keep track of which components are involved. - let mut components = IntSet::default(); - #[allow(clippy::type_complexity)] - let (col_row_id, col_timepoint, col_entity_path, column): ( - RowIdVec, - TimePointVec, - EntityPathVec, - Vec<_>, - ) = rows - .map(|row| { - components.extend(row.component_names()); - let DataRow { - row_id, - timepoint, - entity_path, - cells, - } = row; - (row_id, timepoint, entity_path, cells) - }) - .multiunzip(); - - // All time columns. 
- let mut col_timelines: BTreeMap = BTreeMap::default(); - for (i, timepoint) in col_timepoint.iter().enumerate() { - for (timeline, time) in timepoint.iter() { - match col_timelines.entry(*timeline) { - std::collections::btree_map::Entry::Vacant(entry) => { - entry - .insert(vec![None; i].into()) - .push_back(Some(time.as_i64())); - } - std::collections::btree_map::Entry::Occupied(mut entry) => { - let entry = entry.get_mut(); - entry.push_back(Some(time.as_i64())); - } - } - } - - // handle potential sparseness - for (timeline, col_time) in &mut col_timelines { - if timepoint.get(timeline).is_none() { - col_time.push_back(None); - } - } - } - - // Pre-allocate all columns (one per component). - let mut columns = BTreeMap::default(); - for component in components { - columns.insert(component, DataCellColumn(vec![None; column.len()].into())); - } - - // Fill all columns (where possible: data is likely sparse). - for (i, cells) in column.into_iter().enumerate() { - for cell in cells.0 { - let component = cell.component_name(); - // NOTE: unwrap cannot fail, all arrays pre-allocated above. - #[allow(clippy::unwrap_used)] - let column = columns.get_mut(&component).unwrap(); - column[i] = Some(cell); - } - } - - Self { - table_id, - col_row_id, - col_timelines, - col_entity_path, - columns, - } - } -} - -impl DataTable { - #[inline] - pub fn num_rows(&self) -> u32 { - self.col_row_id.len() as _ - } - - /// Fails if any row has two or more cells share the same component type. - #[inline] - pub fn to_rows(&self) -> impl ExactSizeIterator> + '_ { - let num_rows = self.num_rows() as usize; - - let Self { - table_id: _, - col_row_id, - col_timelines, - col_entity_path, - columns, - } = self; - - (0..num_rows).map(move |i| { - let cells = columns - .values() - .filter_map(|rows| rows[i].clone() /* shallow */); - - DataRow::from_cells( - col_row_id[i], - TimePoint::from( - col_timelines - .iter() - .filter_map(|(timeline, times)| { - times[i].map(|time| (*timeline, crate::TimeInt::new_temporal(time))) - }) - .collect::>(), - ), - col_entity_path[i].clone(), - cells, - ) - }) - } - - /// Computes the maximum value for each and every timeline present across this entire table, - /// and returns the corresponding [`TimePoint`]. - #[inline] - pub fn timepoint_max(&self) -> TimePoint { - let mut timepoint = TimePoint::default(); - for (timeline, col_time) in &self.col_timelines { - let time = col_time - .iter() - .flatten() - .max() - .copied() - .map(crate::TimeInt::new_temporal); - - if let Some(time) = time { - timepoint.insert(*timeline, time); - } - } - timepoint - } - - /// Compute and cache the total (heap) allocated size of each individual underlying - /// [`DataCell`]. - /// This does nothing for cells whose size has already been computed and cached before. - /// - /// Beware: this is _very_ costly! 
- #[inline] - pub fn compute_all_size_bytes(&mut self) { - re_tracing::profile_function!(); - for column in self.columns.values_mut() { - column.compute_all_size_bytes(); - } - } -} - -impl SizeBytes for DataTable { - #[inline] - fn heap_size_bytes(&self) -> u64 { - let Self { - table_id, - col_row_id, - col_timelines, - col_entity_path, - columns, - } = self; - - table_id.heap_size_bytes() - + col_row_id.heap_size_bytes() - + col_timelines.heap_size_bytes() - + col_entity_path.heap_size_bytes() - + columns.heap_size_bytes() - } -} - -// --- Serialization --- - -use arrow2::{ - array::{Array, ListArray, PrimitiveArray}, - bitmap::Bitmap, - chunk::Chunk, - datatypes::{DataType, Field, Schema, TimeUnit}, - offset::Offsets, - types::NativeType, -}; - -pub const METADATA_KIND: &str = "rerun.kind"; -pub const METADATA_KIND_DATA: &str = "data"; -pub const METADATA_KIND_CONTROL: &str = "control"; -pub const METADATA_KIND_TIME: &str = "time"; - -impl DataTable { - /// Serializes the entire table into an arrow payload and schema. - /// - /// A serialized `DataTable` contains two kinds of columns: control & data. - /// - /// * Control columns are those that drive the behavior of the storage systems. - /// They are always present, always dense, and always deserialized upon reception by the - /// server. - /// Internally, time columns are (de)serialized separately from the rest of the control - /// columns for efficiency/QOL concerns: that doesn't change the fact that they are control - /// columns all the same! - /// * Data columns are the ones that hold component data. - /// They are optional, potentially sparse, and never deserialized on the server-side (not by - /// the storage systems, at least). - pub fn serialize(&self) -> DataTableResult<(Schema, Chunk>)> { - re_tracing::profile_function!(); - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - // Temporary compatibility layer with Chunks. - if let Some(entity_path) = self.col_entity_path.front() { - /// The key used to identify a Rerun [`EntityPath`] in chunk-level [`ArrowSchema`] metadata. - // - // NOTE: Temporarily copied from `re_chunk` while we're transitioning away to the new data - // model. - const CHUNK_METADATA_KEY_ENTITY_PATH: &str = "rerun.entity_path"; - - schema.metadata.insert( - CHUNK_METADATA_KEY_ENTITY_PATH.to_owned(), - entity_path.to_string(), - ); - } - - { - let (control_schema, control_columns) = self.serialize_time_columns(); - schema.fields.extend(control_schema.fields); - schema.metadata.extend(control_schema.metadata); - columns.extend(control_columns); - } - - { - let (control_schema, control_columns) = self.serialize_control_columns()?; - schema.fields.extend(control_schema.fields); - schema.metadata.extend(control_schema.metadata); - columns.extend(control_columns); - } - - { - let (data_schema, data_columns) = self.serialize_data_columns()?; - schema.fields.extend(data_schema.fields); - schema.metadata.extend(data_schema.metadata); - columns.extend(data_columns); - } - - Ok((schema, Chunk::new(columns))) - } - - /// Serializes all time columns into an arrow payload and schema. 
- fn serialize_time_columns(&self) -> (Schema, Vec>) { - re_tracing::profile_function!(); - - fn serialize_time_column( - timeline: Timeline, - times: &TimeOptVec, - ) -> (Field, Box) { - let data = DataTable::serialize_primitive_deque_opt(times).to(timeline.datatype()); - - let field = Field::new(timeline.name().as_str(), data.data_type().clone(), false) - .with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_TIME.to_owned())].into()); - - (field, data.boxed()) - } - - let Self { - table_id: _, - col_row_id: _, - col_timelines, - col_entity_path: _, - columns: _, - } = self; - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - for (timeline, col_time) in col_timelines { - let (time_field, time_column) = serialize_time_column(*timeline, col_time); - schema.fields.push(time_field); - columns.push(time_column); - } - - (schema, columns) - } - - /// Serializes all controls columns into an arrow payload and schema. - /// - /// Control columns are those that drive the behavior of the storage systems. - /// They are always present, always dense, and always deserialized upon reception by the - /// server. - fn serialize_control_columns(&self) -> DataTableResult<(Schema, Vec>)> { - re_tracing::profile_function!(); - - let Self { - table_id, - col_row_id, - col_timelines: _, - col_entity_path, - columns: _, - } = self; - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - let (row_id_field, row_id_column) = Self::serialize_control_column(col_row_id)?; - schema.fields.push(row_id_field); - columns.push(row_id_column); - - let (entity_path_field, entity_path_column) = - Self::serialize_control_column(col_entity_path)?; - schema.fields.push(entity_path_field); - columns.push(entity_path_column); - - schema.metadata = [(TableId::name().to_string(), table_id.to_string())].into(); - - Ok((schema, columns)) - } - - /// Serializes a single control column: an iterable of dense arrow-like data. - pub fn serialize_control_column<'a, C: re_types_core::Component + 'a>( - values: &'a VecDeque, - ) -> DataTableResult<(Field, Box)> - where - std::borrow::Cow<'a, C>: std::convert::From<&'a C>, - { - re_tracing::profile_function!(); - - let data: Box = C::to_arrow(values)?; - - // TODO(#3360): rethink our extension and metadata usage - let mut field = C::arrow_field() - .with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_CONTROL.to_owned())].into()); - - // TODO(#3360): rethink our extension and metadata usage - if let DataType::Extension(name, _, _) = data.data_type() { - field - .metadata - .extend([("ARROW:extension:name".to_owned(), name.clone())]); - } - - Ok((field, data)) - } - - /// Serializes a single control column; optimized path for primitive datatypes. - pub fn serialize_primitive_column( - name: &str, - values: &VecDeque, - datatype: Option, - ) -> (Field, Box) { - re_tracing::profile_function!(); - - let data = Self::serialize_primitive_deque(values); - - let datatype = datatype.unwrap_or(data.data_type().clone()); - let data = data.to(datatype.clone()).boxed(); - - let mut field = Field::new(name, datatype.clone(), false) - .with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_CONTROL.to_owned())].into()); - - if let DataType::Extension(name, _, _) = datatype { - field - .metadata - .extend([("ARROW:extension:name".to_owned(), name)]); - } - - (field, data) - } - - /// Serializes all data columns into an arrow payload and schema. 
- /// - /// They are optional, potentially sparse, and never deserialized on the server-side (not by - /// the storage systems, at least). - fn serialize_data_columns(&self) -> DataTableResult<(Schema, Vec>)> { - re_tracing::profile_function!(); - - let Self { - table_id: _, - col_row_id: _, - col_timelines: _, - col_entity_path: _, - columns: table, - } = self; - - let mut schema = Schema::default(); - let mut columns = Vec::new(); - - for (component, rows) in table { - // If none of the rows have any data, there's nothing to do here - // TODO(jleibs): would be nice to make serialize_data_column robust to this case - // but I'm not sure if returning an empty column is the right thing to do there. - // See: https://github.com/rerun-io/rerun/issues/2005 - if rows.iter().any(|c| c.is_some()) { - let (field, column) = Self::serialize_data_column(component, rows)?; - schema.fields.push(field); - columns.push(column); - } - } - - Ok((schema, columns)) - } - - /// Serializes a single data column. - pub fn serialize_data_column( - name: &str, - column: &VecDeque>, - ) -> DataTableResult<(Field, Box)> { - re_tracing::profile_function!(); - - /// Create a list-array out of a flattened array of cell values. - /// - /// * Before: `[C, C, C, C, C, C, C, …]` - /// * After: `ListArray[ [[C, C], [C, C, C], None, [C], [C], …] ]` - fn data_to_lists( - column: &VecDeque>, - data: Box, - ext_name: Option, - ) -> Box { - let datatype = data.data_type().clone(); - - let field = { - let mut field = Field::new("item", datatype, true); - - if let Some(name) = ext_name { - field - .metadata - .extend([("ARROW:extension:name".to_owned(), name)]); - } - - field - }; - - let datatype = DataType::List(Arc::new(field)); - let offsets = Offsets::try_from_lengths(column.iter().map(|cell| { - cell.as_ref() - .map_or(0, |cell| cell.num_instances() as usize) - })); - // NOTE: cannot fail, `data` has as many instances as `column` - #[allow(clippy::unwrap_used)] - let offsets = offsets.unwrap().into(); - - #[allow(clippy::from_iter_instead_of_collect)] - let validity = Bitmap::from_iter(column.iter().map(|cell| cell.is_some())); - - ListArray::::new(datatype, offsets, data, validity.into()).boxed() - } - - // TODO(cmc): All we're doing here is allocating and filling a nice contiguous array so - // our `ListArray`s can compute their indices and for the serializer to work with… - // In a far enough future, we could imagine having much finer grain control over the - // serializer and doing all of this at once, bypassing all the mem copies and - // allocations. - - let cell_refs = column - .iter() - .flatten() - .map(|cell| cell.as_arrow_ref()) - .collect_vec(); - - let ext_name = cell_refs.first().and_then(|cell| match cell.data_type() { - DataType::Extension(name, _, _) => Some(name), - _ => None, - }); - - // NOTE: Avoid paying for the cost of the concatenation machinery if there's a single - // row in the column. - let data = if cell_refs.len() == 1 { - data_to_lists(column, cell_refs[0].to_boxed(), ext_name.cloned()) - } else { - // NOTE: This is a column of cells, it shouldn't ever fail to concatenate since - // they share the same underlying type. 
- let data = - arrow2::compute::concatenate::concatenate(cell_refs.as_slice()).map_err(|err| { - re_log::warn_once!("failed to concatenate cells for column {name}"); - err - })?; - data_to_lists(column, data, ext_name.cloned()) - }; - - let field = Field::new(name, data.data_type().clone(), false) - .with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_DATA.to_owned())].into()); - - Ok((field, data)) - } - - pub fn serialize_primitive_deque_opt( - data: &VecDeque>, - ) -> PrimitiveArray { - let datatype = T::PRIMITIVE.into(); - let values = data - .iter() - .copied() - .map(Option::unwrap_or_default) - .collect(); - let validity = data - .iter() - .any(Option::is_none) - .then(|| data.iter().map(Option::is_some).collect()); - PrimitiveArray::new(datatype, values, validity) - } - - pub fn serialize_primitive_deque(data: &VecDeque) -> PrimitiveArray { - let datatype = T::PRIMITIVE.into(); - let values = data.iter().copied().collect(); - PrimitiveArray::new(datatype, values, None) - } -} - -impl DataTable { - /// Deserializes an entire table from an arrow payload and schema. - pub fn deserialize( - table_id: TableId, - schema: &Schema, - chunk: &Chunk>, - ) -> DataTableResult { - re_tracing::profile_function!(); - - /// The key used to identify a Rerun [`EntityPath`] in chunk-level [`ArrowSchema`] metadata. - // - // NOTE: Temporarily copied from `re_chunk` while we're transitioning away to the new data - // model. - const CHUNK_METADATA_KEY_ENTITY_PATH: &str = "rerun.entity_path"; - - let entity_path = schema - .metadata - .get(CHUNK_METADATA_KEY_ENTITY_PATH) - .ok_or_else(|| DataTableError::MissingColumn("metadata:entity_path".to_owned()))?; - let entity_path = EntityPath::parse_forgiving(entity_path); - - // --- Time --- - - let col_timelines: DataTableResult<_> = schema - .fields - .iter() - .enumerate() - .filter_map(|(i, field)| { - field.metadata.get(METADATA_KIND).and_then(|kind| { - (kind == METADATA_KIND_TIME).then_some((field.name.as_str(), i)) - }) - }) - .map(|(name, index)| { - chunk - .get(index) - .ok_or(DataTableError::MissingColumn(name.to_owned())) - .and_then(|column| Self::deserialize_time_column(name, &**column)) - }) - .collect(); - let col_timelines = col_timelines?; - - // --- Control --- - - let control_indices: HashMap<&str, usize> = schema - .fields - .iter() - .enumerate() - .filter_map(|(i, field)| { - field.metadata.get(METADATA_KIND).and_then(|kind| { - (kind == METADATA_KIND_CONTROL).then_some((field.name.as_str(), i)) - }) - }) - .collect(); - let control_index = move |name: &str| { - control_indices - .get(name) - .copied() - .ok_or(DataTableError::MissingColumn(name.into())) - }; - - // NOTE: the unwrappings cannot fail since control_index() makes sure the index is valid - #[allow(clippy::unwrap_used)] - let col_row_id = RowId::from_arrow( - chunk - .get(control_index(RowId::name().as_str())?) 
- .unwrap() - .as_ref(), - )?; - let col_entity_path = std::iter::repeat_with(|| entity_path.clone()) - .take(col_row_id.len()) - .collect_vec(); - - // --- Components --- - - let columns: DataTableResult<_> = schema - .fields - .iter() - .enumerate() - .filter_map(|(i, field)| { - field.metadata.get(METADATA_KIND).and_then(|kind| { - (kind == METADATA_KIND_DATA).then_some((field.name.as_str(), i)) - }) - }) - .map(|(name, index)| { - let component: ComponentName = name.to_owned().into(); - chunk - .get(index) - .ok_or(DataTableError::MissingColumn(name.to_owned())) - .and_then(|column| { - Self::deserialize_data_column(component, &**column) - .map(|data| (component, data)) - }) - }) - .collect(); - let columns = columns?; - - Ok(Self { - table_id, - col_row_id: col_row_id.into(), - col_timelines, - col_entity_path: col_entity_path.into(), - columns, - }) - } - - /// Deserializes a sparse time column. - fn deserialize_time_column( - name: &str, - column: &dyn Array, - ) -> DataTableResult<(Timeline, TimeOptVec)> { - re_tracing::profile_function!(); - - // See also [`Timeline::datatype`] - let timeline = match column.data_type().to_logical_type() { - DataType::Int64 => Timeline::new_sequence(name), - DataType::Timestamp(TimeUnit::Nanosecond, None) => Timeline::new_temporal(name), - _ => { - return Err(DataTableError::NotATimeColumn { - name: name.into(), - datatype: column.data_type().clone(), - }) - } - }; - - // NOTE: unwrap cannot fail here, datatype checked above - #[allow(clippy::unwrap_used)] - let col_time = column - .as_any() - .downcast_ref::>() - .unwrap(); - let col_time: TimeOptVec = col_time.into_iter().map(|time| time.copied()).collect(); - - Ok((timeline, col_time)) - } - - /// Deserializes a sparse data column. - fn deserialize_data_column( - component: ComponentName, - column: &dyn Array, - ) -> DataTableResult { - re_tracing::profile_function!(); - Ok(DataCellColumn( - column - .as_any() - .downcast_ref::>() - .ok_or(DataTableError::NotAColumn(component.to_string()))? - .iter() - // TODO(#3741): Schema metadata gets cloned in every single array. - // This'll become a problem as soon as we enable batching. - .map(|array| array.map(|values| DataCell::from_arrow(component, values))) - .collect(), - )) - } -} - -// --- - -impl DataTable { - /// Deserializes the contents of an [`ArrowMsg`] into a `DataTable`. - #[inline] - pub fn from_arrow_msg(msg: &ArrowMsg) -> DataTableResult { - let ArrowMsg { - table_id, - timepoint_max: _, - schema, - chunk, - on_release: _, - } = msg; - - Self::deserialize(*table_id, schema, chunk) - } - - /// Serializes the contents of a `DataTable` into an [`ArrowMsg`]. - // - // TODO(#1760): support serializing the cell size itself, so it can be computed on the clients. - #[inline] - pub fn to_arrow_msg(&self) -> DataTableResult { - let timepoint_max = self.timepoint_max(); - let (schema, chunk) = self.serialize()?; - - Ok(ArrowMsg { - table_id: self.table_id, - timepoint_max, - schema, - chunk, - on_release: None, - }) - } -} - -// --- - -impl std::fmt::Display for DataTable { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let (schema, columns) = self.serialize().map_err(|err| { - re_log::error_once!("couldn't display data table: {err}"); - std::fmt::Error - })?; - re_format_arrow::format_dataframe( - schema.metadata.clone(), - schema.fields.clone(), - columns.columns().iter().map(|x| x.as_ref()), - ) - .fmt(f) - } -} - -impl DataTable { - /// Checks whether two [`DataTable`]s are _similar_, i.e. 
not equal on a byte-level but - /// functionally equivalent. - /// - /// Returns `Ok(())` if they match, or an error containing a detailed diff otherwise. - pub fn similar(table1: &Self, table2: &Self) -> anyhow::Result<()> { - /// Given a [`DataTable`], returns all of its rows grouped by timeline. - fn compute_rows(table: &DataTable) -> anyhow::Result>> { - let mut rows_by_timeline: HashMap> = Default::default(); - - for row in table.to_rows() { - let row = row?; - for (&timeline, &time) in row.timepoint.iter() { - let mut row = row.clone(); - row.timepoint = TimePoint::from([(timeline, time)]); - rows_by_timeline.entry(timeline).or_default().push(row); - } - } - Ok(rows_by_timeline) - } - - let mut rows_by_timeline1 = compute_rows(table1)?; - let mut rows_by_timeline2 = compute_rows(table2)?; - - for timeline1 in rows_by_timeline1.keys() { - anyhow::ensure!( - rows_by_timeline2.contains_key(timeline1), - "timeline {timeline1:?} was present in the first rrd file but not in the second", - ); - } - for timeline2 in rows_by_timeline2.keys() { - anyhow::ensure!( - rows_by_timeline1.contains_key(timeline2), - "timeline {timeline2:?} was present in the second rrd file but not in the first", - ); - } - - // NOTE: Can't compare `log_time`, by definition. - rows_by_timeline1.remove(&Timeline::log_time()); - rows_by_timeline2.remove(&Timeline::log_time()); - - for (timeline, rows1) in &mut rows_by_timeline1 { - #[allow(clippy::unwrap_used)] // safe, the keys are checked above - let rows2 = rows_by_timeline2.get_mut(timeline).unwrap(); - - // NOTE: We need both sets of rows to follow a common natural order for the comparison - // to make sense. - rows1.sort_by_key(|row| (row.timepoint.clone(), row.row_id)); - rows2.sort_by_key(|row| (row.timepoint.clone(), row.row_id)); - - anyhow::ensure!( - rows1.len() == rows2.len(), - "rrd files yielded different number of datastore rows for timeline {timeline:?}: {} vs. 
{}", - rows1.len(), - rows2.len() - ); - - for (ri, (row1, row2)) in rows1.iter().zip(rows2).enumerate() { - let DataRow { - row_id: _, - timepoint: timepoint1, - entity_path: entity_path1, - cells: ref cells1, - } = row1; - let DataRow { - row_id: _, - timepoint: timepoint2, - entity_path: entity_path2, - cells: ref cells2, - } = row2; - - for (c1, c2) in izip!(&cells1.0, &cells2.0) { - if c1 != c2 { - anyhow::ensure!( - c1.datatype() == c2.datatype(), - "Found discrepancy in row #{ri}: cells' datatypes don't match!\n{}", - similar_asserts::SimpleDiff::from_str( - &format!("{:?}:{:?}", c1.component_name(), c1.datatype()), - &format!("{:?}:{:?}", c2.component_name(), c2.datatype()), - "cell1", - "cell2" - ) - ); - - let arr1 = c1.as_arrow_ref(); - let arr2 = c2.as_arrow_ref(); - - if let (Some(arr1), Some(arr2)) = ( - arr1.as_any().downcast_ref::(), - arr2.as_any().downcast_ref::(), - ) { - anyhow::ensure!( - arr1.validity() == arr2.validity(), - "Found discrepancy in row #{ri}: union arrays' validity bitmaps don't match!\n{}\n{}", - similar_asserts::SimpleDiff::from_str(&row1.to_string(), &row2.to_string(), "row1", "row2"), - similar_asserts::SimpleDiff::from_str( - &format!("{:?}", arr1.validity()), - &format!("{:?}", arr2.validity()), - "cell1", - "cell2" - ) - ); - anyhow::ensure!( - arr1.types() == arr2.types(), - "Found discrepancy in row #{ri}: union arrays' type indices don't match!\n{}\n{}", - similar_asserts::SimpleDiff::from_str(&row1.to_string(), &row2.to_string(), "row1", "row2"), - similar_asserts::SimpleDiff::from_str( - &format!("{:?}", arr1.types()), - &format!("{:?}", arr2.types()), - "cell1", - "cell2" - ) - ); - anyhow::ensure!( - arr1.offsets() == arr2.offsets(), - "Found discrepancy in row #{ri}: union arrays' offsets don't match!\n{}\n{}", - similar_asserts::SimpleDiff::from_str(&row1.to_string(), &row2.to_string(), "row1", "row2"), - similar_asserts::SimpleDiff::from_str( - &format!("{:?}", arr1.offsets()), - &format!("{:?}", arr2.offsets()), - "cell1", - "cell2" - ) - ); - } - } - } - - let mut size_mismatches = vec![]; - for (c1, c2) in izip!(&cells1.0, &cells2.0) { - if c1.total_size_bytes() != c2.total_size_bytes() { - size_mismatches.push(format!( - "Sizes don't match! {} ({}) vs. {} ({}) bytes. 
Perhaps the validity differs?", - c1.total_size_bytes(), - c1.component_name(), - c2.total_size_bytes(), - c2.component_name(), - )); - - fn cell_to_bytes(cell: DataCell) -> anyhow::Result> { - let row = DataRow::from_cells1( - RowId::ZERO, - "cell", - TimePoint::default(), - cell, - )?; - let table = DataTable::from_rows(TableId::ZERO, [row]); - - let msg = table.to_arrow_msg()?; - - use arrow2::io::ipc::write::StreamWriter; - let mut buf = Vec::::new(); - let mut writer = StreamWriter::new(&mut buf, Default::default()); - writer.start(&msg.schema, None)?; - writer.write(&msg.chunk, None)?; - writer.finish()?; - - Ok(buf) - } - - let c1_bytes = cell_to_bytes(c1.clone())?; - let c2_bytes = cell_to_bytes(c2.clone())?; - - size_mismatches.push(format!( - "IPC size is {} vs {} bytes", - c1_bytes.len(), - c2_bytes.len() - )); - - if c1_bytes.len().max(c2_bytes.len()) < 300 { - size_mismatches.push( - similar_asserts::SimpleDiff::from_str( - &format!("{c1_bytes:#?}"), - &format!("{c2_bytes:#?}"), - "cell1_ipc", - "cell2_ipc", - ) - .to_string(), - ); - } - } - } - - anyhow::ensure!( - timepoint1 == timepoint2 && entity_path1 == entity_path2 && cells1 == cells2, - "Found discrepancy in row #{ri}:\n{}\n{}\ - \n\nrow1:\n{row1} - \n\nrow2:\n{row2}", - similar_asserts::SimpleDiff::from_str( - &row1.to_string(), - &row2.to_string(), - "row1", - "row2" - ), - size_mismatches.join("\n"), - ); - } - } - - Ok(()) - } -} - -// --- - -/// Crafts a simple but interesting [`DataTable`]. -#[cfg(not(target_arch = "wasm32"))] -impl DataTable { - pub fn example(timeless: bool) -> Self { - // NOTE: because everything here is predetermined and there is no input we assume it's safe here - #![allow(clippy::unwrap_used)] - use crate::{ - example_components::{MyColor, MyLabel, MyPoint}, - Time, - }; - - let table_id = TableId::new(); - - let mut tick = 0i64; - let mut timepoint = |frame_nr: i64| { - let mut tp = TimePoint::default(); - if !timeless { - tp.insert(Timeline::log_time(), Time::now()); - tp.insert(Timeline::log_tick(), tick); - tp.insert(Timeline::new_sequence("frame_nr"), frame_nr); - } - tick += 1; - tp - }; - - let row0 = { - let positions: &[MyPoint] = &[MyPoint::new(10.0, 10.0), MyPoint::new(20.0, 20.0)]; - let colors: &[_] = &[MyColor(0x8080_80FF)]; - let labels: &[MyLabel] = &[]; - - DataRow::from_cells3(RowId::new(), "a", timepoint(1), (positions, colors, labels)) - .unwrap() - }; - - let row1 = { - let colors: &[MyColor] = &[]; - - DataRow::from_cells1(RowId::new(), "b", timepoint(1), colors).unwrap() - }; - - let row2 = { - let colors: &[_] = &[MyColor(0xFFFF_FFFF)]; - let labels: &[_] = &[MyLabel("hey".into())]; - - DataRow::from_cells2(RowId::new(), "c", timepoint(2), (colors, labels)).unwrap() - }; - - let mut table = Self::from_rows(table_id, [row0, row1, row2]); - table.compute_all_size_bytes(); - - table - } -} diff --git a/crates/re_log_types/src/lib.rs b/crates/re_log_types/src/lib.rs index 5475389f50e4..291b2dcd0665 100644 --- a/crates/re_log_types/src/lib.rs +++ b/crates/re_log_types/src/lib.rs @@ -23,9 +23,9 @@ pub mod hash; pub mod path; pub mod time_point; -mod data_cell; -mod data_row; -mod data_table; +// mod data_cell; +// mod data_row; +// mod data_table; mod instance; mod resolved_time_range; mod time; @@ -37,16 +37,6 @@ use std::sync::Arc; use re_build_info::CrateVersion; pub use self::arrow_msg::{ArrowChunkReleaseCallback, ArrowMsg}; -pub use self::data_cell::{DataCell, DataCellError, DataCellInner, DataCellResult}; -pub use self::data_row::{ - DataCellRow, DataCellVec, 
DataReadError, DataReadResult, DataRow, DataRowError, DataRowResult, - RowId, -}; -pub use self::data_table::{ - DataCellColumn, DataCellOptVec, DataTable, DataTableError, DataTableResult, EntityPathVec, - ErasedTimeVec, RowIdVec, TableId, TimePointVec, METADATA_KIND, METADATA_KIND_CONTROL, - METADATA_KIND_DATA, -}; pub use self::instance::Instance; pub use self::path::*; pub use self::resolved_time_range::{ResolvedTimeRange, ResolvedTimeRangeF}; @@ -275,6 +265,8 @@ pub enum LogMsg { SetStoreInfo(SetStoreInfo), /// Log an entity using an [`ArrowMsg`]. + // + // TODO(#6574): the store ID should be in the metadata here so we can remove the layer on top ArrowMsg(StoreId, ArrowMsg), /// Send after all messages in a blueprint to signal that the blueprint is complete. @@ -322,7 +314,14 @@ impl_into_enum!( #[derive(Clone, Debug, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] pub struct SetStoreInfo { - pub row_id: RowId, + /// A time-based UID that is only used to help keep track of when these `StoreInfo` originated + /// and how they fit in the global ordering of events. + // + // NOTE: Using a raw `Tuid` instead of an actual `RowId` to prevent a nasty dependency cycle. + // Note that both using a `RowId` as well as this whole serde/msgpack layer as a whole are hacks + // that are destined to disappear anyhow as we are closing in on our network-exposed data APIs. + pub row_id: re_tuid::Tuid, + pub info: StoreInfo, } diff --git a/crates/re_log_types/src/time_point/time_int.rs b/crates/re_log_types/src/time_point/time_int.rs index 72cbf5daf660..bc69e804597d 100644 --- a/crates/re_log_types/src/time_point/time_int.rs +++ b/crates/re_log_types/src/time_point/time_int.rs @@ -31,7 +31,7 @@ impl std::fmt::Debug for TimeInt { .debug_tuple("TimeInt::MAX") .field(&NonMinI64::MAX) .finish(), - Some(t) => f.debug_tuple("TimeInt").field(&t).finish(), + Some(t) => f.write_fmt(format_args!("TimeInt({})", re_format::format_int(t.get()))), None => f.debug_tuple("TimeInt::STATIC").finish(), } } diff --git a/crates/re_query/Cargo.toml b/crates/re_query/Cargo.toml index ccaf5dfbb0de..ba50a64e7db3 100644 --- a/crates/re_query/Cargo.toml +++ b/crates/re_query/Cargo.toml @@ -28,7 +28,8 @@ codegen = [] [dependencies] # Rerun dependencies: -re_data_store.workspace = true +re_chunk.workspace = true +re_chunk_store.workspace = true re_error.workspace = true re_format.workspace = true re_log.workspace = true @@ -63,12 +64,12 @@ similar-asserts.workspace = true [lib] bench = false + [[bin]] name = "clamped_zip" required-features = ["codegen"] bench = false - [[bin]] name = "range_zip" required-features = ["codegen"] diff --git a/crates/re_query/benches/latest_at.rs b/crates/re_query/benches/latest_at.rs index 66d68fffdb13..52e5dc675a8f 100644 --- a/crates/re_query/benches/latest_at.rs +++ b/crates/re_query/benches/latest_at.rs @@ -1,14 +1,14 @@ -//! Contains: -//! - A 1:1 port of the benchmarks in `crates/re_query/benches/query_benchmarks.rs`, with caching enabled. 
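The rewritten benchmark below no longer assembles `DataRow`s one at a time; it builds one `Chunk` per entity and hands it to a `ChunkStore`. A minimal sketch of that ingestion pattern, assuming only the `Chunk::builder` and `ChunkStore::insert_chunk` APIs that appear in this diff:

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId};
use re_chunk_store::ChunkStore;
use re_log_types::{build_frame_nr, example_components::MyPoint, StoreId, StoreKind};
use re_types::ComponentBatch;

fn ingest() -> anyhow::Result<ChunkStore> {
    let mut store = ChunkStore::new(StoreId::random(StoreKind::Recording), Default::default());

    // One chunk per entity path; each `with_component_batches` call adds one row.
    let chunk = Chunk::builder("points".into())
        .with_component_batches(
            RowId::new(),
            [build_frame_nr(1)],
            [&[MyPoint::new(1.0, 2.0)] as &dyn ComponentBatch],
        )
        .build()?;

    // The store takes shared ownership of fully-built, immutable chunks.
    store.insert_chunk(&Arc::new(chunk))?;
    Ok(store)
}
```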
- // Allow unwrap() in benchmarks #![allow(clippy::unwrap_used)] -use criterion::{criterion_group, criterion_main, Criterion}; +use std::sync::Arc; +use criterion::{criterion_group, criterion_main, Criterion}; use itertools::Itertools; -use re_data_store::{DataStore, LatestAtQuery, StoreSubscriber}; -use re_log_types::{entity_path, DataRow, EntityPath, RowId, TimeInt, TimeType, Timeline}; + +use re_chunk::{Chunk, RowId}; +use re_chunk_store::{ChunkStore, ChunkStoreSubscriber, LatestAtQuery}; +use re_log_types::{entity_path, EntityPath, TimeInt, TimeType, Timeline}; use re_query::{clamped_zip_1x1, PromiseResolver}; use re_query::{Caches, LatestAtResults}; use re_types::{ @@ -61,7 +61,7 @@ fn mono_points(c: &mut Criterion) { let paths = (0..NUM_POINTS) .map(move |point_idx| entity_path!("points", point_idx)) .collect_vec(); - let msgs = build_points_rows(&paths, 1); + let msgs = build_points_chunks(&paths, 1); { let mut group = c.benchmark_group("arrow_mono_points2"); @@ -71,14 +71,14 @@ fn mono_points(c: &mut Criterion) { (NUM_POINTS * NUM_FRAMES_POINTS) as _, )); group.bench_function("insert", |b| { - b.iter(|| insert_rows(msgs.iter())); + b.iter(|| insert_chunks(msgs.iter())); }); } { let mut group = c.benchmark_group("arrow_mono_points2"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); - let (caches, store) = insert_rows(msgs.iter()); + let (caches, store) = insert_chunks(msgs.iter()); group.bench_function("query", |b| { b.iter(|| query_and_visit_points(&caches, &store, &paths)); }); @@ -90,7 +90,7 @@ fn mono_strings(c: &mut Criterion) { let paths = (0..NUM_STRINGS) .map(move |string_idx| entity_path!("strings", string_idx)) .collect_vec(); - let msgs = build_strings_rows(&paths, 1); + let msgs = build_strings_chunks(&paths, 1); { let mut group = c.benchmark_group("arrow_mono_strings2"); @@ -99,14 +99,14 @@ fn mono_strings(c: &mut Criterion) { (NUM_STRINGS * NUM_FRAMES_STRINGS) as _, )); group.bench_function("insert", |b| { - b.iter(|| insert_rows(msgs.iter())); + b.iter(|| insert_chunks(msgs.iter())); }); } { let mut group = c.benchmark_group("arrow_mono_strings2"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); - let (caches, store) = insert_rows(msgs.iter()); + let (caches, store) = insert_chunks(msgs.iter()); group.bench_function("query", |b| { b.iter(|| query_and_visit_strings(&caches, &store, &paths)); }); @@ -116,7 +116,7 @@ fn mono_strings(c: &mut Criterion) { fn batch_points(c: &mut Criterion) { // Batch points are logged together at a single path let paths = [EntityPath::from("points")]; - let msgs = build_points_rows(&paths, NUM_POINTS as _); + let msgs = build_points_chunks(&paths, NUM_POINTS as _); { let mut group = c.benchmark_group("arrow_batch_points2"); @@ -124,14 +124,14 @@ fn batch_points(c: &mut Criterion) { (NUM_POINTS * NUM_FRAMES_POINTS) as _, )); group.bench_function("insert", |b| { - b.iter(|| insert_rows(msgs.iter())); + b.iter(|| insert_chunks(msgs.iter())); }); } { let mut group = c.benchmark_group("arrow_batch_points2"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); - let (caches, store) = insert_rows(msgs.iter()); + let (caches, store) = insert_chunks(msgs.iter()); group.bench_function("query", |b| { b.iter(|| query_and_visit_points(&caches, &store, &paths)); }); @@ -141,7 +141,7 @@ fn batch_points(c: &mut Criterion) { fn batch_strings(c: &mut Criterion) { // Batch strings are logged together at a single path let paths = [EntityPath::from("points")]; - let msgs = build_strings_rows(&paths, 
NUM_STRINGS as _); + let msgs = build_strings_chunks(&paths, NUM_STRINGS as _); { let mut group = c.benchmark_group("arrow_batch_strings2"); @@ -149,14 +149,14 @@ fn batch_strings(c: &mut Criterion) { (NUM_STRINGS * NUM_FRAMES_STRINGS) as _, )); group.bench_function("insert", |b| { - b.iter(|| insert_rows(msgs.iter())); + b.iter(|| insert_chunks(msgs.iter())); }); } { let mut group = c.benchmark_group("arrow_batch_strings2"); group.throughput(criterion::Throughput::Elements(NUM_POINTS as _)); - let (caches, store) = insert_rows(msgs.iter()); + let (caches, store) = insert_chunks(msgs.iter()); group.bench_function("query", |b| { b.iter(|| query_and_visit_strings(&caches, &store, &paths)); }); @@ -201,70 +201,60 @@ pub fn build_some_strings(len: usize) -> Vec { .collect() } -fn build_points_rows(paths: &[EntityPath], num_points: usize) -> Vec { - (0..NUM_FRAMES_POINTS) - .flat_map(move |frame_idx| { - paths.iter().map(move |path| { - let mut row = DataRow::from_cells2( +fn build_points_chunks(paths: &[EntityPath], num_points: usize) -> Vec> { + paths + .iter() + .map(|path| { + let mut builder = Chunk::builder(path.clone()); + for frame_idx in 0..NUM_FRAMES_POINTS { + builder = builder.with_component_batches( RowId::new(), - path.clone(), [build_frame_nr((frame_idx as i64).try_into().unwrap())], - ( - build_some_point2d(num_points), - build_some_colors(num_points), - ), - ) - .unwrap(); - // NOTE: Using unsized cells will crash in debug mode, and benchmarks are run for 1 iteration, - // in debug mode, by the standard test harness. - if cfg!(debug_assertions) { - row.compute_all_size_bytes(); - } - row - }) + [ + &build_some_point2d(num_points) as _, + &build_some_colors(num_points) as _, + ], + ); + } + Arc::new(builder.build().unwrap()) }) .collect() } -fn build_strings_rows(paths: &[EntityPath], num_strings: usize) -> Vec { - (0..NUM_FRAMES_STRINGS) - .flat_map(move |frame_idx| { - paths.iter().map(move |path| { - let mut row = DataRow::from_cells2( +fn build_strings_chunks(paths: &[EntityPath], num_strings: usize) -> Vec> { + paths + .iter() + .map(|path| { + let mut builder = Chunk::builder(path.clone()); + for frame_idx in 0..NUM_FRAMES_POINTS { + builder = builder.with_component_batches( RowId::new(), - path.clone(), [build_frame_nr((frame_idx as i64).try_into().unwrap())], - // We still need to create points because they are the primary for the - // archetype query we want to do. We won't actually deserialize the points - // during the query -- we just need it for the primary keys. - // TODO(jleibs): switch this to use `TextEntry` once the new type has - // landed. - ( - build_some_point2d(num_strings), - build_some_strings(num_strings), - ), - ) - .unwrap(); - // NOTE: Using unsized cells will crash in debug mode, and benchmarks are run for 1 iteration, - // in debug mode, by the standard test harness. - if cfg!(debug_assertions) { - row.compute_all_size_bytes(); - } - row - }) + [ + // We still need to create points because they are the primary for the + // archetype query we want to do. We won't actually deserialize the points + // during the query -- we just need it for the primary keys. + // TODO(jleibs): switch this to use `TextEntry` once the new type has + // landed. 
+ &build_some_point2d(num_strings) as _, + &build_some_strings(num_strings) as _, + ], + ); + } + Arc::new(builder.build().unwrap()) }) .collect() } -fn insert_rows<'a>(msgs: impl Iterator) -> (Caches, DataStore) { - let mut store = DataStore::new( +fn insert_chunks<'a>(msgs: impl Iterator>) -> (Caches, ChunkStore) { + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); let mut caches = Caches::new(&store); - msgs.for_each(|row| { - caches.on_events(&[store.insert_row(row).unwrap()]); + msgs.for_each(|chunk| { + caches.on_events(&[store.insert_chunk(chunk).unwrap().unwrap()]); }); (caches, store) @@ -277,7 +267,7 @@ struct SavePoint { fn query_and_visit_points( caches: &Caches, - store: &DataStore, + store: &ChunkStore, paths: &[EntityPath], ) -> Vec { let resolver = PromiseResolver::default(); @@ -329,7 +319,7 @@ struct SaveString { fn query_and_visit_strings( caches: &Caches, - store: &DataStore, + store: &ChunkStore, paths: &[EntityPath], ) -> Vec { let resolver = PromiseResolver::default(); diff --git a/crates/re_query/examples/latest_at.rs b/crates/re_query/examples/latest_at.rs index ab122fce1150..9d29f77204c9 100644 --- a/crates/re_query/examples/latest_at.rs +++ b/crates/re_query/examples/latest_at.rs @@ -1,7 +1,12 @@ +use std::sync::Arc; + use itertools::Itertools; -use re_data_store::{DataStore, LatestAtQuery}; + +use re_chunk::{Chunk, RowId}; +use re_chunk_store::{ChunkStore, LatestAtQuery}; use re_log_types::example_components::{MyColor, MyLabel, MyPoint, MyPoints}; -use re_log_types::{build_frame_nr, DataRow, RowId, TimeType, Timeline}; +use re_log_types::{build_frame_nr, TimeType, Timeline}; +use re_types::ComponentBatch; use re_types_core::{Archetype as _, Loggable as _}; use re_query::{ @@ -12,7 +17,7 @@ use re_query::{ fn main() -> anyhow::Result<()> { let store = store()?; - eprintln!("store:\n{}", store.to_data_table()?); + eprintln!("store:\n{store}"); let resolver = PromiseResolver::default(); @@ -103,8 +108,8 @@ fn main() -> anyhow::Result<()> { // --- -fn store() -> anyhow::Result { - let mut store = DataStore::new( +fn store() -> anyhow::Result { + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -114,17 +119,19 @@ fn store() -> anyhow::Result { { let timepoint = [build_frame_nr(123)]; - let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, points)?; - store.insert_row(&row)?; - - let colors = vec![MyColor::from_rgb(255, 0, 0)]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, colors)?; - store.insert_row(&row)?; - - let labels = vec![MyLabel("a".into()), MyLabel("b".into())]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, labels)?; - store.insert_row(&row)?; + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches( + RowId::new(), + timepoint, + [ + &[MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)] as &dyn ComponentBatch, // + &[MyColor::from_rgb(255, 0, 0)], + &[MyLabel("a".into()), MyLabel("b".into())], + ], + ) + .build()?; + + store.insert_chunk(&Arc::new(chunk))?; } Ok(store) diff --git a/crates/re_query/examples/range.rs b/crates/re_query/examples/range.rs index ad5903e6946d..d37c984d0547 100644 --- a/crates/re_query/examples/range.rs +++ b/crates/re_query/examples/range.rs @@ -1,7 +1,11 @@ +use std::sync::Arc; + use itertools::Itertools; -use 
re_data_store::{DataStore, RangeQuery}; +use re_chunk::{Chunk, RowId}; +use re_chunk_store::{ChunkStore, RangeQuery}; use re_log_types::example_components::{MyColor, MyLabel, MyPoint, MyPoints}; -use re_log_types::{build_frame_nr, DataRow, ResolvedTimeRange, RowId, TimeType, Timeline}; +use re_log_types::{build_frame_nr, ResolvedTimeRange, TimeType, Timeline}; +use re_types::ComponentBatch; use re_types_core::{Archetype as _, Loggable as _}; use re_query::{ @@ -13,7 +17,7 @@ use re_query::{ fn main() -> anyhow::Result<()> { let store = store()?; - eprintln!("store:\n{}", store.to_data_table()?); + eprintln!("store:\n{store}"); let resolver = PromiseResolver::default(); @@ -106,8 +110,8 @@ fn main() -> anyhow::Result<()> { // --- -fn store() -> anyhow::Result { - let mut store = DataStore::new( +fn store() -> anyhow::Result { + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -117,33 +121,40 @@ fn store() -> anyhow::Result { { let timepoint = [build_frame_nr(123)]; - let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, points)?; - store.insert_row(&row)?; - - let colors = vec![MyColor::from_rgb(255, 0, 0)]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, colors)?; - store.insert_row(&row)?; - - let labels = vec![MyLabel("a".into()), MyLabel("b".into())]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, labels)?; - store.insert_row(&row)?; + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches( + RowId::new(), + timepoint, + [ + &[MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)] as &dyn ComponentBatch, // + &[MyColor::from_rgb(255, 0, 0)], + &[MyLabel("a".into()), MyLabel("b".into())], + ], + ) + .build()?; + + store.insert_chunk(&Arc::new(chunk))?; } { - let timepoint = [build_frame_nr(456)]; - - let colors = vec![MyColor::from_rgb(255, 0, 0), MyColor::from_rgb(0, 0, 255)]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, colors)?; - store.insert_row(&row)?; - - let points = vec![ - MyPoint::new(10.0, 20.0), - MyPoint::new(30.0, 40.0), - MyPoint::new(50.0, 60.0), - ]; - let row = DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, points)?; - store.insert_row(&row)?; + let timepoint = [build_frame_nr(423)]; + + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches( + RowId::new(), + timepoint, + [ + &[ + MyPoint::new(10.0, 20.0), + MyPoint::new(30.0, 40.0), + MyPoint::new(50.0, 60.0), + ] as &dyn ComponentBatch, // + &[MyColor::from_rgb(255, 0, 0), MyColor::from_rgb(0, 0, 255)], + ], + ) + .build()?; + + store.insert_chunk(&Arc::new(chunk))?; } Ok(store) diff --git a/crates/re_query/src/cache.rs b/crates/re_query/src/cache.rs index 7451a4d2d5b4..bd50a39e60a2 100644 --- a/crates/re_query/src/cache.rs +++ b/crates/re_query/src/cache.rs @@ -6,8 +6,8 @@ use std::{ use ahash::{HashMap, HashSet}; use parking_lot::RwLock; -use re_data_store::{DataStore, StoreDiff, StoreEvent, StoreSubscriber, TimeInt}; -use re_log_types::{EntityPath, ResolvedTimeRange, StoreId, Timeline}; +use re_chunk_store::{ChunkStore, ChunkStoreDiff, ChunkStoreEvent, ChunkStoreSubscriber}; +use re_log_types::{EntityPath, ResolvedTimeRange, StoreId, TimeInt, Timeline}; use re_types_core::ComponentName; use crate::{LatestAtCache, RangeCache}; @@ -67,7 +67,7 @@ impl CacheKey { } pub struct Caches { - /// The [`StoreId`] of the 
associated [`DataStore`]. + /// The [`StoreId`] of the associated [`ChunkStore`]. pub(crate) store_id: StoreId, // NOTE: `Arc` so we can cheaply free the top-level lock early when needed. @@ -127,7 +127,7 @@ impl std::fmt::Debug for Caches { impl Caches { #[inline] - pub fn new(store: &DataStore) -> Self { + pub fn new(store: &ChunkStore) -> Self { Self { store_id: store.id().clone(), latest_at_per_cache_key: Default::default(), @@ -148,7 +148,7 @@ impl Caches { } } -impl StoreSubscriber for Caches { +impl ChunkStoreSubscriber for Caches { #[inline] fn name(&self) -> String { "rerun.store_subscribers.QueryCache".into() @@ -164,7 +164,7 @@ impl StoreSubscriber for Caches { self } - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { re_tracing::profile_function!(format!("num_events={}", events.len())); #[derive(Default, Debug)] @@ -176,7 +176,7 @@ impl StoreSubscriber for Caches { let mut compacted = CompactedEvents::default(); for event in events { - let StoreEvent { + let ChunkStoreEvent { store_id, store_generation: _, event_id: _, @@ -190,30 +190,33 @@ impl StoreSubscriber for Caches { store_id, ); - let StoreDiff { + let ChunkStoreDiff { kind: _, // Don't care: both additions and deletions invalidate query results. - row_id: _, - times, - entity_path, - cells, + chunk, } = diff; { re_tracing::profile_scope!("compact events"); - if times.is_empty() { - for component_name in cells.keys() { + if chunk.is_static() { + for component_name in chunk.component_names() { compacted .static_ - .insert((entity_path.clone(), *component_name)); + .insert((chunk.entity_path().clone(), component_name)); } } - for &(timeline, data_time) in times { - for component_name in cells.keys() { - let key = CacheKey::new(entity_path.clone(), timeline, *component_name); - let data_times = compacted.temporal.entry(key).or_default(); - data_times.insert(data_time); + for (&timeline, time_chunk) in chunk.timelines() { + for data_time in time_chunk.times() { + for component_name in chunk.component_names() { + let key = CacheKey::new( + chunk.entity_path().clone(), + timeline, + component_name, + ); + let data_times = compacted.temporal.entry(key).or_default(); + data_times.insert(data_time); + } } } } diff --git a/crates/re_query/src/latest_at/helpers.rs b/crates/re_query/src/latest_at/helpers.rs index 5d501c43d166..c11956df7fbe 100644 --- a/crates/re_query/src/latest_at/helpers.rs +++ b/crates/re_query/src/latest_at/helpers.rs @@ -1,5 +1,6 @@ -use re_data_store::{DataStore, LatestAtQuery}; -use re_log_types::{EntityPath, RowId, TimeInt}; +use re_chunk::RowId; +use re_chunk_store::{ChunkStore, LatestAtQuery}; +use re_log_types::{EntityPath, TimeInt}; use re_types_core::Component; use re_types_core::{external::arrow2::array::Array, ComponentName}; @@ -48,7 +49,7 @@ impl LatestAtComponentResults { re_log::debug_once!("Couldn't get {component_name}: promise still pending"); None } - PromiseResult::Ready(cell) => Some(cell.to_arrow()), + PromiseResult::Ready(array) => Some(array), PromiseResult::Error(err) => { re_log::log_once!( level, @@ -67,7 +68,7 @@ impl LatestAtComponentResults { pub fn try_raw(&self, resolver: &PromiseResolver) -> Option> { match self.resolved(resolver) { PromiseResult::Pending | PromiseResult::Error(_) => None, - PromiseResult::Ready(cell) => Some(cell.to_arrow()), + PromiseResult::Ready(array) => Some(array), } } @@ -142,12 +143,12 @@ impl LatestAtComponentResults { re_log::debug_once!("Couldn't get {component_name}: promise still pending"); 
None } - PromiseResult::Ready(cell) => { - match cell.as_arrow_ref().len() { + PromiseResult::Ready(array) => { + match array.len() { 0 => { None // Empty list = no data. } - 1 => Some(cell.as_arrow_ref().sliced(0, 1)), + 1 => Some(array.sliced(0, 1)), len => { re_log::log_once!( level, @@ -271,8 +272,8 @@ impl LatestAtComponentResults { None } - PromiseResult::Ready(cell) => { - let len = cell.num_instances() as usize; + PromiseResult::Ready(array) => { + let len = array.len(); // TODO(#5259): Figure out if/how we'd like to integrate clamping semantics into the // selection panel. @@ -282,7 +283,7 @@ impl LatestAtComponentResults { let index = usize::min(index, len.saturating_sub(1)); if len > index { - Some(cell.as_arrow_ref().sliced(index, 1)) + Some(array.sliced(index, 1)) } else { re_log::log_once!( level, @@ -349,7 +350,7 @@ impl Caches { // TODO(#5607): what should happen if the promise is still pending? pub fn latest_at_component_with_log_level( &self, - store: &DataStore, + store: &ChunkStore, resolver: &PromiseResolver, entity_path: &EntityPath, query: &LatestAtQuery, @@ -412,7 +413,7 @@ impl Caches { #[inline] pub fn latest_at_component( &self, - store: &DataStore, + store: &ChunkStore, resolver: &PromiseResolver, entity_path: &EntityPath, query: &LatestAtQuery, @@ -437,7 +438,7 @@ impl Caches { #[inline] pub fn latest_at_component_quiet( &self, - store: &DataStore, + store: &ChunkStore, resolver: &PromiseResolver, entity_path: &EntityPath, query: &LatestAtQuery, @@ -454,7 +455,7 @@ impl Caches { /// Call [`Self::latest_at_component`] at the given path, walking up the hierarchy until an instance is found. pub fn latest_at_component_at_closest_ancestor( &self, - store: &DataStore, + store: &ChunkStore, resolver: &PromiseResolver, entity_path: &EntityPath, query: &LatestAtQuery, diff --git a/crates/re_query/src/latest_at/mod.rs b/crates/re_query/src/latest_at/mod.rs index b99f9ad6922b..db3d380f6881 100644 --- a/crates/re_query/src/latest_at/mod.rs +++ b/crates/re_query/src/latest_at/mod.rs @@ -3,5 +3,5 @@ mod query; mod results; pub use self::helpers::LatestAtMonoResult; -pub use self::query::LatestAtCache; +pub use self::query::{latest_at, LatestAtCache}; pub use self::results::{LatestAtComponentResults, LatestAtResults}; diff --git a/crates/re_query/src/latest_at/query.rs b/crates/re_query/src/latest_at/query.rs index eae9a56b6a02..43fe7249639b 100644 --- a/crates/re_query/src/latest_at/query.rs +++ b/crates/re_query/src/latest_at/query.rs @@ -1,15 +1,17 @@ use std::collections::BTreeSet; use std::{collections::BTreeMap, sync::Arc}; +use arrow2::array::Array as ArrowArray; use indexmap::IndexMap; use itertools::Itertools; use parking_lot::RwLock; -use re_data_store::{DataStore, LatestAtQuery, TimeInt}; -use re_log_types::{EntityPath, RowId}; +use re_chunk::RowId; +use re_chunk_store::{ChunkStore, LatestAtQuery, TimeInt}; +use re_log_types::EntityPath; use re_types_core::{components::ClearIsRecursive, ComponentName, Loggable as _, SizeBytes}; -use crate::{CacheKey, Caches, LatestAtComponentResults, LatestAtResults, Promise}; +use crate::{CacheKey, Caches, LatestAtComponentResults, LatestAtResults}; // --- @@ -38,7 +40,7 @@ impl Caches { /// This is a cached API -- data will be lazily cached upon access. 
pub fn latest_at( &self, - store: &DataStore, + store: &ChunkStore, query: &LatestAtQuery, entity_path: &EntityPath, component_names: impl IntoIterator, @@ -285,11 +287,34 @@ impl SizeBytes for LatestAtCache { } } +/// Implements the complete end-to-end latest-at logic: +/// * Find all applicable `Chunk`s +/// * Apply a latest-at filter to all of them +/// * Keep the one row with the most recent `RowId` +pub fn latest_at( + store: &ChunkStore, + query: &LatestAtQuery, + entity_path: &EntityPath, + component_name: ComponentName, +) -> Option<(TimeInt, RowId, Box)> { + store + .latest_at_relevant_chunks(query, entity_path, component_name) + .into_iter() + .flat_map(|chunk| { + chunk + .latest_at(query, component_name) + .iter_rows(&query.timeline(), &component_name) + .collect_vec() + }) + .max_by_key(|(data_time, row_id, _)| (*data_time, *row_id)) + .and_then(|(data_time, row_id, array)| array.map(|array| (data_time, row_id, array))) +} + impl LatestAtCache { /// Queries cached latest-at data for a single component. pub fn latest_at( &mut self, - store: &DataStore, + store: &ChunkStore, query: &LatestAtQuery, entity_path: &EntityPath, component_name: ComponentName, @@ -312,19 +337,22 @@ impl LatestAtCache { std::collections::btree_map::Entry::Vacant(entry) => entry, }; - let result = store.latest_at(query, entity_path, component_name, &[component_name]); + if let Some((data_time, row_id, array)) = + latest_at(store, query, entity_path, component_name) + { + let result_data_time = data_time; + let result_row_id = row_id; + let result_component_batch = array; - // NOTE: cannot `result.and_then(...)` or borrowck gets lost. - if let Some((data_time, row_id, mut cells)) = result { // Fast path: we've run the query and realized that we already have the data for the resulting // _data_ time, so let's use that to avoid join & deserialization costs. - if let Some(data_time_bucket_at_data_time) = per_data_time.get(&data_time) { + if let Some(data_time_bucket_at_data_time) = per_data_time.get(&result_data_time) { query_time_bucket_at_query_time.insert(Arc::clone(data_time_bucket_at_data_time)); // We now know for a fact that a query at that data time would yield the same // results: copy the bucket accordingly so that the next cache hit for that query // time ends up taking the fastest path. 
- let query_time_bucket_at_data_time = per_query_time.entry(data_time); + let query_time_bucket_at_data_time = per_query_time.entry(result_data_time); query_time_bucket_at_data_time .and_modify(|v| *v = Arc::clone(data_time_bucket_at_data_time)) .or_insert(Arc::clone(data_time_bucket_at_data_time)); @@ -332,17 +360,9 @@ impl LatestAtCache { return Some(Arc::clone(data_time_bucket_at_data_time)); } - // Soundness: - // * `cells[0]` is guaranteed to exist since we passed in `&[component_name]` - // * `cells[0]` is guaranteed to be non-null, otherwise this whole result would be null - let Some(cell) = cells[0].take() else { - debug_assert!(cells[0].is_some(), "unreachable: `cells[0]` is missing"); - return None; - }; - let bucket = Arc::new(LatestAtComponentResults { - index: (data_time, row_id), - promise: Some(Promise::new(cell)), + index: (result_data_time, result_row_id), + value: Some((component_name, result_component_batch)), cached_dense: Default::default(), }); @@ -351,7 +371,7 @@ impl LatestAtCache { let query_time_bucket_at_query_time = query_time_bucket_at_query_time.insert(Arc::clone(&bucket)); - let data_time_bucket_at_data_time = per_data_time.entry(data_time); + let data_time_bucket_at_data_time = per_data_time.entry(result_data_time); data_time_bucket_at_data_time .and_modify(|v| *v = Arc::clone(query_time_bucket_at_query_time)) .or_insert(Arc::clone(query_time_bucket_at_query_time)); diff --git a/crates/re_query/src/latest_at/results.rs b/crates/re_query/src/latest_at/results.rs index edca11466af9..db8e8e914435 100644 --- a/crates/re_query/src/latest_at/results.rs +++ b/crates/re_query/src/latest_at/results.rs @@ -1,13 +1,13 @@ use std::sync::{Arc, OnceLock}; +use arrow2::array::Array as ArrowArray; use nohash_hasher::IntMap; -use re_log_types::{DataCell, RowId, TimeInt}; -use re_types_core::{Component, ComponentName, DeserializationError, SizeBytes}; +use re_chunk::RowId; +use re_log_types::TimeInt; +use re_types_core::{Component, ComponentName, SizeBytes}; -use crate::{ - ErasedFlatVecDeque, FlatVecDeque, Promise, PromiseResolver, PromiseResult, QueryError, -}; +use crate::{ErasedFlatVecDeque, FlatVecDeque, PromiseResolver, PromiseResult, QueryError}; // --- @@ -139,7 +139,7 @@ pub struct LatestAtComponentResults { pub(crate) index: (TimeInt, RowId), // Option so we can have a constant default value for `Self`. - pub(crate) promise: Option, + pub(crate) value: Option<(ComponentName, Box)>, /// The resolved, converted, deserialized dense data. pub(crate) cached_dense: OnceLock>, @@ -150,7 +150,7 @@ impl LatestAtComponentResults { pub const fn empty() -> Self { Self { index: (TimeInt::STATIC, RowId::ZERO), - promise: None, + value: None, cached_dense: OnceLock::new(), } } @@ -158,10 +158,13 @@ impl LatestAtComponentResults { /// Returns the [`ComponentName`] of the resolved data, if available. #[inline] pub fn component_name(&self, resolver: &PromiseResolver) -> Option { - match self.resolved(resolver) { - PromiseResult::Ready(cell) => Some(cell.component_name()), - _ => None, - } + // TODO(cmc): completely get rid of the promise resolving stuff once we start exposing + // chunks all the way. + _ = resolver; + + self.value + .as_ref() + .map(|(component_name, _)| *component_name) } /// Returns whether the resolved data is static. 
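The standalone `latest_at` helper introduced in `latest_at/query.rs` above is what both the cache and the range bridging logic now call into: it collects the relevant chunks, latest-at filters each one, and keeps the single row with the greatest `(data_time, row_id)`. A rough usage sketch from inside `re_query` (the function is only re-exported `pub(crate)` at the crate root), assuming the APIs shown in this diff:

```rust
use re_chunk_store::{ChunkStore, LatestAtQuery, TimeInt};
use re_log_types::{example_components::MyColor, EntityPath, Timeline};
use re_types_core::Loggable as _;

fn latest_color(store: &ChunkStore, entity_path: &EntityPath) -> Option<Vec<MyColor>> {
    let query = LatestAtQuery::new(
        Timeline::new_sequence("frame_nr"),
        TimeInt::new_temporal(123),
    );

    // All relevant chunks are narrowed down to a single row: the most recent one.
    let (_data_time, _row_id, array) =
        crate::latest_at(store, &query, entity_path, MyColor::name())?;

    // The caller gets a raw arrow array back and deserializes it itself.
    MyColor::from_arrow(&*array).ok()
}
```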
@@ -191,7 +194,7 @@ impl SizeBytes for LatestAtComponentResults { fn heap_size_bytes(&self) -> u64 { let Self { index, - promise, + value: promise, cached_dense, } = self; @@ -207,7 +210,7 @@ impl std::fmt::Debug for LatestAtComponentResults { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let Self { index, - promise: _, + value: _, cached_dense: _, // we can't, we don't know the type } = self; @@ -228,9 +231,13 @@ impl LatestAtComponentResults { /// Returns the raw resolved data, if it's ready. #[inline] - pub fn resolved(&self, resolver: &PromiseResolver) -> PromiseResult { - if let Some(cell) = self.promise.as_ref() { - resolver.resolve(cell) + pub fn resolved(&self, resolver: &PromiseResolver) -> PromiseResult> { + // TODO(cmc): completely get rid of the promise resolving stuff once we start exposing + // chunks all the way. + _ = resolver; + + if let Some((_, value)) = self.value.as_ref() { + PromiseResult::Ready(value.clone()) } else { PromiseResult::Pending } @@ -247,10 +254,12 @@ impl LatestAtComponentResults { &self, resolver: &PromiseResolver, ) -> PromiseResult> { - if let Some(cell) = self.promise.as_ref() { - resolver - .resolve(cell) - .map(|cell| self.downcast_dense::(&cell)) + // TODO(cmc): completely get rid of the promise resolving stuff once we start exposing + // chunks all the way. + _ = resolver; + + if let Some((_, value)) = self.value.as_ref() { + PromiseResult::Ready(self.downcast_dense(&**value)) } else { // Manufactured empty result. PromiseResult::Ready(Ok(&[])) @@ -274,7 +283,7 @@ impl LatestAtComponentResults { } impl LatestAtComponentResults { - fn downcast_dense(&self, cell: &DataCell) -> crate::Result<&[C]> { + fn downcast_dense(&self, value: &dyn ArrowArray) -> crate::Result<&[C]> { // `OnceLock::get` is non-blocking -- this is a best-effort fast path in case the // data has already been computed. // @@ -285,9 +294,7 @@ impl LatestAtComponentResults { // We have to do this outside of the callback in order to propagate errors. // Hence the early exit check above. 
- let data = cell - .try_to_native::() - .map_err(|err| DeserializationError::DataCellError(err.to_string()))?; + let data = C::from_arrow(value)?; #[allow(clippy::borrowed_box)] let cached: &Arc = self diff --git a/crates/re_query/src/lib.rs b/crates/re_query/src/lib.rs index f36f14a3fe74..354b4f0a2dc2 100644 --- a/crates/re_query/src/lib.rs +++ b/crates/re_query/src/lib.rs @@ -19,7 +19,7 @@ pub use self::promise::{Promise, PromiseId, PromiseResolver, PromiseResult}; pub use self::range::{RangeComponentResults, RangeData, RangeResults}; pub use self::range_zip::*; -pub(crate) use self::latest_at::LatestAtCache; +pub(crate) use self::latest_at::{latest_at, LatestAtCache}; pub(crate) use self::range::{RangeCache, RangeComponentResultsInner}; pub mod external { @@ -57,9 +57,6 @@ pub enum QueryError { requested: re_types_core::ComponentName, }, - #[error("Error with one or more the underlying data cells: {0}")] - DataCell(#[from] re_log_types::DataCellError), - #[error("Error deserializing: {0}")] DeserializationError(#[from] re_types_core::DeserializationError), @@ -80,7 +77,7 @@ pub type Result = std::result::Result; // --- -use re_data_store::{LatestAtQuery, RangeQuery}; +use re_chunk_store::{LatestAtQuery, RangeQuery}; #[derive(Debug)] pub enum Results { diff --git a/crates/re_query/src/promise.rs b/crates/re_query/src/promise.rs index 54666232a2db..d370c1e3aa26 100644 --- a/crates/re_query/src/promise.rs +++ b/crates/re_query/src/promise.rs @@ -1,6 +1,8 @@ use std::sync::Arc; -use re_log_types::DataCell; +use arrow2::array::Array as ArrowArray; + +// TODO(cmc): remove this once we rework re_query to be Chunk-based. // --- @@ -32,7 +34,7 @@ impl PromiseId { // --- -/// A [`Promise`] turns a source [`DataCell`] into a new [`DataCell`] with the helper of a +/// A [`Promise`] turns a source arrow array into a new arrow array with the help of a /// [`PromiseResolver`]. /// /// Each promise is uniquely identified via a [`PromiseId`]. @@ -41,7 +43,7 @@ impl PromiseId { #[derive(Debug, Clone)] pub struct Promise { id: PromiseId, - source: DataCell, + source: Box, } static_assertions::assert_eq_size!(Promise, Option); @@ -56,7 +58,7 @@ impl re_types_core::SizeBytes for Promise { impl Promise { #[inline] - pub fn new(source: DataCell) -> Self { + pub fn new(source: Box) -> Self { Self { id: PromiseId::new(), source, @@ -80,7 +82,7 @@ impl PromiseResolver { /// idempotent (the [`PromiseResolver`] keeps track of the state of all [`Promise`]s, both /// pending and already resolved). #[inline] - pub fn resolve(&self, promise: &Promise) -> PromiseResult { + pub fn resolve(&self, promise: &Promise) -> PromiseResult> { // NOTE: we're pretending there's gonna be some kind of interior mutability when // everything's said and done. 
_ = self; diff --git a/crates/re_query/src/range/query.rs b/crates/re_query/src/range/query.rs index 93239373294b..932902a85e38 100644 --- a/crates/re_query/src/range/query.rs +++ b/crates/re_query/src/range/query.rs @@ -1,15 +1,15 @@ use std::sync::Arc; +use arrow2::array::Array; +use itertools::Itertools; use parking_lot::RwLock; -use re_data_store::LatestAtQuery; -use re_data_store::{DataStore, RangeQuery, TimeInt}; +use re_chunk::RowId; +use re_chunk_store::{ChunkStore, LatestAtQuery, RangeQuery, TimeInt}; use re_log_types::{EntityPath, ResolvedTimeRange}; use re_types_core::{ComponentName, SizeBytes}; -use crate::{ - CacheKey, Caches, Promise, RangeComponentResults, RangeComponentResultsInner, RangeResults, -}; +use crate::{CacheKey, Caches, RangeComponentResults, RangeComponentResultsInner, RangeResults}; // --- @@ -21,7 +21,7 @@ impl Caches { /// This is a cached API -- data will be lazily cached upon access. pub fn range( &self, - store: &DataStore, + store: &ChunkStore, query: &RangeQuery, entity_path: &EntityPath, component_names: impl IntoIterator<Item = ComponentName>, @@ -69,12 +69,10 @@ impl Caches { let hole_end = TimeInt::new_temporal(query.range().min().as_i64().saturating_sub(1)); if hole_start < hole_end { - if let Some((data_time, _, _)) = store.latest_at( - &LatestAtQuery::new(query.timeline(), hole_end), - entity_path, - component_name, - &[component_name], - ) { + let query = &LatestAtQuery::new(query.timeline(), hole_end); + if let Some((data_time, _, _)) = + crate::latest_at(store, query, entity_path, component_name) + { if data_time > hole_start { re_log::trace!(%entity_path, %component_name, "coarsely invalidated because of bridged queries"); cache.pending_invalidation = Some(TimeInt::MIN); @@ -88,12 +86,10 @@ impl Caches { let hole_end = TimeInt::new_temporal(time_range.min().as_i64().saturating_sub(1)); if hole_start < hole_end { - if let Some((data_time, _, _)) = store.latest_at( - &LatestAtQuery::new(query.timeline(), hole_end), - entity_path, - component_name, - &[component_name], - ) { + let query = &LatestAtQuery::new(query.timeline(), hole_end); + if let Some((data_time, _, _)) = + crate::latest_at(store, query, entity_path, component_name) + { if data_time > hole_start { re_log::trace!(%entity_path, %component_name, "coarsely invalidated because of bridged queries"); cache.pending_invalidation = Some(TimeInt::MIN); @@ -204,11 +200,36 @@ impl SizeBytes for RangeCache { } } +/// Implements the complete end-to-end range logic: +/// * Find all applicable `Chunk`s +/// * Apply a range filter to all of them +/// * Concatenate all the results (will sort them later) +pub fn range<'a>( + store: &'a ChunkStore, + query: &'a RangeQuery, + entity_path: &EntityPath, + component_name: ComponentName, +) -> impl Iterator<Item = (TimeInt, RowId, Box<dyn Array>)> + 'a { + store + .range_relevant_chunks(query, entity_path, component_name) + .into_iter() + .map(move |chunk| chunk.range(query, component_name)) + .filter(|chunk| !chunk.is_empty()) + .flat_map(move |chunk| { + chunk + .iter_rows(&query.timeline(), &component_name) + .filter_map(|(data_time, row_id, array)| { + array.map(|array| (data_time, row_id, array)) + }) + .collect_vec() + }) +} + impl RangeCache { /// Queries cached range data for a single component.
pub fn range( &mut self, - store: &DataStore, + store: &ChunkStore, query: &RangeQuery, entity_path: &EntityPath, component_name: ComponentName, @@ -227,20 +248,11 @@ impl RangeCache { if let Some(query_front) = query_front.as_ref() { re_tracing::profile_scope!("front"); - for (data_time, row_id, mut cells) in - store.range(query_front, entity_path, [component_name]) + for (data_time, row_id, array) in range(store, query_front, entity_path, component_name) { - // Soundness: - // * `cells[0]` is guaranteed to exist since we passed in `&[component_name]` - // * `cells[0]` is guaranteed to be non-null, otherwise this whole result would be null - let Some(cell) = cells[0].take() else { - debug_assert!(cells[0].is_some(), "unreachable: `cells[0]` is missing"); - continue; - }; - per_data_time .promises_front - .push(((data_time, row_id), Promise::new(cell))); + .push(((data_time, row_id), array)); } { re_tracing::profile_scope!("sort front"); @@ -253,22 +265,13 @@ impl RangeCache { if let Some(query_back) = per_data_time.compute_back_query(query, query_front.as_ref()) { re_tracing::profile_scope!("back"); - for (data_time, row_id, mut cells) in store - .range(&query_back, entity_path, [component_name]) + for (data_time, row_id, array) in range(store, &query_back, entity_path, component_name) // If there's static data to be found, the front query will take care of it already. .filter(|(data_time, _, _)| !data_time.is_static()) { - // Soundness: - // * `cells[0]` is guaranteed to exist since we passed in `&[component_name]` - // * `cells[0]` is guaranteed to be non-null, otherwise this whole result would be null - let Some(cell) = cells[0].take() else { - debug_assert!(cells[0].is_some(), "unreachable: `cells[0]` is missing"); - continue; - }; - per_data_time .promises_back - .push(((data_time, row_id), Promise::new(cell))); + .push(((data_time, row_id), array)); } { re_tracing::profile_scope!("sort back"); diff --git a/crates/re_query/src/range/results.rs b/crates/re_query/src/range/results.rs index 12d3e6db6be7..98558b8dbedd 100644 --- a/crates/re_query/src/range/results.rs +++ b/crates/re_query/src/range/results.rs @@ -5,16 +5,17 @@ use std::{ sync::{Arc, OnceLock}, }; +use arrow2::array::Array as ArrowArray; use nohash_hasher::IntMap; - use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; -use re_data_store::RangeQuery; -use re_log_types::{ResolvedTimeRange, RowId, TimeInt}; + +use re_chunk::RowId; +use re_chunk_store::RangeQuery; +use re_log_types::{ResolvedTimeRange, TimeInt}; use re_types_core::{Component, ComponentName, DeserializationError, SizeBytes}; use crate::{ - ErasedFlatVecDeque, FlatVecDeque, LatestAtComponentResults, Promise, PromiseResolver, - PromiseResult, + ErasedFlatVecDeque, FlatVecDeque, LatestAtComponentResults, PromiseResolver, PromiseResult, }; // --- @@ -237,7 +238,7 @@ impl<'a, C: Component> RangeData<'a, C> { ) -> Self { let LatestAtComponentResults { index, - promise: _, + value: _, cached_dense, } = results; @@ -365,6 +366,10 @@ impl RangeComponentResults { // It's tracing the deserialization of an entire range query at once -- it's fine. re_tracing::profile_function!(); + // TODO(cmc): completely get rid of the promise resolving stuff once we start exposing + // chunks all the way. + _ = resolver; + // --- Step 1: try and upsert pending data (write lock) --- REENTERING.with_borrow_mut(|reentering| *reentering = reentering.saturating_add(1)); @@ -420,28 +425,16 @@ impl RangeComponentResults { // available up to that point in time. 
// // Reminder: promises are sorted in ascending index order. - while let Some(((data_time, row_id), promise)) = results.promises_front.pop() { - let data = match resolver.resolve(&promise) { - PromiseResult::Pending => { - results.front_status = (data_time, PromiseResult::Pending); - break; - } - PromiseResult::Error(err) => { - results.front_status = (data_time, PromiseResult::Error(err)); - break; - } - PromiseResult::Ready(cell) => { - results.front_status = (data_time, PromiseResult::Ready(())); - match cell - .try_to_native::<C>() - .map_err(|err| DeserializationError::DataCellError(err.to_string())) - { - Ok(data) => data, - Err(err) => { - re_log::error!(%err, component=%C::name(), "data deserialization failed -- skipping"); - continue; - } - } + while let Some(((data_time, row_id), array)) = results.promises_front.pop() { + results.front_status = (data_time, PromiseResult::Ready(())); + + let data = match C::from_arrow(array.as_ref()) + .map_err(|err| DeserializationError::DataCellError(err.to_string())) + { + Ok(data) => data, + Err(err) => { + re_log::error!(%err, component=%C::name(), "data deserialization failed -- skipping"); + continue; } }; @@ -489,28 +482,14 @@ impl RangeComponentResults { // Pop the promises from the end so that if we encounter one that has yet to be // resolved, we can stop right there and know we have a contiguous range of data // available up to that point in time. - while let Some(((data_time, index), promise)) = results.promises_back.pop() { - let data = match resolver.resolve(&promise) { - PromiseResult::Pending => { - results.back_status = (data_time, PromiseResult::Pending); - break; - } - PromiseResult::Error(err) => { - results.back_status = (data_time, PromiseResult::Error(err)); - break; - } - PromiseResult::Ready(cell) => { - results.front_status = (data_time, PromiseResult::Ready(())); - match cell - .try_to_native::<C>() - .map_err(|err| DeserializationError::DataCellError(err.to_string())) - { - Ok(data) => data, - Err(err) => { - re_log::error!(%err, "data deserialization failed -- skipping"); - continue; - } - } + while let Some(((data_time, index), array)) = results.promises_back.pop() { + let data = match C::from_arrow(array.as_ref()) + .map_err(|err| DeserializationError::DataCellError(err.to_string())) + { + Ok(data) => data, + Err(err) => { + re_log::error!(%err, component=%C::name(), "data deserialization failed -- skipping"); + continue; } }; @@ -636,13 +615,13 @@ pub struct RangeComponentResultsInner { /// front-side of the ringbuffer (i.e. further back in time). /// /// Always sorted in ascending index order ([`TimeInt`] + [`RowId`] pair). - pub(crate) promises_front: Vec<((TimeInt, RowId), Promise)>, + pub(crate) promises_front: Vec<((TimeInt, RowId), Box<dyn ArrowArray>)>, /// All the pending promises that must resolved in order to fill the missing data on the /// back-side of the ringbuffer (i.e. the most recent data). /// /// Always sorted in ascending index order ([`TimeInt`] + [`RowId`] pair). - pub(crate) promises_back: Vec<((TimeInt, RowId), Promise)>, + pub(crate) promises_back: Vec<((TimeInt, RowId), Box<dyn ArrowArray>)>, /// Keeps track of the status of the data on the front-side of the cache.
pub(crate) front_status: (TimeInt, PromiseResult<()>), diff --git a/crates/re_query/src/range_zip/mod.rs b/crates/re_query/src/range_zip/mod.rs index 60913fef721c..8bf59965386c 100644 --- a/crates/re_query/src/range_zip/mod.rs +++ b/crates/re_query/src/range_zip/mod.rs @@ -5,7 +5,8 @@ pub use self::generated::*; mod tests { use itertools::Itertools as _; - use re_log_types::{RowId, TimeInt}; + use re_chunk::RowId; + use re_log_types::TimeInt; use super::*; diff --git a/crates/re_query/tests/latest_at.rs b/crates/re_query/tests/latest_at.rs index 30eca0e755af..97d78c203994 100644 --- a/crates/re_query/tests/latest_at.rs +++ b/crates/re_query/tests/latest_at.rs @@ -1,22 +1,27 @@ // https://github.com/rust-lang/rust-clippy/issues/10011 #![cfg(test)] -use re_data_store::{DataStore, LatestAtQuery, StoreSubscriber}; +use std::sync::Arc; + +use re_chunk::RowId; +use re_chunk_store::{ + external::re_chunk::Chunk, ChunkStore, ChunkStoreSubscriber as _, LatestAtQuery, +}; use re_log_types::{ build_frame_nr, example_components::{MyColor, MyPoint, MyPoints}, - DataRow, EntityPath, RowId, TimeInt, TimePoint, + EntityPath, TimeInt, TimePoint, }; use re_query::Caches; use re_query::PromiseResolver; -use re_types::Archetype as _; +use re_types::{Archetype as _, ComponentBatch}; use re_types_core::Loggable as _; // --- #[test] fn simple_query() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -25,20 +30,21 @@ fn simple_query() { let entity_path = "point"; let timepoint = [build_frame_nr(123)]; - let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row = - DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, points.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row); - - let colors = vec![MyColor::from_rgb(255, 0, 0)]; - let row = - DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, colors.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row); + let row_id1 = RowId::new(); + let points1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; + let row_id2 = RowId::new(); + let colors2 = vec![MyColor::from_rgb(255, 0, 0)]; + let chunk = Chunk::builder(entity_path.into()) + .with_component_batch(row_id1, timepoint, &points1) + .with_component_batch(row_id2, timepoint, &colors2) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(timepoint[0].0, timepoint[0].1); - let expected_compound_index = (TimeInt::new_temporal(123), row.row_id()); - let expected_points = &points; - let expected_colors = &colors; + let query = re_chunk_store::LatestAtQuery::new(timepoint[0].0, timepoint[0].1); + let expected_compound_index = (TimeInt::new_temporal(123), row_id2); + let expected_points = &points1; + let expected_colors = &colors2; query_and_compare( &caches, &store, @@ -52,7 +58,7 @@ fn simple_query() { #[test] fn static_query() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -61,23 +67,28 @@ fn static_query() { let entity_path = "point"; let timepoint = [build_frame_nr(123)]; + let row_id1 = RowId::new(); let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, points.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk 
= Chunk::builder(entity_path.into()) + .with_component_batches(row_id1, timepoint, [&points as &dyn ComponentBatch]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); + let row_id2 = RowId::new(); let colors = vec![MyColor::from_rgb(255, 0, 0)]; - let row2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - TimePoint::default(), - colors.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2); - - let query = re_data_store::LatestAtQuery::new(timepoint[0].0, timepoint[0].1); - let expected_compound_index = (TimeInt::new_temporal(123), row1.row_id()); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches( + row_id2, + TimePoint::default(), + [&colors as &dyn ComponentBatch], + ) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); + + let query = re_chunk_store::LatestAtQuery::new(timepoint[0].0, timepoint[0].1); + let expected_compound_index = (TimeInt::new_temporal(123), row_id1); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -108,33 +119,29 @@ fn invalidation() { .copied() .unwrap_or(TimeInt::STATIC); - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); let mut caches = Caches::new(&store); + let row_id1 = RowId::new(); let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - points.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id1, present_data_timepoint.clone(), [&points as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); + let row_id2 = RowId::new(); let colors = vec![MyColor::from_rgb(1, 2, 3)]; - let row2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - colors.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id2, present_data_timepoint.clone(), [&colors as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let expected_compound_index = (present_timestamp, row2.row_id()); + let expected_compound_index = (present_timestamp, row_id2); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -150,17 +157,15 @@ fn invalidation() { // --- Modify present --- // Modify the PoV component + let row_id3 = RowId::new(); let points = vec![MyPoint::new(10.0, 20.0), MyPoint::new(30.0, 40.0)]; - let row3 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - points.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id3, present_data_timepoint.clone(), [&points as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let expected_compound_index = (present_timestamp, row3.row_id()); + let expected_compound_index = (present_timestamp, row_id3); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -174,17 +179,15 @@ fn invalidation() { ); // Modify the optional component + let row_id4 = RowId::new(); let colors = vec![MyColor::from_rgb(4, 
5, 6), MyColor::from_rgb(7, 8, 9)]; - let row4 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - colors.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row4); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id4, present_data_timepoint.clone(), [&colors as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let expected_compound_index = (present_timestamp, row4.row_id()); + let expected_compound_index = (present_timestamp, row_id4); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -200,17 +203,15 @@ fn invalidation() { // --- Modify past --- // Modify the PoV component + let row_id5 = RowId::new(); let points_past = vec![MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)]; - let row5 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - past_data_timepoint.clone(), - points_past.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row5); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id5, past_data_timepoint.clone(), [&points_past as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let expected_compound_index = (present_timestamp, row4.row_id()); + let expected_compound_index = (present_timestamp, row_id4); let expected_points = if past_timestamp.is_static() { &points_past } else { @@ -228,20 +229,18 @@ fn invalidation() { ); // Modify the optional component + let row_id6 = RowId::new(); let colors_past = vec![MyColor::from_rgb(10, 11, 12), MyColor::from_rgb(13, 14, 15)]; - let row6 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - past_data_timepoint, - colors_past.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row6); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id6, past_data_timepoint.clone(), [&colors_past as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let (expected_compound_index, expected_colors) = if past_timestamp.is_static() { - ((past_timestamp, row6.row_id()), &colors_past) + ((past_timestamp, row_id6), &colors_past) } else { - ((present_timestamp, row4.row_id()), &colors) + ((present_timestamp, row_id4), &colors) }; query_and_compare( &caches, @@ -256,20 +255,22 @@ fn invalidation() { // --- Modify future --- // Modify the PoV component + let row_id7 = RowId::new(); let points_future = vec![MyPoint::new(1000.0, 2000.0), MyPoint::new(3000.0, 4000.0)]; - let row7 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - future_data_timepoint.clone(), - points_future.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row7); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches( + row_id7, + future_data_timepoint.clone(), + [&points_future as _], + ) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let (expected_compound_index, expected_points) = if past_timestamp.is_static() { - ((past_timestamp, row6.row_id()), &points_past) + ((past_timestamp, row_id6), &points_past) } else { - ((present_timestamp, row4.row_id()), &points) + ((present_timestamp, row_id4), &points) }; query_and_compare( &caches, @@ -282,20 +283,22 @@ fn invalidation() { ); // Modify the optional component + let row_id8 = RowId::new(); let colors_future = vec![MyColor::from_rgb(16, 17, 18)]; - let row = DataRow::from_cells1_sized( - 
RowId::new(), - entity_path, - future_data_timepoint, - colors_future, - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches( + row_id8, + future_data_timepoint.clone(), + [&colors_future as _], + ) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let (expected_compound_index, expected_colors) = if past_timestamp.is_static() { - ((past_timestamp, row6.row_id()), &colors_past) + ((past_timestamp, row_id6), &colors_past) } else { - ((present_timestamp, row4.row_id()), &colors) + ((present_timestamp, row_id4), &colors) }; query_and_compare( &caches, @@ -355,7 +358,7 @@ fn invalidation() { // ``` #[test] fn invalidation_of_future_optionals() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -369,13 +372,16 @@ fn invalidation_of_future_optionals() { let query_time = [build_frame_nr(9999)]; + let row_id1 = RowId::new(); let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = - DataRow::from_cells1_sized(RowId::new(), entity_path, static_, points.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id1, static_, [&points as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::STATIC, row1.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = (TimeInt::STATIC, row_id1); let expected_points = &points; let expected_colors = &[]; query_and_compare( @@ -388,13 +394,16 @@ fn invalidation_of_future_optionals() { expected_colors, ); + let row_id2 = RowId::new(); let colors = vec![MyColor::from_rgb(255, 0, 0)]; - let row2 = - DataRow::from_cells1_sized(RowId::new(), entity_path, frame2, colors.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row2); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id2, frame2, [&colors as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::new_temporal(2), row2.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = (TimeInt::new_temporal(2), row_id2); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -407,13 +416,16 @@ fn invalidation_of_future_optionals() { expected_colors, ); + let row_id3 = RowId::new(); let colors = vec![MyColor::from_rgb(0, 0, 255)]; - let row3 = - DataRow::from_cells1_sized(RowId::new(), entity_path, frame3, colors.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id3, frame3, [&colors as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::new_temporal(3), row3.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = 
(TimeInt::new_temporal(3), row_id3); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -426,13 +438,16 @@ fn invalidation_of_future_optionals() { expected_colors, ); + let row_id4 = RowId::new(); let colors = vec![MyColor::from_rgb(0, 255, 0)]; - let row4 = - DataRow::from_cells1_sized(RowId::new(), entity_path, frame3, colors.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row4); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id4, frame3, [&colors as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::new_temporal(3), row4.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = (TimeInt::new_temporal(3), row_id4); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -448,7 +463,7 @@ fn invalidation_of_future_optionals() { #[test] fn static_invalidation() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -460,14 +475,16 @@ fn static_invalidation() { let query_time = [build_frame_nr(9999)]; + let row_id1 = RowId::new(); let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timeless.clone(), points.clone()) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id1, timeless.clone(), [&points as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::STATIC, row1.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = (TimeInt::STATIC, row_id1); let expected_points = &points; let expected_colors = &[]; query_and_compare( @@ -480,14 +497,16 @@ fn static_invalidation() { expected_colors, ); + let row_id2 = RowId::new(); let colors = vec![MyColor::from_rgb(255, 0, 0)]; - let row2 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timeless.clone(), colors.clone()) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id2, timeless.clone(), [&colors as _]) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::STATIC, row2.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = (TimeInt::STATIC, row_id2); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -500,13 +519,16 @@ fn static_invalidation() { expected_colors, ); + let row_id3 = RowId::new(); let colors = vec![MyColor::from_rgb(0, 0, 255)]; - let row3 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timeless, colors.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batches(row_id3, timeless.clone(), [&colors as _]) + .build() + .unwrap(); + 
insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); - let query = re_data_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); - let expected_compound_index = (TimeInt::STATIC, row3.row_id()); + let query = re_chunk_store::LatestAtQuery::new(query_time[0].0, query_time[0].1); + let expected_compound_index = (TimeInt::STATIC, row_id3); let expected_points = &points; let expected_colors = &colors; query_and_compare( @@ -522,13 +544,13 @@ fn static_invalidation() { // --- -fn insert_and_react(store: &mut DataStore, caches: &mut Caches, row: &DataRow) { - caches.on_events(&[store.insert_row(row).unwrap()]); +fn insert_and_react(store: &mut ChunkStore, caches: &mut Caches, chunk: &Arc<Chunk>) { + caches.on_events(&[store.insert_chunk(chunk).unwrap().unwrap()]); } fn query_and_compare( caches: &Caches, - store: &DataStore, + store: &ChunkStore, query: &LatestAtQuery, entity_path: &EntityPath, expected_compound_index: (TimeInt, RowId), @@ -559,6 +581,7 @@ fn query_and_compare( .flatten() .unwrap(); + eprintln!("{store}"); // eprintln!("{}", store.to_data_table().unwrap()); similar_asserts::assert_eq!(expected_compound_index, cached.compound_index); diff --git a/crates/re_query/tests/range.rs b/crates/re_query/tests/range.rs index 2c252147f755..dde0d64e6499 100644 --- a/crates/re_query/tests/range.rs +++ b/crates/re_query/tests/range.rs @@ -1,13 +1,19 @@ // https://github.com/rust-lang/rust-clippy/issues/10011 #![cfg(test)] +use std::sync::Arc; + use itertools::Itertools as _; -use re_data_store::{DataStore, RangeQuery, ResolvedTimeRange, StoreSubscriber as _, TimeInt}; +use re_chunk::{RowId, Timeline}; +use re_chunk_store::{ + external::re_chunk::Chunk, ChunkStore, ChunkStoreSubscriber as _, RangeQuery, + ResolvedTimeRange, TimeInt, +}; use re_log_types::{ build_frame_nr, example_components::{MyColor, MyPoint, MyPoints}, - DataReadError, DataRow, EntityPath, RowId, TimePoint, Timeline, + EntityPath, TimePoint, }; use re_query::{Caches, PromiseResolver, PromiseResult}; use re_types::Archetype; @@ -17,7 +23,7 @@ use re_types_core::Loggable as _; #[test] fn simple_range() -> anyhow::Result<()> { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -26,61 +32,44 @@ fn simple_range() -> anyhow::Result<()> { let entity_path: EntityPath = "point".into(); let timepoint1 = [build_frame_nr(123)]; - let points1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1_1 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint1, - points1.clone(), - )?; - insert_and_react(&mut store, &mut caches, &row1_1); - let colors1 = vec![MyColor::from_rgb(255, 0, 0)]; - let row1_2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint1, - colors1.clone(), - )?; - insert_and_react(&mut store, &mut caches, &row1_2); + let row_id1_1 = RowId::new(); + let points1_1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; + let row_id1_2 = RowId::new(); + let colors1_2 = vec![MyColor::from_rgb(255, 0, 0)]; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id1_1, timepoint1, &points1_1) + .with_component_batch(row_id1_2, timepoint1, &colors1_2) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let timepoint2 = [build_frame_nr(223)]; + let row_id2 = RowId::new(); let colors2 = vec![MyColor::from_rgb(255, 0, 0)]; - let row2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), -
timepoint2, - colors2.clone(), - )?; - insert_and_react(&mut store, &mut caches, &row2); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id2, timepoint2, &colors2) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let timepoint3 = [build_frame_nr(323)]; + let row_id3 = RowId::new(); let points3 = vec![MyPoint::new(10.0, 20.0), MyPoint::new(30.0, 40.0)]; - let row3 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint3, - points3.clone(), - )?; - insert_and_react(&mut store, &mut caches, &row3); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id3, timepoint3, &points3) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); // --- First test: `(timepoint1, timepoint3]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( timepoint1[0].0, ResolvedTimeRange::new(timepoint1[0].1.as_i64() + 1, timepoint3[0].1), ); let expected_points = &[ - ( - (TimeInt::new_temporal(323), row3.row_id()), - points3.as_slice(), - ), // + ((TimeInt::new_temporal(323), row_id3), points3.as_slice()), // ]; let expected_colors = &[ - ( - (TimeInt::new_temporal(223), row2.row_id()), - colors2.as_slice(), - ), // + ((TimeInt::new_temporal(223), row_id2), colors2.as_slice()), // ]; query_and_compare( &caches, @@ -93,30 +82,24 @@ fn simple_range() -> anyhow::Result<()> { // --- Second test: `[timepoint1, timepoint3]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( timepoint1[0].0, ResolvedTimeRange::new(timepoint1[0].1, timepoint3[0].1), ); let expected_points = &[ ( - (TimeInt::new_temporal(123), row1_1.row_id()), - points1.as_slice(), - ), // - ( - (TimeInt::new_temporal(323), row3.row_id()), - points3.as_slice(), + (TimeInt::new_temporal(123), row_id1_1), + points1_1.as_slice(), ), // + ((TimeInt::new_temporal(323), row_id3), points3.as_slice()), // ]; let expected_colors = &[ ( - (TimeInt::new_temporal(123), row1_2.row_id()), - colors1.as_slice(), - ), // - ( - (TimeInt::new_temporal(223), row2.row_id()), - colors2.as_slice(), + (TimeInt::new_temporal(123), row_id1_2), + colors1_2.as_slice(), ), // + ((TimeInt::new_temporal(223), row_id2), colors2.as_slice()), // ]; query_and_compare( &caches, @@ -131,8 +114,8 @@ fn simple_range() -> anyhow::Result<()> { } #[test] -fn static_range() { - let mut store = DataStore::new( +fn static_range() -> anyhow::Result<()> { + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -141,81 +124,57 @@ fn static_range() { let entity_path: EntityPath = "point".into(); let timepoint1 = [build_frame_nr(123)]; - let points1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1_1 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint1, - points1.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1_1); - let colors1 = vec![MyColor::from_rgb(255, 0, 0)]; - let row1_2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint1, - colors1.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1_2); + let row_id1_1 = RowId::new(); + let points1_1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; + let row_id1_2 = RowId::new(); + let colors1_2 = vec![MyColor::from_rgb(255, 0, 0)]; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id1_1, timepoint1, &points1_1) + .with_component_batch(row_id1_2, 
timepoint1, &colors1_2) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); // Insert statically too! - let row1_3 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - TimePoint::default(), - colors1.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1_3); + let row_id1_3 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id1_3, TimePoint::default(), &colors1_2) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let timepoint2 = [build_frame_nr(223)]; - let colors2 = vec![MyColor::from_rgb(255, 0, 0)]; - let row2_1 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint2, - colors2.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2_1); + let row_id2_1 = RowId::new(); + let colors2_1 = vec![MyColor::from_rgb(255, 0, 0)]; + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id2_1, timepoint2, &colors2_1) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); // Insert statically too! - let row2_2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - TimePoint::default(), - colors2.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2_2); + let row_id2_2 = RowId::new(); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id2_2, TimePoint::default(), &colors2_1) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); let timepoint3 = [build_frame_nr(323)]; // Create some Positions with implicit instances + let row_id3 = RowId::new(); let points3 = vec![MyPoint::new(10.0, 20.0), MyPoint::new(30.0, 40.0)]; - let row3 = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint3, - points3.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batch(row_id3, timepoint3, &points3) + .build()?; + insert_and_react(&mut store, &mut caches, &Arc::new(chunk)); // --- First test: `(timepoint1, timepoint3]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( timepoint1[0].0, ResolvedTimeRange::new(timepoint1[0].1.as_i64() + 1, timepoint3[0].1), ); let expected_points = &[ - ( - (TimeInt::new_temporal(323), row3.row_id()), - points3.as_slice(), - ), // + ((TimeInt::new_temporal(323), row_id3), points3.as_slice()), // ]; let expected_colors = &[ - ((TimeInt::STATIC, row2_2.row_id()), colors2.as_slice()), // + ((TimeInt::STATIC, row_id2_2), colors2_1.as_slice()), // ]; query_and_compare( &caches, @@ -230,23 +189,20 @@ fn static_range() { // The inclusion of `timepoint1` means latest-at semantics will fall back to timeless data! 
- let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( timepoint1[0].0, ResolvedTimeRange::new(timepoint1[0].1, timepoint3[0].1), ); let expected_points = &[ ( - (TimeInt::new_temporal(123), row1_1.row_id()), - points1.as_slice(), - ), // - ( - (TimeInt::new_temporal(323), row3.row_id()), - points3.as_slice(), + (TimeInt::new_temporal(123), row_id1_1), + points1_1.as_slice(), ), // + ((TimeInt::new_temporal(323), row_id3), points3.as_slice()), // ]; let expected_colors = &[ - ((TimeInt::STATIC, row2_2.row_id()), colors2.as_slice()), // + ((TimeInt::STATIC, row_id2_2), colors2_1.as_slice()), // ]; query_and_compare( &caches, @@ -259,7 +215,7 @@ fn static_range() { // --- Third test: `[-inf, +inf]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( timepoint1[0].0, ResolvedTimeRange::new(TimeInt::MIN, TimeInt::MAX), ); @@ -273,6 +229,8 @@ fn static_range() { expected_points, expected_colors, ); + + Ok(()) } // Test the case where the user loads a piece of data at the end of the time range, then a piece at @@ -288,7 +246,7 @@ fn static_range() { // properly keep track of the fact that there are holes in the data -- on purpose. #[test] fn time_back_and_forth() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -296,38 +254,43 @@ fn time_back_and_forth() { let entity_path: EntityPath = "point".into(); - let (rows, points): (Vec<_>, Vec<_>) = (0..10) + let (chunks, points): (Vec<_>, Vec<_>) = (0..10) .map(|i| { let timepoint = [build_frame_nr(i)]; let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint, - points.clone(), - ) - .unwrap(); + let chunk = Arc::new( + Chunk::builder(entity_path.clone()) + .with_component_batch(RowId::new(), timepoint, &points.clone()) + .build() + .unwrap(), + ); - insert_and_react(&mut store, &mut caches, &row); + insert_and_react(&mut store, &mut caches, &chunk); - (row, points) + (chunk, points) }) .unzip(); // --- Query #1: `[8, 10]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( Timeline::new_sequence("frame_nr"), ResolvedTimeRange::new(8, 10), ); let expected_points = &[ ( - (TimeInt::new_temporal(8), rows[8].row_id()), // + ( + TimeInt::new_temporal(8), + chunks[8].row_id_range().unwrap().0, + ), // points[8].as_slice(), ), // ( - (TimeInt::new_temporal(9), rows[9].row_id()), // + ( + TimeInt::new_temporal(9), + chunks[9].row_id_range().unwrap().0, + ), // points[9].as_slice(), ), // ]; @@ -335,22 +298,31 @@ fn time_back_and_forth() { // --- Query #2: `[1, 3]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( Timeline::new_sequence("frame_nr"), ResolvedTimeRange::new(1, 3), ); let expected_points = &[ ( - (TimeInt::new_temporal(1), rows[1].row_id()), // + ( + TimeInt::new_temporal(1), + chunks[1].row_id_range().unwrap().0, + ), // points[1].as_slice(), ), // ( - (TimeInt::new_temporal(2), rows[2].row_id()), // + ( + TimeInt::new_temporal(2), + chunks[2].row_id_range().unwrap().0, + ), // points[2].as_slice(), ), // ( - (TimeInt::new_temporal(3), rows[3].row_id()), // + ( + TimeInt::new_temporal(3), + chunks[3].row_id_range().unwrap().0, + ), // points[3].as_slice(), ), // ]; @@ -358,22 +330,31 @@ fn time_back_and_forth() { // --- Query #3: `[5, 7]` --- - let query = re_data_store::RangeQuery::new( + let query = RangeQuery::new( 
Timeline::new_sequence("frame_nr"), ResolvedTimeRange::new(5, 7), ); let expected_points = &[ ( - (TimeInt::new_temporal(5), rows[5].row_id()), // + ( + TimeInt::new_temporal(5), + chunks[5].row_id_range().unwrap().0, + ), // points[5].as_slice(), ), // ( - (TimeInt::new_temporal(6), rows[6].row_id()), // + ( + TimeInt::new_temporal(6), + chunks[6].row_id_range().unwrap().0, + ), // points[6].as_slice(), ), // ( - (TimeInt::new_temporal(7), rows[7].row_id()), // + ( + TimeInt::new_temporal(7), + chunks[7].row_id_range().unwrap().0, + ), // points[7].as_slice(), ), // ]; @@ -401,37 +382,33 @@ fn invalidation() { .copied() .unwrap_or(TimeInt::STATIC); - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); let mut caches = Caches::new(&store); + let row_id1 = RowId::new(); let points1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - points1.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk1 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id1, present_data_timepoint.clone(), &points1) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk1)); + let row_id2 = RowId::new(); let colors2 = vec![MyColor::from_rgb(1, 2, 3)]; - let row2 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - colors2.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2); + let chunk2 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id2, present_data_timepoint.clone(), &colors2) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk2)); let expected_points = &[ - ((present_timestamp, row1.row_id()), points1.as_slice()), // + ((present_timestamp, row_id1), points1.as_slice()), // ]; let expected_colors = &[ - ((present_timestamp, row2.row_id()), colors2.as_slice()), // + ((present_timestamp, row_id2), colors2.as_slice()), // ]; query_and_compare( &caches, @@ -445,22 +422,20 @@ fn invalidation() { // --- Modify present --- // Modify the PoV component + let row_id3 = RowId::new(); let points3 = vec![MyPoint::new(10.0, 20.0), MyPoint::new(30.0, 40.0)]; - let row3 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint.clone(), - points3.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk3 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id3, present_data_timepoint.clone(), &points3) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk3)); let expected_points = &[ - ((present_timestamp, row1.row_id()), points1.as_slice()), // - ((present_timestamp, row3.row_id()), points3.as_slice()), // + ((present_timestamp, row_id1), points1.as_slice()), // + ((present_timestamp, row_id3), points3.as_slice()), // ]; let expected_colors = &[ - ((present_timestamp, row2.row_id()), colors2.as_slice()), // + ((present_timestamp, row_id2), colors2.as_slice()), // ]; query_and_compare( &caches, @@ -472,23 +447,21 @@ fn invalidation() { ); // Modify the optional component + let row_id4 = RowId::new(); let colors4 = vec![MyColor::from_rgb(4, 5, 6), MyColor::from_rgb(7, 8, 9)]; - let row4 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - present_data_timepoint, - colors4.clone(), - ) - .unwrap(); - 
insert_and_react(&mut store, &mut caches, &row4); + let chunk4 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id4, present_data_timepoint.clone(), &colors4) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk4)); let expected_points = &[ - ((present_timestamp, row1.row_id()), points1.as_slice()), // - ((present_timestamp, row3.row_id()), points3.as_slice()), // + ((present_timestamp, row_id1), points1.as_slice()), // + ((present_timestamp, row_id3), points3.as_slice()), // ]; let expected_colors = &[ - ((present_timestamp, row2.row_id()), colors2.as_slice()), // - ((present_timestamp, row4.row_id()), colors4.as_slice()), // + ((present_timestamp, row_id2), colors2.as_slice()), // + ((present_timestamp, row_id4), colors4.as_slice()), // ]; query_and_compare( &caches, @@ -503,22 +476,20 @@ fn invalidation() { // Modify the PoV component let points5 = vec![MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)]; - let row5 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - past_data_timepoint.clone(), - points5.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row5); + let row_id5 = RowId::new(); + let chunk5 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id5, past_data_timepoint.clone(), &points5) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk5)); let expected_points1 = &[ - ((past_timestamp, row5.row_id()), points5.as_slice()), // + ((past_timestamp, row_id5), points5.as_slice()), // ] as &[_]; let expected_points2 = &[ - ((past_timestamp, row5.row_id()), points5.as_slice()), // - ((present_timestamp, row1.row_id()), points1.as_slice()), // - ((present_timestamp, row3.row_id()), points3.as_slice()), // + ((past_timestamp, row_id5), points5.as_slice()), // + ((present_timestamp, row_id1), points1.as_slice()), // + ((present_timestamp, row_id3), points3.as_slice()), // ] as &[_]; let expected_points = if past_data_timepoint.is_static() { expected_points1 @@ -526,8 +497,8 @@ fn invalidation() { expected_points2 }; let expected_colors = &[ - ((present_timestamp, row2.row_id()), colors2.as_slice()), // - ((present_timestamp, row4.row_id()), colors4.as_slice()), // + ((present_timestamp, row_id2), colors2.as_slice()), // + ((present_timestamp, row_id4), colors4.as_slice()), // ]; query_and_compare( &caches, @@ -539,23 +510,21 @@ fn invalidation() { ); // Modify the optional component + let row_id6 = RowId::new(); let colors6 = vec![MyColor::from_rgb(10, 11, 12), MyColor::from_rgb(13, 14, 15)]; - let row6 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - past_data_timepoint.clone(), - colors6.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row6); + let chunk6 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id6, past_data_timepoint.clone(), &colors6) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk6)); let expected_colors1 = &[ - ((past_timestamp, row6.row_id()), colors6.as_slice()), // + ((past_timestamp, row_id6), colors6.as_slice()), // ] as &[_]; let expected_colors2 = &[ - ((past_timestamp, row6.row_id()), colors6.as_slice()), // - ((present_timestamp, row2.row_id()), colors2.as_slice()), // - ((present_timestamp, row4.row_id()), colors4.as_slice()), // + ((past_timestamp, row_id6), colors6.as_slice()), // + ((present_timestamp, row_id2), colors2.as_slice()), // + ((present_timestamp, row_id4), colors4.as_slice()), // ] as &[_]; let expected_colors = if 
past_data_timepoint.is_static() { expected_colors1 @@ -574,24 +543,22 @@ fn invalidation() { // --- Modify future --- // Modify the PoV component + let row_id7 = RowId::new(); let points7 = vec![MyPoint::new(1000.0, 2000.0), MyPoint::new(3000.0, 4000.0)]; - let row7 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - future_data_timepoint.clone(), - points7.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row7); + let chunk7 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id7, future_data_timepoint.clone(), &points7) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk7)); let expected_points1 = &[ - ((past_timestamp, row5.row_id()), points5.as_slice()), // + ((past_timestamp, row_id5), points5.as_slice()), // ] as &[_]; let expected_points2 = &[ - ((past_timestamp, row5.row_id()), points5.as_slice()), // - ((present_timestamp, row1.row_id()), points1.as_slice()), // - ((present_timestamp, row3.row_id()), points3.as_slice()), // - ((future_timestamp, row7.row_id()), points7.as_slice()), // + ((past_timestamp, row_id5), points5.as_slice()), // + ((present_timestamp, row_id1), points1.as_slice()), // + ((present_timestamp, row_id3), points3.as_slice()), // + ((future_timestamp, row_id7), points7.as_slice()), // ] as &[_]; let expected_points = if past_data_timepoint.is_static() { expected_points1 @@ -608,24 +575,22 @@ fn invalidation() { ); // Modify the optional component + let row_id8 = RowId::new(); let colors8 = vec![MyColor::from_rgb(16, 17, 18)]; - let row8 = DataRow::from_cells1_sized( - RowId::new(), - entity_path, - future_data_timepoint, - colors8.clone(), - ) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row8); + let chunk8 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id8, future_data_timepoint.clone(), &colors8) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk8)); let expected_colors1 = &[ - ((past_timestamp, row6.row_id()), colors6.as_slice()), // + ((past_timestamp, row_id6), colors6.as_slice()), // ] as &[_]; let expected_colors2 = &[ - ((past_timestamp, row6.row_id()), colors6.as_slice()), // - ((present_timestamp, row2.row_id()), colors2.as_slice()), // - ((present_timestamp, row4.row_id()), colors4.as_slice()), // - ((future_timestamp, row8.row_id()), colors8.as_slice()), // + ((past_timestamp, row_id6), colors6.as_slice()), // + ((present_timestamp, row_id2), colors2.as_slice()), // + ((present_timestamp, row_id4), colors4.as_slice()), // + ((future_timestamp, row_id8), colors8.as_slice()), // ] as &[_]; let expected_colors = if past_data_timepoint.is_static() { expected_colors1 @@ -689,7 +654,7 @@ fn invalidation() { // ``` #[test] fn invalidation_of_future_optionals() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -701,15 +666,18 @@ fn invalidation_of_future_optionals() { let frame2 = [build_frame_nr(2)]; let frame3 = [build_frame_nr(3)]; - let query = re_data_store::RangeQuery::new(frame2[0].0, ResolvedTimeRange::EVERYTHING); + let query = RangeQuery::new(frame2[0].0, ResolvedTimeRange::EVERYTHING); + let row_id1 = RowId::new(); let points1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timeless, points1.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk1 = Chunk::builder(entity_path.into()) + 
.with_component_batch(row_id1, timeless, &points1) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk1)); let expected_points = &[ - ((TimeInt::STATIC, row1.row_id()), points1.as_slice()), // + ((TimeInt::STATIC, row_id1), points1.as_slice()), // ]; let expected_colors = &[]; query_and_compare( @@ -721,16 +689,16 @@ fn invalidation_of_future_optionals() { expected_colors, ); + let row_id2 = RowId::new(); let colors2 = vec![MyColor::from_rgb(255, 0, 0)]; - let row2 = - DataRow::from_cells1_sized(RowId::new(), entity_path, frame2, colors2.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row2); + let chunk2 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id2, frame2, &colors2) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk2)); let expected_colors = &[ - ( - (TimeInt::new_temporal(2), row2.row_id()), - colors2.as_slice(), - ), // + ((TimeInt::new_temporal(2), row_id2), colors2.as_slice()), // ]; query_and_compare( &caches, @@ -741,20 +709,17 @@ fn invalidation_of_future_optionals() { expected_colors, ); + let row_id3 = RowId::new(); let colors3 = vec![MyColor::from_rgb(0, 0, 255)]; - let row3 = - DataRow::from_cells1_sized(RowId::new(), entity_path, frame3, colors3.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk3 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id3, frame3, &colors3) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk3)); let expected_colors = &[ - ( - (TimeInt::new_temporal(2), row2.row_id()), - colors2.as_slice(), - ), // - ( - (TimeInt::new_temporal(3), row3.row_id()), - colors3.as_slice(), - ), // + ((TimeInt::new_temporal(2), row_id2), colors2.as_slice()), // + ((TimeInt::new_temporal(3), row_id3), colors3.as_slice()), // ]; query_and_compare( &caches, @@ -765,24 +730,18 @@ fn invalidation_of_future_optionals() { expected_colors, ); + let row_id4 = RowId::new(); let colors4 = vec![MyColor::from_rgb(0, 255, 0)]; - let row4 = - DataRow::from_cells1_sized(RowId::new(), entity_path, frame3, colors4.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row4); + let chunk4 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id4, frame3, &colors4) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk4)); let expected_colors = &[ - ( - (TimeInt::new_temporal(2), row2.row_id()), - colors2.as_slice(), - ), // - ( - (TimeInt::new_temporal(3), row3.row_id()), - colors3.as_slice(), - ), // - ( - (TimeInt::new_temporal(3), row4.row_id()), - colors4.as_slice(), - ), // + ((TimeInt::new_temporal(2), row_id2), colors2.as_slice()), // + ((TimeInt::new_temporal(3), row_id3), colors3.as_slice()), // + ((TimeInt::new_temporal(3), row_id4), colors4.as_slice()), // ]; query_and_compare( &caches, @@ -796,7 +755,7 @@ fn invalidation_of_future_optionals() { #[test] fn invalidation_static() { - let mut store = DataStore::new( + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -807,16 +766,18 @@ fn invalidation_static() { let timeless = TimePoint::default(); let frame0 = [build_frame_nr(TimeInt::ZERO)]; - let query = re_data_store::RangeQuery::new(frame0[0].0, ResolvedTimeRange::EVERYTHING); + let query = RangeQuery::new(frame0[0].0, ResolvedTimeRange::EVERYTHING); + let row_id1 = RowId::new(); let points1 = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)]; - let row1 = - 
DataRow::from_cells1_sized(RowId::new(), entity_path, timeless.clone(), points1.clone()) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row1); + let chunk1 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id1, timeless.clone(), &points1) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk1)); let expected_points = &[ - ((TimeInt::STATIC, row1.row_id()), points1.as_slice()), // + ((TimeInt::STATIC, row_id1), points1.as_slice()), // ]; let expected_colors = &[]; query_and_compare( @@ -828,14 +789,16 @@ fn invalidation_static() { expected_colors, ); + let row_id2 = RowId::new(); let colors2 = vec![MyColor::from_rgb(255, 0, 0)]; - let row2 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timeless.clone(), colors2.clone()) - .unwrap(); - insert_and_react(&mut store, &mut caches, &row2); + let chunk2 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id2, timeless.clone(), &colors2) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk2)); let expected_colors = &[ - ((TimeInt::STATIC, row2.row_id()), colors2.as_slice()), // + ((TimeInt::STATIC, row_id2), colors2.as_slice()), // ]; query_and_compare( &caches, @@ -846,13 +809,16 @@ fn invalidation_static() { expected_colors, ); + let row_id3 = RowId::new(); let colors3 = vec![MyColor::from_rgb(0, 0, 255)]; - let row3 = - DataRow::from_cells1_sized(RowId::new(), entity_path, timeless, colors3.clone()).unwrap(); - insert_and_react(&mut store, &mut caches, &row3); + let chunk3 = Chunk::builder(entity_path.into()) + .with_component_batch(row_id3, timeless, &colors3) + .build() + .unwrap(); + insert_and_react(&mut store, &mut caches, &Arc::new(chunk3)); let expected_colors = &[ - ((TimeInt::STATIC, row3.row_id()), colors3.as_slice()), // + ((TimeInt::STATIC, row_id3), colors3.as_slice()), // ]; query_and_compare( &caches, @@ -866,8 +832,8 @@ fn invalidation_static() { // See . 
#[test] -fn concurrent_multitenant_edge_case() -> anyhow::Result<()> { - let mut store = DataStore::new( +fn concurrent_multitenant_edge_case() { + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -881,27 +847,27 @@ fn concurrent_multitenant_edge_case() -> anyhow::Result<()> { MyPoint::new(point_value, point_value + 1.0), MyPoint::new(point_value + 2.0, point_value + 3.0), ]; - let row = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint, - points.clone(), - )?; - Ok::<_, DataReadError>((timepoint, points, row)) + let chunk = Arc::new( + Chunk::builder(entity_path.clone()) + .with_component_batch(RowId::new(), timepoint, &points) + .build() + .unwrap(), + ); + (timepoint, points, chunk) }; - let (timepoint1, points1, row1) = add_points(123, 1.0)?; - insert_and_react(&mut store, &mut caches, &row1); - let (_timepoint2, points2, row2) = add_points(223, 2.0)?; - insert_and_react(&mut store, &mut caches, &row2); - let (_timepoint3, points3, row3) = add_points(323, 3.0)?; - insert_and_react(&mut store, &mut caches, &row3); + let (timepoint1, points1, chunk1) = add_points(123, 1.0); + insert_and_react(&mut store, &mut caches, &chunk1); + let (_timepoint2, points2, chunk2) = add_points(223, 2.0); + insert_and_react(&mut store, &mut caches, &chunk2); + let (_timepoint3, points3, chunk3) = add_points(323, 3.0); + insert_and_react(&mut store, &mut caches, &chunk3); // --- Tenant #1 queries the data, but doesn't cache the result in the deserialization cache --- - let query = re_data_store::RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::EVERYTHING); + let query = RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::EVERYTHING); - eprintln!("{}", store.to_data_table().unwrap()); + eprintln!("{store}"); { let cached = caches.range( @@ -916,31 +882,29 @@ fn concurrent_multitenant_edge_case() -> anyhow::Result<()> { // --- Meanwhile, tenant #2 queries and deserializes the data --- - let query = re_data_store::RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::EVERYTHING); + let query = RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::EVERYTHING); let expected_points = &[ ( - (TimeInt::new_temporal(123), row1.row_id()), + (TimeInt::new_temporal(123), chunk1.row_id_range().unwrap().0), points1.as_slice(), ), // ( - (TimeInt::new_temporal(223), row2.row_id()), + (TimeInt::new_temporal(223), chunk2.row_id_range().unwrap().0), points2.as_slice(), ), // ( - (TimeInt::new_temporal(323), row3.row_id()), + (TimeInt::new_temporal(323), chunk3.row_id_range().unwrap().0), points3.as_slice(), ), // ]; query_and_compare(&caches, &store, &query, &entity_path, expected_points, &[]); - - Ok(()) } // See . 
#[test] -fn concurrent_multitenant_edge_case2() -> anyhow::Result<()> { - let mut store = DataStore::new( +fn concurrent_multitenant_edge_case2() { + let mut store = ChunkStore::new( re_log_types::StoreId::random(re_log_types::StoreKind::Recording), Default::default(), ); @@ -954,29 +918,29 @@ fn concurrent_multitenant_edge_case2() -> anyhow::Result<()> { MyPoint::new(point_value, point_value + 1.0), MyPoint::new(point_value + 2.0, point_value + 3.0), ]; - let row = DataRow::from_cells1_sized( - RowId::new(), - entity_path.clone(), - timepoint, - points.clone(), - )?; - Ok::<_, DataReadError>((timepoint, points, row)) + let chunk = Arc::new( + Chunk::builder(entity_path.clone()) + .with_component_batch(RowId::new(), timepoint, &points) + .build() + .unwrap(), + ); + (timepoint, points, chunk) }; - let (timepoint1, points1, row1) = add_points(123, 1.0)?; - insert_and_react(&mut store, &mut caches, &row1); - let (_timepoint2, points2, row2) = add_points(223, 2.0)?; - insert_and_react(&mut store, &mut caches, &row2); - let (_timepoint3, points3, row3) = add_points(323, 3.0)?; - insert_and_react(&mut store, &mut caches, &row3); - let (_timepoint4, points4, row4) = add_points(423, 4.0)?; - insert_and_react(&mut store, &mut caches, &row4); - let (_timepoint5, points5, row5) = add_points(523, 5.0)?; - insert_and_react(&mut store, &mut caches, &row5); + let (timepoint1, points1, chunk1) = add_points(123, 1.0); + insert_and_react(&mut store, &mut caches, &chunk1); + let (_timepoint2, points2, chunk2) = add_points(223, 2.0); + insert_and_react(&mut store, &mut caches, &chunk2); + let (_timepoint3, points3, chunk3) = add_points(323, 3.0); + insert_and_react(&mut store, &mut caches, &chunk3); + let (_timepoint4, points4, chunk4) = add_points(423, 4.0); + insert_and_react(&mut store, &mut caches, &chunk4); + let (_timepoint5, points5, chunk5) = add_points(523, 5.0); + insert_and_react(&mut store, &mut caches, &chunk5); // --- Tenant #1 queries the data at (123, 223), but doesn't cache the result in the deserialization cache --- - let query1 = re_data_store::RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::new(123, 223)); + let query1 = RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::new(123, 223)); { let cached = caches.range( &store, @@ -990,7 +954,7 @@ fn concurrent_multitenant_edge_case2() -> anyhow::Result<()> { // --- Tenant #2 queries the data at (423, 523), but doesn't cache the result in the deserialization cache --- - let query2 = re_data_store::RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::new(423, 523)); + let query2 = RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::new(423, 523)); { let cached = caches.range( &store, @@ -1004,18 +968,18 @@ fn concurrent_multitenant_edge_case2() -> anyhow::Result<()> { // --- Tenant #2 queries the data at (223, 423) and deserializes it --- - let query3 = re_data_store::RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::new(223, 423)); + let query3 = RangeQuery::new(timepoint1[0].0, ResolvedTimeRange::new(223, 423)); let expected_points = &[ ( - (TimeInt::new_temporal(223), row2.row_id()), + (TimeInt::new_temporal(223), chunk2.row_id_range().unwrap().0), points2.as_slice(), ), // ( - (TimeInt::new_temporal(323), row3.row_id()), + (TimeInt::new_temporal(323), chunk3.row_id_range().unwrap().0), points3.as_slice(), ), // ( - (TimeInt::new_temporal(423), row4.row_id()), + (TimeInt::new_temporal(423), chunk4.row_id_range().unwrap().0), points4.as_slice(), ), // ]; @@ -1025,11 +989,11 @@ fn concurrent_multitenant_edge_case2() -> anyhow::Result<()> 
{ let expected_points = &[ ( - (TimeInt::new_temporal(123), row1.row_id()), + (TimeInt::new_temporal(123), chunk1.row_id_range().unwrap().0), points1.as_slice(), ), // ( - (TimeInt::new_temporal(223), row2.row_id()), + (TimeInt::new_temporal(223), chunk2.row_id_range().unwrap().0), points2.as_slice(), ), // ]; @@ -1039,28 +1003,26 @@ fn concurrent_multitenant_edge_case2() -> anyhow::Result<()> { let expected_points = &[ ( - (TimeInt::new_temporal(423), row4.row_id()), + (TimeInt::new_temporal(423), chunk4.row_id_range().unwrap().0), points4.as_slice(), ), // ( - (TimeInt::new_temporal(523), row5.row_id()), + (TimeInt::new_temporal(523), chunk5.row_id_range().unwrap().0), points5.as_slice(), ), // ]; query_and_compare(&caches, &store, &query2, &entity_path, expected_points, &[]); - - Ok(()) } -// --- +// // --- -fn insert_and_react(store: &mut DataStore, caches: &mut Caches, row: &DataRow) { - caches.on_events(&[store.insert_row(row).unwrap()]); +fn insert_and_react(store: &mut ChunkStore, caches: &mut Caches, chunk: &Arc<Chunk>) { + caches.on_events(&[store.insert_chunk(chunk).unwrap().unwrap()]); } fn query_and_compare( caches: &Caches, - store: &DataStore, + store: &ChunkStore, query: &RangeQuery, entity_path: &EntityPath, expected_all_points_indexed: &[((TimeInt, RowId), &[MyPoint])], @@ -1097,8 +1059,8 @@ fn query_and_compare( )); let cached_all_colors_indexed = cached_all_colors.range_indexed(); - // eprintln!("{query:?}"); - // eprintln!("{}", store.to_data_table().unwrap()); + eprintln!("{query:?}"); + eprintln!("{store}"); similar_asserts::assert_eq!( expected_all_points_indexed, diff --git a/crates/re_renderer/README.md b/crates/re_renderer/README.md index a3adadb3d911..958b5f5b5401 100644 --- a/crates/re_renderer/README.md +++ b/crates/re_renderer/README.md @@ -23,7 +23,7 @@ Goals & philosophy: * Automatic resource re-use & caching * Lazy loading whenever possible for best startup performance * Run great both on the desktop and web -* No dependencies on `re_viewer` or Rerun data store libraries +* No dependencies on `re_viewer` or Rerun chunk store libraries ## Debugging diff --git a/crates/re_sdk/Cargo.toml b/crates/re_sdk/Cargo.toml index ce8adf3014e8..0552f77db2f5 100644 --- a/crates/re_sdk/Cargo.toml +++ b/crates/re_sdk/Cargo.toml @@ -82,7 +82,7 @@ webbrowser = { workspace = true, optional = true } libc.workspace = true [dev-dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true ndarray-rand.workspace = true ndarray.workspace = true diff --git a/crates/re_sdk/src/lib.rs b/crates/re_sdk/src/lib.rs index af2d0824bbd6..56c7a6f018d8 100644 --- a/crates/re_sdk/src/lib.rs +++ b/crates/re_sdk/src/lib.rs @@ -78,9 +78,9 @@ pub mod sink { pub mod log { pub use re_chunk::{ Chunk, ChunkBatcher, ChunkBatcherConfig, ChunkBatcherError, ChunkBatcherResult, ChunkError, - ChunkResult, PendingRow, TransportChunk, + ChunkId, ChunkResult, PendingRow, RowId, TransportChunk, }; - pub use re_log_types::{DataCell, DataRow, DataTable, LogMsg, RowId, TableId}; + pub use re_log_types::LogMsg; } /// Time-related types.
diff --git a/crates/re_sdk/src/recording_stream.rs b/crates/re_sdk/src/recording_stream.rs index b5ae4a388bfb..21d80e679489 100644 --- a/crates/re_sdk/src/recording_stream.rs +++ b/crates/re_sdk/src/recording_stream.rs @@ -9,11 +9,11 @@ use crossbeam::channel::{Receiver, Sender}; use itertools::Either; use parking_lot::Mutex; -use re_chunk::{Chunk, ChunkBatcher, ChunkBatcherConfig, ChunkBatcherError, PendingRow}; +use re_chunk::{Chunk, ChunkBatcher, ChunkBatcherConfig, ChunkBatcherError, PendingRow, RowId}; use re_log_types::{ - ApplicationId, ArrowChunkReleaseCallback, ArrowMsg, BlueprintActivationCommand, DataCellError, - EntityPath, LogMsg, RowId, StoreId, StoreInfo, StoreKind, StoreSource, TableId, Time, TimeInt, - TimePoint, TimeType, Timeline, TimelineName, + ApplicationId, ArrowChunkReleaseCallback, BlueprintActivationCommand, EntityPath, LogMsg, + StoreId, StoreInfo, StoreKind, StoreSource, Time, TimeInt, TimePoint, TimeType, Timeline, + TimelineName, }; use re_types_core::{AsComponents, ComponentBatch, SerializationError}; @@ -54,10 +54,6 @@ pub enum RecordingStreamError { #[error("Failed to spawn the underlying batcher: {0}")] ChunkBatcher(#[from] ChunkBatcherError), - /// Error within the underlying data cell. - #[error("Failed to instantiate data cell: {0}")] - DataCell(#[from] DataCellError), - /// Error within the underlying serializer. #[error("Failed to serialize component data: {0}")] Serialization(#[from] SerializationError), @@ -81,10 +77,6 @@ pub enum RecordingStreamError { #[error(transparent)] WebSink(#[from] crate::web_viewer::WebViewerSinkError), - /// An error that can occur because a row in the store has inconsistent columns. - #[error(transparent)] - DataReadError(#[from] re_log_types::DataReadError), - /// An error occurred while attempting to use a [`re_data_loader::DataLoader`]. #[cfg(feature = "data_loaders")] #[error(transparent)] @@ -740,7 +732,7 @@ impl RecordingStreamInner { ); sink.send( re_log_types::SetStoreInfo { - row_id: re_log_types::RowId::new(), + row_id: *RowId::new(), info: info.clone(), } .into(), @@ -1220,7 +1212,7 @@ fn forwarding_thread( ); new_sink.send( re_log_types::SetStoreInfo { - row_id: re_log_types::RowId::new(), + row_id: *RowId::new(), info: info.clone(), } .into(), @@ -1252,25 +1244,15 @@ fn forwarding_thread( // NOTE: Always pop chunks first, this is what makes `Command::PopPendingChunks` possible, // which in turns makes `RecordingStream::flush_blocking` well defined. while let Ok(chunk) = chunks.try_recv() { - let timepoint_max = chunk.timepoint_max(); - let chunk = match chunk.to_transport() { + let mut msg = match chunk.to_arrow_msg() { Ok(chunk) => chunk, Err(err) => { re_log::error!(%err, "couldn't serialize chunk; data dropped (this is a bug in Rerun!)"); continue; } }; - - sink.send(LogMsg::ArrowMsg( - info.store_id.clone(), - ArrowMsg { - table_id: TableId::new(), - timepoint_max, - schema: chunk.schema, - chunk: chunk.data, - on_release: on_release.clone(), - }, - )); + msg.on_release = on_release.clone(); + sink.send(LogMsg::ArrowMsg(info.store_id.clone(), msg)); } select! 
{ @@ -1282,8 +1264,7 @@ fn forwarding_thread( break; }; - let timepoint_max = chunk.timepoint_max(); - let chunk = match chunk.to_transport() { + let msg = match chunk.to_arrow_msg() { Ok(chunk) => chunk, Err(err) => { re_log::error!(%err, "couldn't serialize chunk; data dropped (this is a bug in Rerun!)"); @@ -1291,16 +1272,7 @@ fn forwarding_thread( } }; - sink.send(LogMsg::ArrowMsg( - info.store_id.clone(), - ArrowMsg { - table_id: TableId::new(), - timepoint_max, - schema: chunk.schema, - chunk: chunk.data, - on_release: on_release.clone(), - }, - )); + sink.send(LogMsg::ArrowMsg(info.store_id.clone(), msg)); } recv(cmds_rx) -> res => { @@ -2075,7 +2047,6 @@ impl RecordingStream { #[cfg(test)] mod tests { use re_chunk::TransportChunk; - use re_log_types::RowId; use super::*; @@ -2111,7 +2082,7 @@ mod tests { // buffered mode. match msgs.pop().unwrap() { LogMsg::SetStoreInfo(msg) => { - assert!(msg.row_id != RowId::ZERO); + assert!(msg.row_id != *RowId::ZERO); similar_asserts::assert_eq!(store_info, msg.info); } _ => panic!("expected SetStoreInfo"), @@ -2122,7 +2093,7 @@ mod tests { // This arrives _before_ the data itself since we're using manual flushing. match msgs.pop().unwrap() { LogMsg::SetStoreInfo(msg) => { - assert!(msg.row_id != RowId::ZERO); + assert!(msg.row_id != *RowId::ZERO); similar_asserts::assert_eq!(store_info, msg.info); } _ => panic!("expected SetStoreInfo"), @@ -2175,7 +2146,7 @@ mod tests { // buffered mode. match msgs.pop().unwrap() { LogMsg::SetStoreInfo(msg) => { - assert!(msg.row_id != RowId::ZERO); + assert!(msg.row_id != *RowId::ZERO); similar_asserts::assert_eq!(store_info, msg.info); } _ => panic!("expected SetStoreInfo"), @@ -2186,7 +2157,7 @@ mod tests { // This arrives _before_ the data itself since we're using manual flushing. match msgs.pop().unwrap() { LogMsg::SetStoreInfo(msg) => { - assert!(msg.row_id != RowId::ZERO); + assert!(msg.row_id != *RowId::ZERO); similar_asserts::assert_eq!(store_info, msg.info); } _ => panic!("expected SetStoreInfo"), @@ -2244,7 +2215,7 @@ mod tests { // to in-memory mode. match msgs.pop().unwrap() { LogMsg::SetStoreInfo(msg) => { - assert!(msg.row_id != RowId::ZERO); + assert!(msg.row_id != *RowId::ZERO); similar_asserts::assert_eq!(store_info, msg.info); } _ => panic!("expected SetStoreInfo"), @@ -2254,7 +2225,7 @@ mod tests { // TODO(jleibs): Avoid a redundant StoreInfo message. 
match msgs.pop().unwrap() { LogMsg::SetStoreInfo(msg) => { - assert!(msg.row_id != RowId::ZERO); + assert!(msg.row_id != *RowId::ZERO); similar_asserts::assert_eq!(store_info, msg.info); } _ => panic!("expected SetStoreInfo"), diff --git a/crates/re_selection_panel/Cargo.toml b/crates/re_selection_panel/Cargo.toml index a8bff8565303..4c6ddfba6474 100644 --- a/crates/re_selection_panel/Cargo.toml +++ b/crates/re_selection_panel/Cargo.toml @@ -19,8 +19,9 @@ workspace = true all-features = true [dependencies] +re_chunk.workspace = true re_context_menu.workspace = true -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_entity_db.workspace = true re_log_types.workspace = true diff --git a/crates/re_selection_panel/src/defaults_ui.rs b/crates/re_selection_panel/src/defaults_ui.rs index 4b10e4629221..f0c2313acea6 100644 --- a/crates/re_selection_panel/src/defaults_ui.rs +++ b/crates/re_selection_panel/src/defaults_ui.rs @@ -1,9 +1,11 @@ use std::collections::{BTreeMap, BTreeSet}; use itertools::Itertools as _; -use re_data_store::LatestAtQuery; + +use re_chunk::{Chunk, RowId}; +use re_chunk_store::LatestAtQuery; use re_data_ui::{sorted_component_list_for_ui, DataUi as _}; -use re_log_types::{DataCell, DataRow, EntityPath, RowId}; +use re_log_types::EntityPath; use re_types_core::ComponentName; use re_ui::{list_item::LabelContent, UiExt as _}; use re_viewer_context::{ @@ -269,10 +271,13 @@ fn add_popup_ui( // Present the option to add new components for each component that doesn't // already have an active override. - for (component, viz) in component_to_vis { + for (component_name, viz) in component_to_vis { + #[allow(clippy::blocks_in_conditions)] if ui - .button(component.short_name()) - .on_hover_ui(|ui| component.data_ui_recording(ctx.viewer_ctx, ui, UiLayout::Tooltip)) + .button(component_name.short_name()) + .on_hover_ui(|ui| { + component_name.data_ui_recording(ctx.viewer_ctx, ui, UiLayout::Tooltip); + }) .clicked() { // We are creating a new override. We need to decide what initial value to give it. @@ -281,34 +286,33 @@ fn add_popup_ui( // - Finally, fall back on the default value from the component registry. // TODO(jleibs): Is this the right place for fallbacks to come from? 
- let Some(mut initial_data) = ctx + let Some(initial_data) = ctx .visualizer_collection .get_by_identifier(viz) .ok() - .and_then(|sys| { - sys.fallback_for(&query_context, component) - .map(|fallback| DataCell::from_arrow(component, fallback)) - .ok() - }) + .and_then(|sys| sys.fallback_for(&query_context, component_name).ok()) else { - re_log::warn!("Could not identify an initial value for: {}", component); + re_log::warn!( + "Could not identify an initial value for: {}", + component_name + ); return; }; - initial_data.compute_size_bytes(); - - match DataRow::from_cells( - RowId::new(), - ctx.blueprint_timepoint_for_writes(), - defaults_path.clone(), - [initial_data], - ) { - Ok(row) => { + match Chunk::builder(defaults_path.clone()) + .with_row( + RowId::new(), + ctx.blueprint_timepoint_for_writes(), + [(component_name, initial_data)], + ) + .build() + { + Ok(chunk) => { ctx.viewer_ctx .command_sender .send_system(SystemCommand::UpdateBlueprint( ctx.blueprint_db().store_id().clone(), - vec![row], + vec![chunk], )); } Err(err) => { diff --git a/crates/re_selection_panel/src/lib.rs b/crates/re_selection_panel/src/lib.rs index 84d205889026..b3ad8913a73c 100644 --- a/crates/re_selection_panel/src/lib.rs +++ b/crates/re_selection_panel/src/lib.rs @@ -13,7 +13,7 @@ pub use selection_panel::SelectionPanel; #[cfg(test)] mod test { use super::*; - use re_data_store::LatestAtQuery; + use re_chunk_store::LatestAtQuery; use re_viewer_context::{blueprint_timeline, Item, SpaceViewId}; use re_viewport_blueprint::ViewportBlueprint; diff --git a/crates/re_selection_panel/src/visible_time_range_ui.rs b/crates/re_selection_panel/src/visible_time_range_ui.rs index d93d1166316e..897edb32b1ce 100644 --- a/crates/re_selection_panel/src/visible_time_range_ui.rs +++ b/crates/re_selection_panel/src/visible_time_range_ui.rs @@ -131,7 +131,7 @@ fn visible_time_range_ui( &timeline_name, has_individual_range, resolved_query_range, - time_range_override_path.clone(), + time_range_override_path, visible_time_ranges, ); } @@ -142,7 +142,7 @@ fn save_visible_time_ranges( timeline_name: &TimelineName, has_individual_range: bool, query_range: QueryRange, - property_path: EntityPath, + property_path: &EntityPath, mut visible_time_ranges: re_types::blueprint::archetypes::VisibleTimeRanges, ) { if has_individual_range { diff --git a/crates/re_selection_panel/src/visualizer_ui.rs b/crates/re_selection_panel/src/visualizer_ui.rs index 206bafcad48a..5df20514c9ba 100644 --- a/crates/re_selection_panel/src/visualizer_ui.rs +++ b/crates/re_selection_panel/src/visualizer_ui.rs @@ -2,7 +2,7 @@ use itertools::Itertools; use re_data_ui::{sorted_component_list_for_ui, DataUi}; use re_entity_db::EntityDb; -use re_log_types::{DataCell, EntityPath}; +use re_log_types::EntityPath; use re_space_view::latest_at_with_blueprint_resolved_data; use re_types::external::arrow2; use re_types_blueprint::blueprint::components::VisualizerOverrides; @@ -421,7 +421,7 @@ fn editable_blueprint_component_list_item( fn menu_more( ctx: &ViewContext<'_>, ui: &mut egui::Ui, - component: re_types::ComponentName, + component_name: re_types::ComponentName, override_path: &EntityPath, raw_override: &Option>, raw_default: &Option>, @@ -433,7 +433,7 @@ fn menu_more( .on_disabled_hover_text("There's no override active") .clicked() { - ctx.save_empty_blueprint_component_by_name(override_path, component); + ctx.save_empty_blueprint_component_by_name(override_path, component_name); ui.close_menu(); } @@ -446,26 +446,20 @@ fn menu_more( .clicked() { if let 
Some(raw_default) = raw_default.as_ref() { - ctx.save_blueprint_data_cell( - override_path, - DataCell::from_arrow(component, raw_default.clone()), - ); + ctx.save_blueprint_array(override_path, component_name, raw_default.clone()); } ui.close_menu(); } if ui.button("Set to fallback value").clicked() { - ctx.save_blueprint_data_cell( - override_path, - DataCell::from_arrow(component, raw_fallback.to_boxed()), - ); + ctx.save_blueprint_array(override_path, component_name, raw_fallback.to_boxed()); ui.close_menu(); } let override_differs_from_default = raw_override != &ctx .viewer_ctx - .raw_latest_at_in_default_blueprint(override_path, component); + .raw_latest_at_in_default_blueprint(override_path, component_name); if ui .add_enabled( override_differs_from_default, @@ -475,14 +469,15 @@ fn menu_more( .on_disabled_hover_text("Current override is the same as the override specified in the default blueprint (if any)") .clicked() { - ctx.reset_blueprint_component_by_name(override_path, component); + ctx.reset_blueprint_component_by_name(override_path, component_name); ui.close_menu(); } if ui.button("Make default for current view").clicked() { - ctx.save_blueprint_data_cell( + ctx.save_blueprint_array( ctx.defaults_path, - DataCell::from_arrow(component, raw_current_value.to_boxed()), + component_name, + raw_current_value.to_boxed(), ); ui.close_menu(); } diff --git a/crates/re_space_view/Cargo.toml b/crates/re_space_view/Cargo.toml index 6c030d61987c..d7eb17846ff4 100644 --- a/crates/re_space_view/Cargo.toml +++ b/crates/re_space_view/Cargo.toml @@ -22,7 +22,7 @@ all-features = true default = [] [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_entity_db.workspace = true re_log_types.workspace = true re_log.workspace = true diff --git a/crates/re_space_view/src/lib.rs b/crates/re_space_view/src/lib.rs index 18ad173cc9d0..73174f7e308f 100644 --- a/crates/re_space_view/src/lib.rs +++ b/crates/re_space_view/src/lib.rs @@ -24,19 +24,22 @@ pub mod external { // ----------- -use re_entity_db::external::re_data_store; - /// Utility for implementing [`re_viewer_context::VisualizerAdditionalApplicabilityFilter`] using on the properties of a concrete component. 
#[inline] pub fn diff_component_filter( - event: &re_data_store::StoreEvent, + event: &re_chunk_store::ChunkStoreEvent, filter: impl Fn(&T) -> bool, ) -> bool { let filter = &filter; - event.diff.cells.iter().any(|(component_name, cell)| { - component_name == &T::name() - && T::from_arrow(cell.as_arrow_ref()) - .map(|components| components.iter().any(filter)) - .unwrap_or(false) - }) + event + .diff + .chunk + .components() + .get(&T::name()) + .map_or(false, |list_array| { + list_array + .iter() + .filter_map(|array| array.and_then(|array| T::from_arrow(&*array).ok())) + .any(|instances| instances.iter().any(filter)) + }) } diff --git a/crates/re_space_view/src/query.rs b/crates/re_space_view/src/query.rs index 4b8a298381b8..d6f6554739ea 100644 --- a/crates/re_space_view/src/query.rs +++ b/crates/re_space_view/src/query.rs @@ -1,6 +1,6 @@ use nohash_hasher::IntSet; -use re_data_store::{LatestAtQuery, RangeQuery}; +use re_chunk_store::{LatestAtQuery, RangeQuery}; use re_query::LatestAtResults; use re_types_core::ComponentName; use re_viewer_context::{DataResult, ViewContext, ViewerContext}; diff --git a/crates/re_space_view/src/results_ext.rs b/crates/re_space_view/src/results_ext.rs index 6041d29d8e6b..0972dea6905b 100644 --- a/crates/re_space_view/src/results_ext.rs +++ b/crates/re_space_view/src/results_ext.rs @@ -1,5 +1,7 @@ -use re_data_store::{LatestAtQuery, RangeQuery}; -use re_log_types::{external::arrow2, hash::Hash64, RowId, TimeInt}; +use re_chunk_store::RowId; +use re_chunk_store::{LatestAtQuery, RangeQuery}; +use re_log_types::hash::Hash64; +use re_log_types::{external::arrow2, TimeInt}; use re_query::{ LatestAtComponentResults, LatestAtResults, PromiseResolver, PromiseResult, RangeData, RangeResults, Results, diff --git a/crates/re_space_view_bar_chart/Cargo.toml b/crates/re_space_view_bar_chart/Cargo.toml index 486cd0a9670a..92106e4ecf42 100644 --- a/crates/re_space_view_bar_chart/Cargo.toml +++ b/crates/re_space_view_bar_chart/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_entity_db.workspace = true re_log_types.workspace = true re_log.workspace = true diff --git a/crates/re_space_view_bar_chart/src/visualizer_system.rs b/crates/re_space_view_bar_chart/src/visualizer_system.rs index 62d730805fd5..50bbad8799c8 100644 --- a/crates/re_space_view_bar_chart/src/visualizer_system.rs +++ b/crates/re_space_view_bar_chart/src/visualizer_system.rs @@ -1,6 +1,7 @@ use std::collections::BTreeMap; -use re_data_store::LatestAtQuery; +use re_chunk_store::ChunkStoreEvent; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityPath; use re_space_view::{diff_component_filter, DataResultQuery as _}; use re_types::{ @@ -29,8 +30,11 @@ impl IdentifiedViewSystem for BarChartVisualizerSystem { struct BarChartVisualizerEntityFilter; impl VisualizerAdditionalApplicabilityFilter for BarChartVisualizerEntityFilter { - fn update_applicability(&mut self, event: &re_data_store::StoreEvent) -> bool { - diff_component_filter(event, |tensor: &components::TensorData| tensor.is_vector()) + #[inline] + fn update_applicability(&mut self, event: &ChunkStoreEvent) -> bool { + diff_component_filter(event, |tensor: &re_types::components::TensorData| { + tensor.is_vector() + }) } } diff --git a/crates/re_space_view_dataframe/Cargo.toml b/crates/re_space_view_dataframe/Cargo.toml index 6702ada56f2c..483c6804cf68 100644 --- a/crates/re_space_view_dataframe/Cargo.toml +++ 
b/crates/re_space_view_dataframe/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_entity_db.workspace = true re_log_types.workspace = true diff --git a/crates/re_space_view_dataframe/src/space_view_class.rs b/crates/re_space_view_dataframe/src/space_view_class.rs index a0d136f06764..aa86130bf69d 100644 --- a/crates/re_space_view_dataframe/src/space_view_class.rs +++ b/crates/re_space_view_dataframe/src/space_view_class.rs @@ -2,7 +2,7 @@ use std::collections::BTreeSet; use egui_extras::Column; -use re_data_store::{DataStore, LatestAtQuery}; +use re_chunk_store::{ChunkStore, LatestAtQuery}; use re_data_ui::item_ui::instance_path_button; use re_entity_db::InstancePath; use re_log_types::{EntityPath, Instance, Timeline}; @@ -221,7 +221,7 @@ impl SpaceViewClass for DataframeSpaceView { /// Returns a sorted, deduplicated iterator of all instance paths for a given entity. fn sorted_instance_paths_for<'a>( entity_path: &'a EntityPath, - store: &'a DataStore, + store: &'a ChunkStore, timeline: &'a Timeline, latest_at_query: &'a LatestAtQuery, ) -> impl Iterator + 'a { @@ -229,13 +229,20 @@ fn sorted_instance_paths_for<'a>( .all_components(timeline, entity_path) .unwrap_or_default() .into_iter() - .filter(|comp| !comp.is_indicator_component()) - .flat_map(|comp| { + .filter(|component_name| !component_name.is_indicator_component()) + .flat_map(|component_name| { let num_instances = store - .latest_at(latest_at_query, entity_path, comp, &[comp]) - .map_or(0, |(_, _, cells)| { - cells[0].as_ref().map_or(0, |cell| cell.num_instances()) - }); + .latest_at_relevant_chunks(latest_at_query, entity_path, component_name) + .into_iter() + .filter_map(|chunk| { + let (data_time, row_id, batch) = chunk + .latest_at(latest_at_query, component_name) + .iter_rows(timeline, &component_name) + .next()?; + batch.map(|batch| (data_time, row_id, batch)) + }) + .max_by_key(|(data_time, row_id, _)| (*data_time, *row_id)) + .map_or(0, |(_, _, batch)| batch.len()); (0..num_instances).map(|i| Instance::from(i as u64)) }) .collect::>() // dedup and sort diff --git a/crates/re_space_view_spatial/Cargo.toml b/crates/re_space_view_spatial/Cargo.toml index 5a4c16207078..795211046c76 100644 --- a/crates/re_space_view_spatial/Cargo.toml +++ b/crates/re_space_view_spatial/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_entity_db.workspace = true re_error.workspace = true diff --git a/crates/re_space_view_spatial/src/contexts/transform_context.rs b/crates/re_space_view_spatial/src/contexts/transform_context.rs index 5899a24e22bb..e0fff1aa15be 100644 --- a/crates/re_space_view_spatial/src/contexts/transform_context.rs +++ b/crates/re_space_view_spatial/src/contexts/transform_context.rs @@ -1,6 +1,6 @@ use nohash_hasher::IntMap; -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::{EntityDb, EntityPath, EntityTree}; use re_space_view::DataResultQuery as _; use re_types::{ @@ -315,7 +315,7 @@ fn get_cached_transform( fn get_cached_pinhole( entity_path: &re_log_types::EntityPath, entity_db: &EntityDb, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, ) -> Option<(PinholeProjection, ViewCoordinates)> { entity_db .latest_at_component::(entity_path, query) diff --git a/crates/re_space_view_spatial/src/lib.rs 
b/crates/re_space_view_spatial/src/lib.rs index c1c15e169499..b6c609b35cfa 100644 --- a/crates/re_space_view_spatial/src/lib.rs +++ b/crates/re_space_view_spatial/src/lib.rs @@ -51,7 +51,7 @@ mod view_kind { fn resolution_from_tensor( entity_db: &re_entity_db::EntityDb, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, entity_path: &re_log_types::EntityPath, ) -> Option { // TODO(#5607): what should happen if the promise is still pending? @@ -67,7 +67,7 @@ fn resolution_from_tensor( /// Utility for querying a pinhole archetype instance. fn query_pinhole( ctx: &ViewContext<'_>, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, data_result: &re_viewer_context::DataResult, ) -> Option { let results = data_result @@ -98,7 +98,7 @@ fn query_pinhole( // TODO(andreas): This is duplicated into `re_viewport` fn query_pinhole_legacy( entity_db: &re_entity_db::EntityDb, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, entity_path: &re_log_types::EntityPath, ) -> Option { // TODO(#5607): what should happen if the promise is still pending? diff --git a/crates/re_space_view_spatial/src/max_image_dimension_subscriber.rs b/crates/re_space_view_spatial/src/max_image_dimension_subscriber.rs index c0fb58cf4571..83bcc530709e 100644 --- a/crates/re_space_view_spatial/src/max_image_dimension_subscriber.rs +++ b/crates/re_space_view_spatial/src/max_image_dimension_subscriber.rs @@ -1,7 +1,7 @@ use ahash::HashMap; use nohash_hasher::IntMap; use once_cell::sync::OnceCell; -use re_data_store::{StoreSubscriber, StoreSubscriberHandle}; +use re_chunk_store::{ChunkStore, ChunkStoreSubscriber, ChunkStoreSubscriberHandle}; use re_log_types::{EntityPath, StoreId}; use re_types::{components::TensorData, Loggable}; @@ -21,7 +21,7 @@ impl MaxImageDimensions { store_id: &StoreId, f: impl FnOnce(&IntMap) -> T, ) -> Option { - re_data_store::DataStore::with_subscriber_once( + ChunkStore::with_subscriber_once( MaxImageDimensionSubscriber::subscription_handle(), move |subscriber: &MaxImageDimensionSubscriber| { subscriber.max_dimensions.get(store_id).map(|v| &v.0).map(f) @@ -40,37 +40,41 @@ impl MaxImageDimensionSubscriber { /// Accesses the global store subscriber. /// /// Lazily registers the subscriber if it hasn't been registered yet. 
- pub fn subscription_handle() -> StoreSubscriberHandle { - static SUBSCRIPTION: OnceCell<StoreSubscriberHandle> = OnceCell::new(); - *SUBSCRIPTION - .get_or_init(|| re_data_store::DataStore::register_subscriber(Box::<MaxImageDimensionSubscriber>::default())) + pub fn subscription_handle() -> ChunkStoreSubscriberHandle { + static SUBSCRIPTION: OnceCell<ChunkStoreSubscriberHandle> = OnceCell::new(); + *SUBSCRIPTION.get_or_init(|| ChunkStore::register_subscriber(Box::<MaxImageDimensionSubscriber>::default())) } } -impl StoreSubscriber for MaxImageDimensionSubscriber { +impl ChunkStoreSubscriber for MaxImageDimensionSubscriber { + #[inline] fn name(&self) -> String { "MaxImageDimensionStoreSubscriber".to_owned() } + #[inline] fn as_any(&self) -> &dyn std::any::Any { self } + #[inline] fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } - fn on_events(&mut self, events: &[re_data_store::StoreEvent]) { + fn on_events(&mut self, events: &[re_chunk_store::ChunkStoreEvent]) { re_tracing::profile_function!(); for event in events { - if event.diff.kind != re_data_store::StoreDiffKind::Addition { + if event.diff.kind != re_chunk_store::ChunkStoreDiffKind::Addition { // Max image dimensions are strictly additive continue; } - if let Some(cell) = event.diff.cells.get(&TensorData::name()) { - if let Ok(Some(tensor_data)) = cell.try_to_native_mono::<TensorData>() { + if let Some(all_tensor_data) = event.diff.chunk.components().get(&TensorData::name()) { + for tensor_data in all_tensor_data.iter().filter_map(|array| { + array.and_then(|array| TensorData::from_arrow(&*array).ok()?.into_iter().next()) + }) { if let Some([height, width, channels]) = tensor_data.image_height_width_channels() { @@ -79,7 +83,7 @@ impl StoreSubscriber for MaxImageDimensionSubscriber { .entry(event.store_id.clone()) .or_default() .0 - .entry(event.diff.entity_path.clone()) + .entry(event.diff.chunk.entity_path().clone()) .or_default(); dimensions.height = dimensions.height.max(height); diff --git a/crates/re_space_view_spatial/src/spatial_topology.rs b/crates/re_space_view_spatial/src/spatial_topology.rs index 09aecba5421b..4720ec8b0db9 100644 --- a/crates/re_space_view_spatial/src/spatial_topology.rs +++ b/crates/re_space_view_spatial/src/spatial_topology.rs @@ -2,7 +2,10 @@ use once_cell::sync::OnceCell; use ahash::HashMap; use nohash_hasher::{IntMap, IntSet}; -use re_data_store::{StoreSubscriber, StoreSubscriberHandle}; +use re_chunk_store::{ ChunkStore, ChunkStoreDiffKind, ChunkStoreEvent, ChunkStoreSubscriber, ChunkStoreSubscriberHandle, }; use re_log_types::{EntityPath, EntityPathHash, StoreId}; use re_types::{ components::{DisconnectedSpace, PinholeProjection, ViewCoordinates}, @@ -124,31 +127,33 @@ impl SpatialTopologyStoreSubscriber { /// Accesses the global store subscriber. /// /// Lazily registers the subscriber if it hasn't been registered yet.
- pub fn subscription_handle() -> StoreSubscriberHandle { - static SUBSCRIPTION: OnceCell<StoreSubscriberHandle> = OnceCell::new(); - *SUBSCRIPTION - .get_or_init(|| re_data_store::DataStore::register_subscriber(Box::<SpatialTopologyStoreSubscriber>::default())) + pub fn subscription_handle() -> ChunkStoreSubscriberHandle { + static SUBSCRIPTION: OnceCell<ChunkStoreSubscriberHandle> = OnceCell::new(); + *SUBSCRIPTION.get_or_init(|| ChunkStore::register_subscriber(Box::<SpatialTopologyStoreSubscriber>::default())) } } -impl StoreSubscriber for SpatialTopologyStoreSubscriber { +impl ChunkStoreSubscriber for SpatialTopologyStoreSubscriber { + #[inline] fn name(&self) -> String { "SpatialTopologyStoreSubscriber".to_owned() } + #[inline] fn as_any(&self) -> &dyn std::any::Any { self } + #[inline] fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } - fn on_events(&mut self, events: &[re_data_store::StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { re_tracing::profile_function!(); for event in events { - if event.diff.kind != re_data_store::StoreDiffKind::Addition { + if event.diff.kind != ChunkStoreDiffKind::Addition { // Topology is only additive, don't care about removals. continue; } @@ -158,7 +163,10 @@ impl StoreSubscriber for SpatialTopologyStoreSubscriber { self.topologies .entry(event.store_id.clone()) .or_default() - .on_store_diff(&event.diff.entity_path, event.diff.cells.keys()); + .on_store_diff( event.diff.chunk.entity_path(), event.diff.chunk.component_names(), ); } } } @@ -206,7 +214,7 @@ impl Default for SpatialTopology { impl SpatialTopology { /// Accesses the spatial topology for a given store. pub fn access<T>(store_id: &StoreId, f: impl FnOnce(&Self) -> T) -> Option<T> { - re_data_store::DataStore::with_subscriber_once( + ChunkStore::with_subscriber_once( SpatialTopologyStoreSubscriber::subscription_handle(), move |topology_subscriber: &SpatialTopologyStoreSubscriber| { topology_subscriber.topologies.get(store_id).map(f) @@ -261,10 +269,10 @@ impl SpatialTopology { self.subspaces.get(&origin) } - fn on_store_diff<'a>( + fn on_store_diff( &mut self, entity_path: &EntityPath, - added_components: impl Iterator<Item = &'a ComponentName>, + added_components: impl Iterator<Item = ComponentName>, ) { re_tracing::profile_function!(); @@ -272,11 +280,11 @@ impl SpatialTopology { let mut new_heuristic_hints = HeuristicHints::empty(); for added_component in added_components { - if added_component == &DisconnectedSpace::name() { + if added_component == DisconnectedSpace::name() { new_subspace_connections.insert(SubSpaceConnectionFlags::Disconnected); - } else if added_component == &PinholeProjection::name() { + } else if added_component == PinholeProjection::name() { new_subspace_connections.insert(SubSpaceConnectionFlags::Pinhole); - } else if added_component == &ViewCoordinates::name() { + } else if added_component == ViewCoordinates::name() { new_heuristic_hints.insert(HeuristicHints::ViewCoordinates3d); }; } @@ -675,7 +683,7 @@ mod tests { } fn add_diff(topo: &mut SpatialTopology, path: &str, components: &[ComponentName]) { - topo.on_store_diff(&path.into(), components.iter()); + topo.on_store_diff(&path.into(), components.iter().copied()); } fn check_paths_in_space(topo: &SpatialTopology, paths: &[&str], expected_origin: &str) { diff --git a/crates/re_space_view_spatial/src/ui.rs b/crates/re_space_view_spatial/src/ui.rs index 869ab352088e..3862e0f184e2 100644 --- a/crates/re_space_view_spatial/src/ui.rs +++ b/crates/re_space_view_spatial/src/ui.rs @@ -444,7 +444,7 @@ pub fn picking( .contains(&instance_path.entity_path.hash()); struct PickedImageInfo { - row_id: re_log_types::RowId, + row_id:
re_chunk_store::RowId, tensor: TensorData, meaning: TensorDataMeaning, coordinates: [u32; 2], @@ -634,7 +634,7 @@ fn image_hover_ui( ui_clip_rect: egui::Rect, coords: [u32; 2], space_from_ui: egui::emath::RectTransform, - tensor_data_row_id: re_log_types::RowId, + tensor_data_row_id: re_chunk_store::RowId, annotations: &AnnotationSceneContext, meaning: TensorDataMeaning, meter: Option, diff --git a/crates/re_space_view_spatial/src/visualizers/assets3d.rs b/crates/re_space_view_spatial/src/visualizers/assets3d.rs index 926f0352b0a4..2c7904873d99 100644 --- a/crates/re_space_view_spatial/src/visualizers/assets3d.rs +++ b/crates/re_space_view_spatial/src/visualizers/assets3d.rs @@ -1,4 +1,5 @@ -use re_log_types::{hash::Hash64, Instance, RowId, TimeInt}; +use re_chunk_store::RowId; +use re_log_types::{hash::Hash64, Instance, TimeInt}; use re_query::range_zip_1x2; use re_renderer::renderer::MeshInstance; use re_renderer::RenderContext; @@ -13,6 +14,7 @@ use re_viewer_context::{ }; use super::{filter_visualizable_3d_entities, SpatialViewVisualizerData}; + use crate::{ contexts::SpatialSceneEntityContext, instance_hash_conversions::picking_layer_id_from_instance_path_hash, diff --git a/crates/re_space_view_spatial/src/visualizers/cameras.rs b/crates/re_space_view_spatial/src/visualizers/cameras.rs index 79e4907fe637..f56eabae59ef 100644 --- a/crates/re_space_view_spatial/src/visualizers/cameras.rs +++ b/crates/re_space_view_spatial/src/visualizers/cameras.rs @@ -220,7 +220,7 @@ impl VisualizerSystem for CamerasVisualizer { line_builder.radius_boost_in_ui_points_for_outlines(SIZE_BOOST_IN_POINTS_FOR_LINE_OUTLINES); for data_result in query.iter_visible_data_results(ctx, Self::identifier()) { - let time_query = re_data_store::LatestAtQuery::new(query.timeline, query.latest_at); + let time_query = re_chunk_store::LatestAtQuery::new(query.timeline, query.latest_at); if let Some(pinhole) = query_pinhole(ctx, &time_query, data_result) { let entity_highlight = query diff --git a/crates/re_space_view_spatial/src/visualizers/depth_images.rs b/crates/re_space_view_spatial/src/visualizers/depth_images.rs index 907922b7b5b4..59385898c818 100644 --- a/crates/re_space_view_spatial/src/visualizers/depth_images.rs +++ b/crates/re_space_view_spatial/src/visualizers/depth_images.rs @@ -1,8 +1,9 @@ use itertools::Itertools as _; use nohash_hasher::IntSet; +use re_chunk_store::RowId; use re_entity_db::EntityPath; -use re_log_types::{EntityPathHash, RowId, TimeInt}; +use re_log_types::{EntityPathHash, TimeInt}; use re_query::range_zip_1x3; use re_renderer::renderer::{DepthCloud, DepthClouds}; use re_space_view::diff_component_filter; @@ -258,7 +259,7 @@ impl IdentifiedViewSystem for DepthImageVisualizer { struct DepthImageVisualizerEntityFilter; impl VisualizerAdditionalApplicabilityFilter for DepthImageVisualizerEntityFilter { - fn update_applicability(&mut self, event: &re_data_store::StoreEvent) -> bool { + fn update_applicability(&mut self, event: &re_chunk_store::ChunkStoreEvent) -> bool { diff_component_filter(event, |tensor: &re_types::components::TensorData| { tensor.is_shaped_like_an_image() }) diff --git a/crates/re_space_view_spatial/src/visualizers/images.rs b/crates/re_space_view_spatial/src/visualizers/images.rs index 543cedf4048c..1594c3135aaa 100644 --- a/crates/re_space_view_spatial/src/visualizers/images.rs +++ b/crates/re_space_view_spatial/src/visualizers/images.rs @@ -1,6 +1,7 @@ use itertools::Itertools as _; -use re_log_types::{RowId, TimeInt}; +use re_chunk_store::{ChunkStoreEvent, RowId}; 
+use re_log_types::TimeInt; use re_query::range_zip_1x1; use re_space_view::diff_component_filter; use re_types::{ @@ -52,7 +53,7 @@ impl IdentifiedViewSystem for ImageVisualizer { struct ImageVisualizerEntityFilter; impl VisualizerAdditionalApplicabilityFilter for ImageVisualizerEntityFilter { - fn update_applicability(&mut self, event: &re_data_store::StoreEvent) -> bool { + fn update_applicability(&mut self, event: &ChunkStoreEvent) -> bool { diff_component_filter(event, |tensor: &re_types::components::TensorData| { tensor.is_shaped_like_an_image() }) diff --git a/crates/re_space_view_spatial/src/visualizers/meshes.rs b/crates/re_space_view_spatial/src/visualizers/meshes.rs index a564557a2181..652cbe7be81b 100644 --- a/crates/re_space_view_spatial/src/visualizers/meshes.rs +++ b/crates/re_space_view_spatial/src/visualizers/meshes.rs @@ -1,5 +1,6 @@ use itertools::Itertools as _; -use re_log_types::{hash::Hash64, Instance, RowId, TimeInt}; +use re_chunk_store::RowId; +use re_log_types::{hash::Hash64, Instance, TimeInt}; use re_query::range_zip_1x7; use re_renderer::renderer::MeshInstance; use re_renderer::RenderContext; diff --git a/crates/re_space_view_spatial/src/visualizers/segmentation_images.rs b/crates/re_space_view_spatial/src/visualizers/segmentation_images.rs index 9a0b63e1d6fa..72b76b9eebff 100644 --- a/crates/re_space_view_spatial/src/visualizers/segmentation_images.rs +++ b/crates/re_space_view_spatial/src/visualizers/segmentation_images.rs @@ -1,6 +1,7 @@ use itertools::Itertools as _; -use re_log_types::{RowId, TimeInt}; +use re_chunk_store::RowId; +use re_log_types::TimeInt; use re_query::range_zip_1x1; use re_space_view::diff_component_filter; use re_types::{ @@ -52,7 +53,7 @@ impl IdentifiedViewSystem for SegmentationImageVisualizer { struct SegmentationImageVisualizerEntityFilter; impl VisualizerAdditionalApplicabilityFilter for SegmentationImageVisualizerEntityFilter { - fn update_applicability(&mut self, event: &re_data_store::StoreEvent) -> bool { + fn update_applicability(&mut self, event: &re_chunk_store::ChunkStoreEvent) -> bool { diff_component_filter(event, |tensor: &re_types::components::TensorData| { tensor.is_shaped_like_an_image() }) diff --git a/crates/re_space_view_spatial/src/visualizers/transform3d_arrows.rs b/crates/re_space_view_spatial/src/visualizers/transform3d_arrows.rs index b2676d76f32f..e0fe378bf902 100644 --- a/crates/re_space_view_spatial/src/visualizers/transform3d_arrows.rs +++ b/crates/re_space_view_spatial/src/visualizers/transform3d_arrows.rs @@ -61,7 +61,7 @@ impl VisualizerSystem for Transform3DArrowsVisualizer { let transforms = context_systems.get::()?; - let latest_at_query = re_data_store::LatestAtQuery::new(query.timeline, query.latest_at); + let latest_at_query = re_chunk_store::LatestAtQuery::new(query.timeline, query.latest_at); // Counting all transforms ahead of time is a bit wasteful, but we also don't expect a huge amount, // so let re_renderer's allocator internally decide what buffer sizes to pick & grow them as we go. 
diff --git a/crates/re_space_view_spatial/src/visualizers/utilities/entity_iterator.rs b/crates/re_space_view_spatial/src/visualizers/utilities/entity_iterator.rs index f95a546bf959..de2da7a26d47 100644 --- a/crates/re_space_view_spatial/src/visualizers/utilities/entity_iterator.rs +++ b/crates/re_space_view_spatial/src/visualizers/utilities/entity_iterator.rs @@ -1,5 +1,5 @@ use itertools::Either; -use re_data_store::{LatestAtQuery, RangeQuery}; +use re_chunk_store::{LatestAtQuery, RangeQuery}; use re_log_types::{TimeInt, Timeline}; use re_space_view::{ latest_at_with_blueprint_resolved_data, range_with_blueprint_resolved_data, HybridResults, diff --git a/crates/re_space_view_spatial/src/visualizers/utilities/textured_rect.rs b/crates/re_space_view_spatial/src/visualizers/utilities/textured_rect.rs index 0e4e0fb3ecdb..4973780446ea 100644 --- a/crates/re_space_view_spatial/src/visualizers/utilities/textured_rect.rs +++ b/crates/re_space_view_spatial/src/visualizers/utilities/textured_rect.rs @@ -1,3 +1,4 @@ +use re_chunk_store::RowId; use re_log_types::EntityPath; use re_renderer::renderer; use re_types::{ @@ -13,7 +14,7 @@ pub fn tensor_to_textured_rect( ctx: &ViewerContext<'_>, ent_path: &EntityPath, ent_context: &SpatialSceneEntityContext<'_>, - tensor_data_row_id: re_log_types::RowId, + tensor_data_row_id: RowId, tensor: &DecodedTensor, meaning: TensorDataMeaning, multiplicative_tint: egui::Rgba, diff --git a/crates/re_space_view_tensor/Cargo.toml b/crates/re_space_view_tensor/Cargo.toml index f48cdc08b89b..0abc75aa0608 100644 --- a/crates/re_space_view_tensor/Cargo.toml +++ b/crates/re_space_view_tensor/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_entity_db.workspace = true re_log_types.workspace = true diff --git a/crates/re_space_view_tensor/src/space_view_class.rs b/crates/re_space_view_tensor/src/space_view_class.rs index cdac653ab879..462a82427065 100644 --- a/crates/re_space_view_tensor/src/space_view_class.rs +++ b/crates/re_space_view_tensor/src/space_view_class.rs @@ -2,8 +2,9 @@ use egui::{epaint::TextShape, Align2, NumExt as _, Vec2}; use ndarray::Axis; use re_space_view::{suggest_space_view_for_each_entity, view_property_ui}; +use re_chunk_store::RowId; use re_data_ui::tensor_summary_ui_grid_contents; -use re_log_types::{EntityPath, RowId}; +use re_log_types::EntityPath; use re_types::{ blueprint::{ archetypes::{TensorScalarMapping, TensorSliceSelection, TensorViewFit}, diff --git a/crates/re_space_view_tensor/src/tensor_slice_to_gpu.rs b/crates/re_space_view_tensor/src/tensor_slice_to_gpu.rs index a050360454d4..b8be398f9773 100644 --- a/crates/re_space_view_tensor/src/tensor_slice_to_gpu.rs +++ b/crates/re_space_view_tensor/src/tensor_slice_to_gpu.rs @@ -1,4 +1,4 @@ -use re_log_types::RowId; +use re_chunk_store::RowId; use re_renderer::{ renderer::{ColormappedTexture, ShaderDecoding}, resource_managers::{GpuTexture2D, Texture2DCreationDesc, TextureManager2DError}, diff --git a/crates/re_space_view_tensor/src/visualizer_system.rs b/crates/re_space_view_tensor/src/visualizer_system.rs index 527d9fb69d87..be4d5acd156a 100644 --- a/crates/re_space_view_tensor/src/visualizer_system.rs +++ b/crates/re_space_view_tensor/src/visualizer_system.rs @@ -1,6 +1,5 @@ -use re_data_store::LatestAtQuery; +use re_chunk_store::{LatestAtQuery, RowId}; use re_entity_db::{external::re_query::LatestAtMonoResult, EntityPath}; -use re_log_types::RowId; use 
re_types::{archetypes::Tensor, components::TensorData, tensor_data::DecodedTensor}; use re_viewer_context::{ IdentifiedViewSystem, SpaceViewSystemExecutionError, TensorDecodeCache, ViewContext, diff --git a/crates/re_space_view_text_document/Cargo.toml b/crates/re_space_view_text_document/Cargo.toml index 7c9c5dcd5c72..d95c38aa3579 100644 --- a/crates/re_space_view_text_document/Cargo.toml +++ b/crates/re_space_view_text_document/Cargo.toml @@ -24,7 +24,7 @@ default = ["markdown"] markdown = ["dep:egui_commonmark"] [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_renderer.workspace = true re_space_view.workspace = true re_tracing.workspace = true diff --git a/crates/re_space_view_text_document/src/visualizer_system.rs b/crates/re_space_view_text_document/src/visualizer_system.rs index 6f9d1229e6bc..d0e573c93fe7 100644 --- a/crates/re_space_view_text_document/src/visualizer_system.rs +++ b/crates/re_space_view_text_document/src/visualizer_system.rs @@ -1,4 +1,4 @@ -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_space_view::DataResultQuery as _; use re_types::{ archetypes::TextDocument, diff --git a/crates/re_space_view_text_log/Cargo.toml b/crates/re_space_view_text_log/Cargo.toml index af00490278a2..9f2214ea0d2c 100644 --- a/crates/re_space_view_text_log/Cargo.toml +++ b/crates/re_space_view_text_log/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_entity_db.workspace = true re_log_types.workspace = true diff --git a/crates/re_space_view_text_log/src/space_view_class.rs b/crates/re_space_view_text_log/src/space_view_class.rs index e1edfd61594a..16efd99d3b73 100644 --- a/crates/re_space_view_text_log/src/space_view_class.rs +++ b/crates/re_space_view_text_log/src/space_view_class.rs @@ -264,17 +264,6 @@ impl ViewTextFilters { // --- -fn get_time_point(ctx: &ViewerContext<'_>, entry: &Entry) -> Option<TimePoint> { - if let Some((time_point, _)) = ctx.recording_store().row_metadata(&entry.row_id) { - Some(time_point.clone()) - } else { - if !entry.time.is_static() { - re_log::warn_once!("Missing metadata for {:?}", entry.entity_path); - } - None - } -} - /// `scroll_to_row` indicates how far down we want to scroll in terms of logical rows, /// as opposed to `scroll_to_offset` (computed below) which is how far down we want to /// scroll in terms of actual points. @@ -285,12 +274,20 @@ fn table_ui( entries: &[&Entry], scroll_to_row: Option<usize>, ) { - let timelines = state - .filters - .col_timelines - .iter() - .filter_map(|(timeline, visible)| visible.then_some(timeline)) - .collect::<Vec<_>>(); + let timelines = vec![*ctx.rec_cfg.time_ctrl.read().timeline()]; + + // TODO(#6611): This regressed because adding a metadata registry in the store is an antipattern. + // + // We'll bring back the multi-timeline display once we get rid of the native cache and start + // exposing chunks directly instead. + // Since chunks embed the data for all associated timelines, there'll be no extra work needed + // to get that information out.
+ // let timelines = state + // .filters + // .col_timelines + // .iter() + // .filter_map(|(timeline, visible)| visible.then_some(timeline)) + // .collect::<Vec<_>>(); use egui_extras::Column; @@ -364,18 +361,17 @@ fn table_ui( let entry = &entries[row.index()]; // timeline(s) - let time_point = get_time_point(ctx, entry); + let timepoint: TimePoint = [(global_timeline, entry.time)].into(); for timeline in &timelines { row.col(|ui| { - let row_time = time_point - .as_ref() - .and_then(|t| t.get(timeline)) + let row_time = timepoint + .get(timeline) .copied() .unwrap_or(re_log_types::TimeInt::STATIC); item_ui::time_button(ctx, ui, timeline, row_time); if let Some(global_time) = global_time { - if *timeline == &global_timeline { + if timeline == &global_timeline { #[allow(clippy::comparison_chain)] if global_time < row_time { // We've past the global time - it is thus above this row. diff --git a/crates/re_space_view_text_log/src/visualizer_system.rs b/crates/re_space_view_text_log/src/visualizer_system.rs index 8a3ed07840a5..8bfede64239a 100644 --- a/crates/re_space_view_text_log/src/visualizer_system.rs +++ b/crates/re_space_view_text_log/src/visualizer_system.rs @@ -1,6 +1,7 @@ -use re_data_store::ResolvedTimeRange; +use re_chunk_store::ResolvedTimeRange; +use re_chunk_store::RowId; use re_entity_db::EntityPath; -use re_log_types::{RowId, TimeInt}; +use re_log_types::TimeInt; use re_query::{clamped_zip_1x2, range_zip_1x2}; use re_space_view::{range_with_blueprint_resolved_data, RangeResultsExt}; use re_types::{ @@ -16,15 +17,10 @@ use re_viewer_context::{ #[derive(Debug, Clone)] pub struct Entry { pub row_id: RowId, - pub entity_path: EntityPath, - pub time: TimeInt, - pub color: Option<Color>, - pub body: Text, - pub level: Option<TextLogLevel>, } @@ -54,7 +50,7 @@ impl VisualizerSystem for TextLogSystem { re_tracing::profile_function!(); let query = - re_data_store::RangeQuery::new(view_query.timeline, ResolvedTimeRange::EVERYTHING); + re_chunk_store::RangeQuery::new(view_query.timeline, ResolvedTimeRange::EVERYTHING); for data_result in view_query.iter_visible_data_results(ctx, Self::identifier()) { if let Err(err) = self.process_entity(ctx, &query, data_result) { @@ -88,7 +84,7 @@ impl TextLogSystem { fn process_entity( &mut self, ctx: &ViewContext<'_>, - query: &re_data_store::RangeQuery, + query: &re_chunk_store::RangeQuery, data_result: &re_viewer_context::DataResult, ) -> Result<(), SpaceViewSystemExecutionError> { re_tracing::profile_function!(); diff --git a/crates/re_space_view_time_series/Cargo.toml b/crates/re_space_view_time_series/Cargo.toml index d679e5b54f1f..51a350ceabbc 100644 --- a/crates/re_space_view_time_series/Cargo.toml +++ b/crates/re_space_view_time_series/Cargo.toml @@ -19,7 +19,7 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk_store.workspace = true re_format.workspace = true re_log.workspace = true re_log_types.workspace = true diff --git a/crates/re_space_view_time_series/src/line_visualizer_system.rs b/crates/re_space_view_time_series/src/line_visualizer_system.rs index ba8dfcefcd9b..b84d17b43c58 100644 --- a/crates/re_space_view_time_series/src/line_visualizer_system.rs +++ b/crates/re_space_view_time_series/src/line_visualizer_system.rs @@ -188,7 +188,7 @@ impl SeriesLineSystem { re_tracing::profile_scope!("primary", &data_result.entity_path.to_string()); let entity_path = &data_result.entity_path; - let query = re_data_store::RangeQuery::new(view_query.timeline, time_range); + let query =
re_chunk_store::RangeQuery::new(view_query.timeline, time_range); let results = range_with_blueprint_resolved_data( ctx, diff --git a/crates/re_space_view_time_series/src/point_visualizer_system.rs b/crates/re_space_view_time_series/src/point_visualizer_system.rs index 1869fa92bb68..96f4daa0339d 100644 --- a/crates/re_space_view_time_series/src/point_visualizer_system.rs +++ b/crates/re_space_view_time_series/src/point_visualizer_system.rs @@ -149,7 +149,7 @@ impl SeriesPointSystem { re_tracing::profile_scope!("primary", &data_result.entity_path.to_string()); let entity_path = &data_result.entity_path; - let query = re_data_store::RangeQuery::new(view_query.timeline, time_range); + let query = re_chunk_store::RangeQuery::new(view_query.timeline, time_range); let results = range_with_blueprint_resolved_data( ctx, diff --git a/crates/re_space_view_time_series/src/space_view_class.rs b/crates/re_space_view_time_series/src/space_view_class.rs index 911b1712056f..1b338e986ca7 100644 --- a/crates/re_space_view_time_series/src/space_view_class.rs +++ b/crates/re_space_view_time_series/src/space_view_class.rs @@ -2,7 +2,7 @@ use egui::ahash::{HashMap, HashSet}; use egui_plot::{Legend, Line, Plot, PlotPoint, Points}; -use re_data_store::TimeType; +use re_chunk_store::TimeType; use re_format::next_grid_tick_magnitude_ns; use re_log_types::{EntityPath, TimeInt, TimeZone}; use re_space_view::{controls, view_property_ui}; diff --git a/crates/re_space_view_time_series/src/util.rs b/crates/re_space_view_time_series/src/util.rs index 3baf3ad0e79b..fbce06aeecbe 100644 --- a/crates/re_space_view_time_series/src/util.rs +++ b/crates/re_space_view_time_series/src/util.rs @@ -83,7 +83,7 @@ pub fn points_to_series( entity_path: &EntityPath, time_per_pixel: f64, points: Vec, - store: &re_data_store::DataStore, + store: &re_chunk_store::ChunkStore, query: &ViewQuery<'_>, series_name: &re_types::components::Name, aggregator: AggregationPolicy, diff --git a/crates/re_time_panel/Cargo.toml b/crates/re_time_panel/Cargo.toml index ea294406c2f4..dbf56fb817dc 100644 --- a/crates/re_time_panel/Cargo.toml +++ b/crates/re_time_panel/Cargo.toml @@ -20,7 +20,7 @@ all-features = true [dependencies] re_context_menu.workspace = true -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_entity_db.workspace = true re_format.workspace = true diff --git a/crates/re_time_panel/src/data_density_graph.rs b/crates/re_time_panel/src/data_density_graph.rs index e0fc763ea96b..79dc012444f3 100644 --- a/crates/re_time_panel/src/data_density_graph.rs +++ b/crates/re_time_panel/src/data_density_graph.rs @@ -544,9 +544,8 @@ fn show_row_ids_tooltip( ui.label(format!("{num_events} events")); } - let query = re_data_store::LatestAtQuery::new(*time_ctrl.timeline(), time_range.max()); - let ui_layout = UiLayout::Tooltip; + let query = re_chunk_store::LatestAtQuery::new(*time_ctrl.timeline(), time_range.max()); let TimePanelItem { entity_path, diff --git a/crates/re_time_panel/src/lib.rs b/crates/re_time_panel/src/lib.rs index 67b546699e0b..db4743e35b88 100644 --- a/crates/re_time_panel/src/lib.rs +++ b/crates/re_time_panel/src/lib.rs @@ -830,7 +830,7 @@ impl TimePanel { // Conversely, temporal components change over time, and so showing a specific instance here // can be confusing. 
if is_static { - let query = re_data_store::LatestAtQuery::new( + let query = re_chunk_store::LatestAtQuery::new( *time_ctrl.timeline(), TimeInt::MAX, ); @@ -983,8 +983,8 @@ fn collapsed_time_marker_and_time( } let space_needed_for_current_time = match timeline.typ() { - re_data_store::TimeType::Time => 220.0, - re_data_store::TimeType::Sequence => 100.0, + re_chunk_store::TimeType::Time => 220.0, + re_chunk_store::TimeType::Sequence => 100.0, }; { diff --git a/crates/re_time_panel/src/time_axis.rs b/crates/re_time_panel/src/time_axis.rs index 3d9366236ffe..4ca217262090 100644 --- a/crates/re_time_panel/src/time_axis.rs +++ b/crates/re_time_panel/src/time_axis.rs @@ -152,7 +152,7 @@ fn create_ranges(times: &TimeHistogram, gap_threshold: u64) -> vec1::Vec1 vec1::Vec1 { let mut time_histogram = TimeHistogram::default(); diff --git a/crates/re_types/definitions/rerun/datatypes/entity_path.fbs b/crates/re_types/definitions/rerun/datatypes/entity_path.fbs index e0642ec07414..e4fe05de0eea 100644 --- a/crates/re_types/definitions/rerun/datatypes/entity_path.fbs +++ b/crates/re_types/definitions/rerun/datatypes/entity_path.fbs @@ -10,7 +10,7 @@ namespace rerun.datatypes; // --- -/// A path to an entity in the `DataStore`. +/// A path to an entity in the `ChunkStore`. table EntityPath ( "attr.arrow.transparent", "attr.python.aliases": "str", diff --git a/crates/re_types_core/src/datatypes/entity_path.rs b/crates/re_types_core/src/datatypes/entity_path.rs index 815de6c2aa62..ef7a51f9c174 100644 --- a/crates/re_types_core/src/datatypes/entity_path.rs +++ b/crates/re_types_core/src/datatypes/entity_path.rs @@ -22,7 +22,7 @@ use crate::SerializationResult; use crate::{ComponentBatch, MaybeOwnedComponentBatch}; use crate::{DeserializationError, DeserializationResult}; -/// **Datatype**: A path to an entity in the `DataStore`. +/// **Datatype**: A path to an entity in the `ChunkStore`. #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Default)] #[repr(transparent)] pub struct EntityPath(pub crate::ArrowString); diff --git a/crates/re_types_core/src/size_bytes.rs b/crates/re_types_core/src/size_bytes.rs index 71c94385dd6a..277dec0c1189 100644 --- a/crates/re_types_core/src/size_bytes.rs +++ b/crates/re_types_core/src/size_bytes.rs @@ -2,6 +2,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; use std::sync::Arc; use arrow2::datatypes::{DataType, Field}; +use arrow2::types::{NativeType, Offset}; use smallvec::SmallVec; // --- @@ -347,6 +348,27 @@ impl SizeBytes for Box { } } +impl SizeBytes for PrimitiveArray { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} + +impl SizeBytes for ListArray { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} + +impl SizeBytes for StructArray { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} + // --- Arrow estimations --- // The following is a modified version of [1], available under MIT OR Apache-2.0. 
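Note on the hunks above: aside from the new `SizeBytes` impls in `size_bytes.rs`, they are a mechanical rename of the query types from `re_data_store` to `re_chunk_store`. A minimal sketch of the before/after pattern, using only the constructors that appear verbatim in the diff (the wrapper function and variable names are illustrative, not from the codebase):

```rust
use re_chunk_store::{LatestAtQuery, RangeQuery, ResolvedTimeRange};
use re_log_types::{TimeInt, Timeline};

/// Same constructors as before the migration; only the crate path changed
/// (previously `re_data_store::LatestAtQuery` / `re_data_store::RangeQuery`).
fn build_queries(timeline: Timeline, time: TimeInt) -> (LatestAtQuery, RangeQuery) {
    let latest_at = LatestAtQuery::new(timeline, time);
    let range = RangeQuery::new(timeline, ResolvedTimeRange::EVERYTHING);
    (latest_at, range)
}
```

`ResolvedTimeRange` is likewise re-exported from `re_chunk_store`, as the `visualizer_system.rs` hunk earlier in this diff shows.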
diff --git a/crates/re_ui/src/command.rs b/crates/re_ui/src/command.rs index e5cd9dca4cf5..7345659e42e1 100644 --- a/crates/re_ui/src/command.rs +++ b/crates/re_ui/src/command.rs @@ -69,7 +69,7 @@ pub enum UICommand { #[cfg(not(target_arch = "wasm32"))] ScreenshotWholeApp, #[cfg(not(target_arch = "wasm32"))] - PrintDataStore, + PrintChunkStore, #[cfg(not(target_arch = "wasm32"))] PrintBlueprintStore, #[cfg(not(target_arch = "wasm32"))] @@ -210,9 +210,9 @@ impl UICommand { "Copy screenshot of the whole app to clipboard", ), #[cfg(not(target_arch = "wasm32"))] - Self::PrintDataStore => ( + Self::PrintChunkStore => ( "Print datastore", - "Prints the entire data store to the console and clipboard. WARNING: this may be A LOT of text.", + "Prints the entire chunk store to the console and clipboard. WARNING: this may be A LOT of text.", ), #[cfg(not(target_arch = "wasm32"))] Self::PrintBlueprintStore => ( @@ -328,7 +328,7 @@ impl UICommand { #[cfg(not(target_arch = "wasm32"))] Self::ScreenshotWholeApp => None, #[cfg(not(target_arch = "wasm32"))] - Self::PrintDataStore => None, + Self::PrintChunkStore => None, #[cfg(not(target_arch = "wasm32"))] Self::PrintBlueprintStore => None, #[cfg(not(target_arch = "wasm32"))] diff --git a/crates/re_viewer/Cargo.toml b/crates/re_viewer/Cargo.toml index 4d35b9f66f7d..57ecc176b72a 100644 --- a/crates/re_viewer/Cargo.toml +++ b/crates/re_viewer/Cargo.toml @@ -40,9 +40,10 @@ analytics = ["dep:re_analytics"] # Internal: re_build_info.workspace = true re_blueprint_tree.workspace = true +re_chunk.workspace = true re_data_loader.workspace = true re_data_source.workspace = true -re_data_store.workspace = true +re_chunk_store.workspace = true re_data_ui.workspace = true re_edit_ui.workspace = true re_entity_db.workspace = true diff --git a/crates/re_viewer/src/app.rs b/crates/re_viewer/src/app.rs index 66645114b28e..83088b5a2874 100644 --- a/crates/re_viewer/src/app.rs +++ b/crates/re_viewer/src/app.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use re_build_info::CrateVersion; use re_data_source::{DataSource, FileContents}; use re_entity_db::entity_db::EntityDb; @@ -505,8 +507,8 @@ impl App { // to apply updates here, but this needs more validation and testing to be safe. 
if !self.state.app_options.inspect_blueprint_timeline { let blueprint_db = store_hub.entity_db_mut(&blueprint_id); - for row in updates { - match blueprint_db.add_data_row(row) { + for chunk in updates { + match blueprint_db.add_chunk(&Arc::new(chunk)) { Ok(()) => {} Err(err) => { re_log::warn_once!("Failed to store blueprint delta: {err}"); @@ -720,36 +722,19 @@ impl App { self.screenshotter.request_screenshot(egui_ctx); } #[cfg(not(target_arch = "wasm32"))] - UICommand::PrintDataStore => { + UICommand::PrintChunkStore => { if let Some(ctx) = store_context { - let table = ctx.recording.store().to_data_table(); - match table { - Ok(table) => { - let text = format!("{table}"); - egui_ctx.output_mut(|o| o.copied_text = text.clone()); - println!("{text}"); - } - Err(err) => { - println!("{err}"); - } - } + let text = format!("{}", ctx.recording.store()); + egui_ctx.output_mut(|o| o.copied_text = text.clone()); + println!("{text}"); } } #[cfg(not(target_arch = "wasm32"))] UICommand::PrintBlueprintStore => { if let Some(ctx) = store_context { - let table = ctx.blueprint.store().to_data_table(); - match table { - Ok(table) => { - let text = format!("{table}"); - - egui_ctx.output_mut(|o| o.copied_text = text.clone()); - println!("{text}"); - } - Err(err) => { - println!("{err}"); - } - } + let text = format!("{}", ctx.blueprint.store()); + egui_ctx.output_mut(|o| o.copied_text = text.clone()); + println!("{text}"); } } #[cfg(not(target_arch = "wasm32"))] @@ -1908,7 +1893,7 @@ fn save_entity_db( rrd_version: CrateVersion, file_name: String, title: String, - to_log_messages: impl FnOnce() -> re_log_types::DataTableResult>, + to_log_messages: impl FnOnce() -> re_chunk::ChunkResult>, ) -> anyhow::Result<()> { re_tracing::profile_function!(); diff --git a/crates/re_viewer/src/app_blueprint.rs b/crates/re_viewer/src/app_blueprint.rs index bc2217bcec0c..9068b8824314 100644 --- a/crates/re_viewer/src/app_blueprint.rs +++ b/crates/re_viewer/src/app_blueprint.rs @@ -1,7 +1,10 @@ -use re_data_store::LatestAtQuery; +use std::sync::Arc; + +use re_chunk::{Chunk, RowId}; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityDb; -use re_log_types::{DataRow, EntityPath, RowId}; -use re_types::blueprint::components::PanelState; +use re_log_types::EntityPath; +use re_types::{blueprint::components::PanelState, ComponentBatch}; use re_viewer_context::{CommandSender, StoreContext, SystemCommand, SystemCommandSender}; const TOP_PANEL_PATH: &str = "top_panel"; @@ -191,10 +194,14 @@ pub fn setup_welcome_screen_blueprint(welcome_screen_blueprint: &mut EntityDb) { let timepoint = re_viewer_context::blueprint_timepoint_for_writes(welcome_screen_blueprint); - let row = - DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, [value]).unwrap(); // Can only fail if we have the wrong number of instances for the component, and we don't + let chunk = Chunk::builder(entity_path) + .with_component_batches(RowId::new(), timepoint, [&value as &dyn ComponentBatch]) + .build() + .unwrap(); // Can only fail if we have the wrong number of instances for the component, and we don't - welcome_screen_blueprint.add_data_row(row).unwrap(); // Can only fail if we have the wrong number of instances for the component, and we don't + welcome_screen_blueprint + .add_chunk(&Arc::new(chunk)) + .unwrap(); // Can only fail if we have the wrong number of instances for the component, and we don't } } @@ -212,12 +219,14 @@ impl<'a> AppBlueprint<'a> { let timepoint = store_ctx.blueprint_timepoint_for_writes(); - let row = - 
DataRow::from_cells1_sized(RowId::new(), entity_path, timepoint, [value]).unwrap(); // Can only fail if we have the wrong number of instances for the component, and we don't + let chunk = Chunk::builder(entity_path) + .with_component_batches(RowId::new(), timepoint, [&value as &dyn ComponentBatch]) + .build() + .unwrap(); // Can only fail if we have the wrong number of instances for the component, and we don't command_sender.send_system(SystemCommand::UpdateBlueprint( store_ctx.blueprint.store_id().clone(), - vec![row], + vec![chunk], )); } } diff --git a/crates/re_viewer/src/app_state.rs b/crates/re_viewer/src/app_state.rs index d6ac7d5097d7..f743c5cb43ee 100644 --- a/crates/re_viewer/src/app_state.rs +++ b/crates/re_viewer/src/app_state.rs @@ -1,6 +1,6 @@ use ahash::HashMap; -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityDb; use re_log_types::{LogMsg, ResolvedTimeRangeF, StoreId}; use re_smart_channel::ReceiveSet; diff --git a/crates/re_viewer/src/blueprint/validation.rs b/crates/re_viewer/src/blueprint/validation.rs index 18b8b0ca5e12..9535846f583b 100644 --- a/crates/re_viewer/src/blueprint/validation.rs +++ b/crates/re_viewer/src/blueprint/validation.rs @@ -1,11 +1,9 @@ -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityDb; use re_log_types::Timeline; use re_types_core::Component; pub(crate) fn validate_component(blueprint: &EntityDb) -> bool { - let query = LatestAtQuery::latest(Timeline::default()); - if let Some(data_type) = blueprint.data_store().lookup_datatype(&C::name()) { if data_type != &C::arrow_datatype() { // If the schemas don't match, we definitely have a problem @@ -20,13 +18,19 @@ pub(crate) fn validate_component(blueprint: &EntityDb) -> bool { // Otherwise, our usage of serde-fields means we still might have a problem // this can go away once we stop using serde-fields. // Walk the blueprint and see if any cells fail to deserialize for this component type. 
+ let query = LatestAtQuery::latest(Timeline::default()); for path in blueprint.entity_paths() { - if let Some([Some(cell)]) = blueprint - .data_store() - .latest_at(&query, path, C::name(), &[C::name()]) - .map(|(_, _, cells)| cells) + let results = blueprint.query_caches().latest_at( + blueprint.store(), + &query, + path, + [C::name()], + ); + if let Some(array) = results + .get(C::name()) + .and_then(|results| results.raw(blueprint.resolver(), C::name())) { - if let Err(err) = cell.try_to_native_mono::() { + if let Err(err) = C::from_arrow_opt(&*array) { re_log::debug!( "Failed to deserialize component {:?}: {:?}", C::name(), @@ -38,5 +42,6 @@ pub(crate) fn validate_component(blueprint: &EntityDb) -> bool { } } } + true } diff --git a/crates/re_viewer/src/lib.rs b/crates/re_viewer/src/lib.rs index 8757bc7699f6..cbdc55c19d57 100644 --- a/crates/re_viewer/src/lib.rs +++ b/crates/re_viewer/src/lib.rs @@ -31,12 +31,11 @@ pub(crate) use {app_state::AppState, ui::memory_panel}; pub use app::{App, StartupOptions}; pub mod external { - pub use re_data_ui; pub use {eframe, egui}; pub use { - re_data_store, re_data_store::external::*, re_entity_db, re_log, re_log_types, re_memory, - re_query, re_renderer, re_types, re_ui, re_viewer_context, re_viewer_context::external::*, - re_viewport, re_viewport::external::*, + re_chunk, re_chunk::external::*, re_chunk_store, re_chunk_store::external::*, re_data_ui, + re_entity_db, re_log, re_log_types, re_memory, re_query, re_renderer, re_types, re_ui, + re_viewer_context, re_viewer_context::external::*, re_viewport, re_viewport::external::*, }; } diff --git a/crates/re_viewer/src/ui/memory_panel.rs b/crates/re_viewer/src/ui/memory_panel.rs index ffce44ea4d4f..735298cfbdc2 100644 --- a/crates/re_viewer/src/ui/memory_panel.rs +++ b/crates/re_viewer/src/ui/memory_panel.rs @@ -1,5 +1,6 @@ use itertools::Itertools; -use re_data_store::{DataStoreConfig, DataStoreRowStats, DataStoreStats}; + +use re_chunk_store::{ChunkStoreChunkStats, ChunkStoreConfig, ChunkStoreStats}; use re_format::{format_bytes, format_uint}; use re_memory::{util::sec_since_start, MemoryHistory, MemoryLimit, MemoryUse}; use re_query::{CachedComponentStats, CachesStats}; @@ -30,9 +31,15 @@ impl MemoryPanel { (gpu_resource_stats.total_buffer_size_in_bytes + gpu_resource_stats.total_texture_size_in_bytes) as _, ), - store_stats.map(|stats| stats.recording_stats.total.num_bytes as _), + store_stats.map(|stats| { + (stats.recording_stats2.static_chunks.total_size_bytes + + stats.recording_stats2.temporal_chunks.total_size_bytes) as _ + }), store_stats.map(|stats| stats.recording_cached_stats.total_size_bytes() as _), - store_stats.map(|stats| stats.blueprint_stats.total.num_bytes as _), + store_stats.map(|stats| { + (stats.blueprint_stats.static_chunks.total_size_bytes + + stats.blueprint_stats.temporal_chunks.total_size_bytes) as _ + }), ); } @@ -90,10 +97,10 @@ impl MemoryPanel { if let Some(store_stats) = store_stats { ui.separator(); ui.collapsing("Datastore Resources", |ui| { - Self::store_stats( + Self::store_stats2( ui, - &store_stats.recording_config, - &store_stats.recording_stats, + &store_stats.recording_config2, + &store_stats.recording_stats2, ); }); @@ -104,7 +111,7 @@ impl MemoryPanel { ui.separator(); ui.collapsing("Blueprint Resources", |ui| { - Self::store_stats( + Self::store_stats2( ui, &store_stats.blueprint_config, &store_stats.blueprint_stats, @@ -206,88 +213,50 @@ impl MemoryPanel { }); } - fn store_stats( + fn store_stats2( ui: &mut egui::Ui, - store_config: 
&DataStoreConfig, - store_stats: &DataStoreStats, + store_config: &ChunkStoreConfig, + store_stats: &ChunkStoreStats, ) { - egui::Grid::new("store config grid") - .num_columns(3) - .show(ui, |ui| { - ui.label(egui::RichText::new("Limits").italics()); - ui.label("Row limit"); - ui.end_row(); - - let label_rows = |ui: &mut egui::Ui, num_rows| { - if num_rows == u64::MAX { - ui.label("+∞") - } else { - ui.label(re_format::format_uint(num_rows)) - } - }; + // TODO(cmc): this will become useful again once we introduce compaction settings. + _ = store_config; - ui.label("Timeless:"); - label_rows(ui, u64::MAX); - ui.end_row(); - - ui.label("Temporal:"); - label_rows(ui, store_config.indexed_bucket_num_rows); - ui.end_row(); - }); - - ui.separator(); - - egui::Grid::new("store stats grid") + egui::Grid::new("store stats grid 2") .num_columns(3) .show(ui, |ui| { - let DataStoreStats { - type_registry, - metadata_registry, - static_tables, - temporal, - temporal_buckets, - total, + let ChunkStoreStats { + static_chunks, + temporal_chunks, } = *store_stats; ui.label(egui::RichText::new("Stats").italics()); - ui.label("Buckets"); - ui.label("Rows"); - ui.label("Size"); + ui.label("Chunks"); + ui.label("Rows (total)"); + ui.label("Size (total)"); ui.end_row(); - fn label_row_stats(ui: &mut egui::Ui, row_stats: DataStoreRowStats) { - let DataStoreRowStats { - num_rows, - num_bytes, - } = row_stats; + fn label_chunk_stats(ui: &mut egui::Ui, stats: ChunkStoreChunkStats) { + let ChunkStoreChunkStats { + num_chunks, + total_size_bytes, + total_num_rows, + } = stats; - ui.label(re_format::format_uint(num_rows)); - ui.label(re_format::format_bytes(num_bytes as _)); + ui.label(re_format::format_uint(num_chunks)); + ui.label(re_format::format_uint(total_num_rows)); + ui.label(re_format::format_bytes(total_size_bytes as _)); } - ui.label("Type registry:"); - ui.label(""); - label_row_stats(ui, type_registry); - ui.end_row(); - - ui.label("Metadata registry:"); - ui.label(""); - label_row_stats(ui, metadata_registry); - ui.end_row(); - ui.label("Static:"); - ui.label(""); - label_row_stats(ui, static_tables); + label_chunk_stats(ui, static_chunks); ui.end_row(); ui.label("Temporal:"); - ui.label(re_format::format_uint(temporal_buckets)); - label_row_stats(ui, temporal); + label_chunk_stats(ui, temporal_chunks); ui.end_row(); - ui.label("Total"); - ui.label(re_format::format_uint(temporal_buckets)); - label_row_stats(ui, total); + ui.label("Total:"); + label_chunk_stats(ui, static_chunks + temporal_chunks); ui.end_row(); }); } @@ -510,7 +479,7 @@ impl MemoryPanel { plot_ui.line(to_line(resident).name("Resident").width(1.5)); plot_ui.line(to_line(counted).name("Counted").width(1.5)); plot_ui.line(to_line(counted_gpu).name("Counted GPU").width(1.5)); - plot_ui.line(to_line(counted_store).name("Counted store").width(1.5)); + plot_ui.line(to_line(counted_store).name("Counted store 2").width(1.5)); plot_ui.line( to_line(counted_primary_caches) .name("Counted primary caches") @@ -529,7 +498,7 @@ fn summarize_callstack(callstack: &str) -> String { let patterns = [ ("App::receive_messages", "App::receive_messages"), ("w_store::store::ComponentBucket>::archive", "archive"), - ("DataStore>::insert", "DataStore"), + ("ChunkStore>::insert", "ChunkStore"), ("EntityDb", "EntityDb"), ("EntityDb", "EntityDb"), ("EntityTree", "EntityTree"), diff --git a/crates/re_viewer_context/Cargo.toml b/crates/re_viewer_context/Cargo.toml index ea2487ab43a5..7320d17fe531 100644 --- a/crates/re_viewer_context/Cargo.toml +++ 
b/crates/re_viewer_context/Cargo.toml @@ -19,8 +19,9 @@ workspace = true all-features = true [dependencies] +re_chunk.workspace = true re_data_source.workspace = true -re_data_store.workspace = true +re_chunk_store.workspace = true re_entity_db = { workspace = true, features = ["serde"] } re_error.workspace = true re_format.workspace = true diff --git a/crates/re_viewer_context/src/annotations.rs b/crates/re_viewer_context/src/annotations.rs index 82a6c215be54..28d8331347e0 100644 --- a/crates/re_viewer_context/src/annotations.rs +++ b/crates/re_viewer_context/src/annotations.rs @@ -3,9 +3,9 @@ use std::{collections::BTreeMap, sync::Arc}; use ahash::HashMap; use nohash_hasher::IntSet; -use re_data_store::LatestAtQuery; +use re_chunk::RowId; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityPath; -use re_log_types::RowId; use re_query::LatestAtMonoResult; use re_types::components::AnnotationContext; use re_types::datatypes::{AnnotationInfo, ClassDescription, ClassId, KeypointId, Utf8}; diff --git a/crates/re_viewer_context/src/blueprint_helpers.rs b/crates/re_viewer_context/src/blueprint_helpers.rs index e6ccbfc0189f..7a02834c1f5e 100644 --- a/crates/re_viewer_context/src/blueprint_helpers.rs +++ b/crates/re_viewer_context/src/blueprint_helpers.rs @@ -1,5 +1,7 @@ -use re_log_types::{DataCell, DataRow, EntityPath, RowId, TimeInt, TimePoint, Timeline}; -use re_types::{external::arrow2, AsComponents, ComponentBatch, ComponentName}; +use re_chunk::{ArrowArray, RowId}; +use re_chunk_store::external::re_chunk::Chunk; +use re_log_types::{EntityPath, TimeInt, TimePoint, Timeline}; +use re_types::{AsComponents, ComponentBatch, ComponentName}; use crate::{StoreContext, SystemCommand, SystemCommandSender as _, ViewerContext}; @@ -31,80 +33,80 @@ impl StoreContext<'_> { } impl ViewerContext<'_> { - pub fn save_blueprint_archetype(&self, entity_path: EntityPath, components: &dyn AsComponents) { + pub fn save_blueprint_archetype( + &self, + entity_path: &EntityPath, + components: &dyn AsComponents, + ) { let timepoint = self.store_context.blueprint_timepoint_for_writes(); - let data_row = - match DataRow::from_archetype(RowId::new(), timepoint.clone(), entity_path, components) - { - Ok(data_cell) => data_cell, - Err(err) => { - re_log::error_once!( - "Failed to create DataRow for blueprint components: {}", - err - ); - return; - } - }; + let chunk = match Chunk::builder(entity_path.clone()) + .with_archetype(RowId::new(), timepoint.clone(), components) + .build() + { + Ok(chunk) => chunk, + Err(err) => { + re_log::error_once!("Failed to create Chunk for blueprint components: {}", err); + return; + } + }; self.command_sender .send_system(SystemCommand::UpdateBlueprint( self.store_context.blueprint.store_id().clone(), - vec![data_row], + vec![chunk], )); } - /// Helper to save a component batch to the blueprint store. 
pub fn save_blueprint_component( &self, entity_path: &EntityPath, - components: &dyn ComponentBatch, + component_batch: &dyn ComponentBatch, ) { - let data_cell = match DataCell::from_component_batch(components) { - Ok(data_cell) => data_cell, + let timepoint = self.store_context.blueprint_timepoint_for_writes(); + + let chunk = match Chunk::builder(entity_path.clone()) + .with_component_batches(RowId::new(), timepoint.clone(), [component_batch]) + .build() + { + Ok(chunk) => chunk, Err(err) => { - re_log::error_once!( - "Failed to create DataCell for blueprint components: {}", - err - ); + re_log::error_once!("Failed to create Chunk for blueprint components: {}", err); return; } }; - self.save_blueprint_data_cell(entity_path, data_cell); + self.command_sender + .send_system(SystemCommand::UpdateBlueprint( + self.store_context.blueprint.store_id().clone(), + vec![chunk], + )); } - /// Helper to save a data cell to the blueprint store. - pub fn save_blueprint_data_cell(&self, entity_path: &EntityPath, mut data_cell: DataCell) { - data_cell.compute_size_bytes(); - + pub fn save_blueprint_array( + &self, + entity_path: &EntityPath, + component_name: ComponentName, + array: Box, + ) { let timepoint = self.store_context.blueprint_timepoint_for_writes(); - re_log::trace!( - "Writing {} components of type {:?} to {:?}", - data_cell.num_instances(), - data_cell.component_name(), - entity_path - ); - - let data_row_result = DataRow::from_cells( - RowId::new(), - timepoint.clone(), - entity_path.clone(), - [data_cell], - ); - - match data_row_result { - Ok(row) => self - .command_sender - .send_system(SystemCommand::UpdateBlueprint( - self.store_context.blueprint.store_id().clone(), - vec![row], - )), + let chunk = match Chunk::builder(entity_path.clone()) + .with_row(RowId::new(), timepoint.clone(), [(component_name, array)]) + .build() + { + Ok(chunk) => chunk, Err(err) => { - re_log::error_once!("Failed to create DataRow for blueprint components: {}", err); + re_log::error_once!("Failed to create Chunk for blueprint components: {}", err); + return; } - } + }; + + self.command_sender + .send_system(SystemCommand::UpdateBlueprint( + self.store_context.blueprint.store_id().clone(), + vec![chunk], + )); } /// Helper to save a component to the blueprint store. @@ -121,7 +123,7 @@ impl ViewerContext<'_> { &self, entity_path: &EntityPath, component_name: ComponentName, - ) -> Option> { + ) -> Option> { self.store_context .default_blueprint .and_then(|default_blueprint| { @@ -143,10 +145,7 @@ impl ViewerContext<'_> { if let Some(default_value) = self.raw_latest_at_in_default_blueprint(entity_path, component_name) { - self.save_blueprint_data_cell( - entity_path, - DataCell::from_arrow(component_name, default_value), - ); + self.save_blueprint_array(entity_path, component_name, default_value); } else { self.save_empty_blueprint_component_by_name(entity_path, component_name); } @@ -160,15 +159,16 @@ impl ViewerContext<'_> { ) { let blueprint = &self.store_context.blueprint; - // Don't do anything if the component does not exist (if we don't the datatype lookup may fail). 
- if !blueprint + let Some(datatype) = blueprint .latest_at(self.blueprint_query, entity_path, [component_name]) - .contains(component_name) - { - return; - } - - let Some(datatype) = blueprint.store().lookup_datatype(&component_name) else { + .get(component_name) + .and_then(|result| { + result + .resolved(blueprint.resolver()) + .map(|array| array.data_type().clone()) + .ok() + }) + else { re_log::error!( "Tried to clear a component with unknown type: {}", component_name @@ -177,17 +177,26 @@ impl ViewerContext<'_> { }; let timepoint = self.store_context.blueprint_timepoint_for_writes(); - let cell = DataCell::from_arrow_empty(component_name, datatype.clone()); - - match DataRow::from_cells1(RowId::new(), entity_path.clone(), timepoint.clone(), cell) { - Ok(row) => self + let chunk = Chunk::builder(entity_path.clone()) + .with_row( + RowId::new(), + timepoint, + [( + component_name, + re_chunk::external::arrow2::array::new_empty_array(datatype), + )], + ) + .build(); + + match chunk { + Ok(chunk) => self .command_sender .send_system(SystemCommand::UpdateBlueprint( blueprint.store_id().clone(), - vec![row], + vec![chunk], )), Err(err) => { - re_log::error_once!("Failed to create DataRow for blueprint component: {}", err); + re_log::error_once!("Failed to create Chunk for blueprint component: {}", err); } } } diff --git a/crates/re_viewer_context/src/command_sender.rs b/crates/re_viewer_context/src/command_sender.rs index 4f18cc74e7f8..98be37d19e57 100644 --- a/crates/re_viewer_context/src/command_sender.rs +++ b/crates/re_viewer_context/src/command_sender.rs @@ -1,5 +1,6 @@ +use re_chunk_store::external::re_chunk::Chunk; use re_data_source::DataSource; -use re_log_types::{DataRow, StoreId}; +use re_log_types::StoreId; use re_ui::{UICommand, UICommandSender}; // ---------------------------------------------------------------------------- @@ -44,7 +45,7 @@ pub enum SystemCommand { /// The [`StoreId`] should generally be the currently selected blueprint /// but is tracked manually to ensure self-consistency if the blueprint /// is both modified and changed in the same frame. - UpdateBlueprint(StoreId, Vec), + UpdateBlueprint(StoreId, Vec), /// Show a timeline of the blueprint data. #[cfg(debug_assertions)] diff --git a/crates/re_viewer_context/src/component_ui_registry.rs b/crates/re_viewer_context/src/component_ui_registry.rs index d88c26e77863..aa7a2f0ba02c 100644 --- a/crates/re_viewer_context/src/component_ui_registry.rs +++ b/crates/re_viewer_context/src/component_ui_registry.rs @@ -1,6 +1,6 @@ use std::collections::BTreeMap; -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::{external::re_query::LatestAtComponentResults, EntityDb, EntityPath}; use re_log::ResultExt; use re_log_types::Instance; @@ -367,7 +367,7 @@ impl ComponentUiRegistry { // Don't use component.raw_instance here since we want to handle the case where there's several // elements differently. // Also, it allows us to slice the array without cloning any elements. - let cell = match component.resolved(db.resolver()) { + let array = match component.resolved(db.resolver()) { re_query::PromiseResult::Pending => { re_log::error_once!("Couldn't get {component_name}: promise still pending"); ui.error_label("pending..."); @@ -385,8 +385,8 @@ impl ComponentUiRegistry { }; // Component ui can only show a single instance. 
- if cell.num_instances() == 0 || (instance.is_all() && cell.num_instances() > 1) { - none_or_many_values_ui(ui, cell.num_instances() as _); + if array.len() == 0 || (instance.is_all() && array.len() > 1) { + none_or_many_values_ui(ui, array.len()); return; } @@ -399,8 +399,8 @@ impl ComponentUiRegistry { // Enforce clamp-to-border semantics. // TODO(andreas): Is that always what we want? - let index = index.clamp(0, (cell.num_instances() as usize).saturating_sub(1)); - let component_raw = cell.as_arrow_ref().sliced(index, 1); + let index = index.clamp(0, array.len().saturating_sub(1)); + let component_raw = array.sliced(index, 1); self.ui_raw( ctx, @@ -541,9 +541,9 @@ impl ComponentUiRegistry { Err(format!("Promise for {component_name} is still pending.")) } } - re_query::PromiseResult::Ready(cell) => { - if !cell.is_empty() { - Ok(cell.to_arrow()) + re_query::PromiseResult::Ready(array) => { + if !array.is_empty() { + Ok(array) } else { create_fallback() } @@ -632,10 +632,7 @@ impl ComponentUiRegistry { if let Some(edit_callback) = editors.get(&component_name) { if let Some(updated) = (*edit_callback)(ctx, ui, raw_current_value) { - ctx.save_blueprint_data_cell( - blueprint_write_path, - re_log_types::DataCell::from_arrow(component_name, updated), - ); + ctx.save_blueprint_array(blueprint_write_path, component_name, updated); } true } else { diff --git a/crates/re_viewer_context/src/gpu_bridge/tensor_to_gpu.rs b/crates/re_viewer_context/src/gpu_bridge/tensor_to_gpu.rs index ec25c9bc3dbc..b47ea5c0910c 100644 --- a/crates/re_viewer_context/src/gpu_bridge/tensor_to_gpu.rs +++ b/crates/re_viewer_context/src/gpu_bridge/tensor_to_gpu.rs @@ -7,7 +7,7 @@ use bytemuck::{allocation::pod_collect_to_vec, cast_slice, Pod}; use egui::util::hash; use wgpu::TextureFormat; -use re_log_types::RowId; +use re_chunk::RowId; use re_renderer::{ pad_rgb_to_rgba, renderer::{ColorMapper, ColormappedTexture, ShaderDecoding}, diff --git a/crates/re_viewer_context/src/item.rs b/crates/re_viewer_context/src/item.rs index 8dcd0cfcfa21..853abf0115a1 100644 --- a/crates/re_viewer_context/src/item.rs +++ b/crates/re_viewer_context/src/item.rs @@ -17,13 +17,13 @@ pub enum Item { /// A recording (or blueprint) StoreId(re_log_types::StoreId), - /// A component of an entity from the data store. + /// A component of an entity from the chunk store. ComponentPath(ComponentPath), /// A space view. SpaceView(SpaceViewId), - /// An entity or instance from the data store. + /// An entity or instance from the chunk store. InstancePath(InstancePath), /// An entity or instance in the context of a space view's data results. @@ -150,7 +150,7 @@ impl Item { /// If the given item refers to the first element of an instance with a single element, resolve to a unindexed entity path. pub fn resolve_mono_instance_path_item( entity_db: &EntityDb, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, item: &Item, ) -> Item { // Resolve to entity path if there's only a single instance. @@ -174,14 +174,14 @@ pub fn resolve_mono_instance_path_item( /// If the given path refers to the first element of an instance with a single element, resolve to a unindexed entity path. 
pub fn resolve_mono_instance_path( entity_db: &EntityDb, - query: &re_data_store::LatestAtQuery, + query: &re_chunk_store::LatestAtQuery, instance: &re_entity_db::InstancePath, ) -> re_entity_db::InstancePath { re_tracing::profile_function!(); if instance.instance.get() == 0 { // NOTE: While we normally frown upon direct queries to the datastore, `all_components` is fine. - let Some(components) = entity_db + let Some(component_names) = entity_db .store() .all_components(&query.timeline(), &instance.entity_path) else { @@ -189,17 +189,18 @@ pub fn resolve_mono_instance_path( return re_entity_db::InstancePath::entity_all(instance.entity_path.clone()); }; - for component in components { + for component_name in component_names { let results = entity_db.query_caches().latest_at( entity_db.store(), query, &instance.entity_path, - [component], + [component_name], ); - if let Some(results) = results.get(component) { - if let re_query::PromiseResult::Ready(cell) = results.resolved(entity_db.resolver()) + if let Some(results) = results.get(component_name) { + if let re_query::PromiseResult::Ready(array) = + results.resolved(entity_db.resolver()) { - if cell.num_instances() > 1 { + if array.len() > 1 { return instance.clone(); } } diff --git a/crates/re_viewer_context/src/lib.rs b/crates/re_viewer_context/src/lib.rs index 51af70e05f5f..83edf449b5b5 100644 --- a/crates/re_viewer_context/src/lib.rs +++ b/crates/re_viewer_context/src/lib.rs @@ -87,7 +87,7 @@ pub use clipboard::Clipboard; pub mod external { pub use nohash_hasher; - pub use {re_data_store, re_entity_db, re_log_types, re_query, re_ui}; + pub use {re_chunk_store, re_entity_db, re_log_types, re_query, re_ui}; } // --------------------------------------------------------------------------- diff --git a/crates/re_viewer_context/src/query_context.rs b/crates/re_viewer_context/src/query_context.rs index 531484128b4a..f4fc0e7227d2 100644 --- a/crates/re_viewer_context/src/query_context.rs +++ b/crates/re_viewer_context/src/query_context.rs @@ -31,7 +31,7 @@ pub struct QueryContext<'a> { pub archetype_name: Option, /// Query which didn't yield a result for the component at the target entity path. - pub query: &'a re_data_store::LatestAtQuery, + pub query: &'a re_chunk_store::LatestAtQuery, /// The view state of the view in which the query is executed. pub view_state: &'a dyn SpaceViewState, diff --git a/crates/re_viewer_context/src/query_range.rs b/crates/re_viewer_context/src/query_range.rs index d81f31401b4b..2fcc1c89b7d3 100644 --- a/crates/re_viewer_context/src/query_range.rs +++ b/crates/re_viewer_context/src/query_range.rs @@ -1,4 +1,4 @@ -/// Range & type of data store query. +/// Range & type of chunk store query. #[derive(Debug, Clone, PartialEq, Eq, Default)] pub enum QueryRange { /// Use a time range on the currently active timeline. 
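The blueprint-write hunks above (`app_blueprint.rs`, `blueprint_helpers.rs`, `command_sender.rs`) all follow the same shape: build a single-row `Chunk` instead of a `DataRow`/`DataCell`, then either add it to an `EntityDb` or send it to the blueprint store. A condensed sketch of that pattern, assembled from calls that appear in the diff (the helper name, the `&mut EntityDb` receiver, and the `anyhow` error handling are assumptions for illustration):

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId};
use re_entity_db::EntityDb;
use re_log_types::{EntityPath, TimePoint};
use re_types::ComponentBatch;

/// Writes one component batch as a single-row chunk, mirroring the new
/// `save_blueprint_component` / `add_chunk` path in the hunks above.
fn write_component_batch(
    db: &mut EntityDb,
    entity_path: EntityPath,
    timepoint: TimePoint,
    batch: &dyn ComponentBatch,
) -> anyhow::Result<()> {
    let chunk = Chunk::builder(entity_path)
        .with_component_batches(RowId::new(), timepoint, [batch])
        .build()?;
    db.add_chunk(&Arc::new(chunk))?;
    Ok(())
}
```

Inside the viewer the chunk is not written directly; it is sent as `SystemCommand::UpdateBlueprint(store_id, vec![chunk])`, whose payload is now a `Vec` of chunks rather than data rows (see the `command_sender.rs` hunk above).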
diff --git a/crates/re_viewer_context/src/space_view/space_view_class_registry.rs b/crates/re_viewer_context/src/space_view/space_view_class_registry.rs index a0ee01f74d2a..4b9866a2eeb5 100644 --- a/crates/re_viewer_context/src/space_view/space_view_class_registry.rs +++ b/crates/re_viewer_context/src/space_view/space_view_class_registry.rs @@ -1,7 +1,7 @@ use ahash::{HashMap, HashSet}; use itertools::Itertools as _; -use re_data_store::DataStore; +use re_chunk_store::{ChunkStore, ChunkStoreSubscriberHandle}; use re_types::SpaceViewClassIdentifier; use crate::{ @@ -100,7 +100,7 @@ impl SpaceViewSystemRegistrator<'_> { .visualizers .entry(T::identifier()) .or_insert_with(|| { - let entity_subscriber_handle = DataStore::register_subscriber(Box::new( + let entity_subscriber_handle = ChunkStore::register_subscriber(Box::new( VisualizerEntitySubscriber::new(&T::default()), )); @@ -156,13 +156,13 @@ struct VisualizerTypeRegistryEntry { used_by: HashSet, /// Handle to subscription of [`VisualizerEntitySubscriber`] for this visualizer. - entity_subscriber_handle: re_data_store::StoreSubscriberHandle, + entity_subscriber_handle: ChunkStoreSubscriberHandle, } impl Drop for VisualizerTypeRegistryEntry { fn drop(&mut self) { - // TODO(andreas): DataStore unsubscribe is not yet implemented! - //DataStore::unregister_subscriber(self.entity_subscriber_handle); + // TODO(andreas): ChunkStore unsubscribe is not yet implemented! + //ChunkStore::unregister_subscriber(self.entity_subscriber_handle); } } @@ -297,7 +297,7 @@ impl SpaceViewClassRegistry { .map(|(id, entry)| { ( *id, - DataStore::with_subscriber::( + ChunkStore::with_subscriber::( entry.entity_subscriber_handle, |subscriber| subscriber.applicable_entities(store_id).cloned(), ) @@ -322,7 +322,7 @@ impl SpaceViewClassRegistry { .map(|(id, entry)| { ( *id, - DataStore::with_subscriber::( + ChunkStore::with_subscriber::( entry.entity_subscriber_handle, |subscriber| subscriber.indicated_entities(store_id).cloned(), ) diff --git a/crates/re_viewer_context/src/space_view/view_context.rs b/crates/re_viewer_context/src/space_view/view_context.rs index c009a6727b2d..a242c43caf61 100644 --- a/crates/re_viewer_context/src/space_view/view_context.rs +++ b/crates/re_viewer_context/src/space_view/view_context.rs @@ -1,7 +1,8 @@ use std::sync::Arc; -use re_data_store::LatestAtQuery; -use re_log_types::{DataCell, EntityPath, TimePoint}; +use re_chunk::ArrowArray; +use re_chunk_store::LatestAtQuery; +use re_log_types::{EntityPath, TimePoint}; use re_types::{AsComponents, ComponentBatch, ComponentName}; use crate::{DataQueryResult, DataResult, QueryContext, SpaceViewId}; @@ -44,9 +45,9 @@ impl<'a> ViewContext<'a> { self.viewer_ctx.recording() } - /// The data store of the active recording. + /// The chunk store of the active recording. #[inline] - pub fn recording_store(&self) -> &re_data_store::DataStore { + pub fn recording_store(&self) -> &re_chunk_store::ChunkStore { self.viewer_ctx.recording_store() } @@ -81,7 +82,7 @@ impl<'a> ViewContext<'a> { /// The current time query, based on the current time control. 
#[inline] - pub fn current_query(&self) -> re_data_store::LatestAtQuery { + pub fn current_query(&self) -> LatestAtQuery { self.viewer_ctx.current_query() } @@ -101,25 +102,34 @@ impl<'a> ViewContext<'a> { } #[inline] - pub fn save_blueprint_archetype(&self, entity_path: EntityPath, components: &dyn AsComponents) { + pub fn save_blueprint_array( + &self, + entity_path: &EntityPath, + component_name: ComponentName, + array: Box, + ) { self.viewer_ctx - .save_blueprint_archetype(entity_path, components); + .save_blueprint_array(entity_path, component_name, array); } #[inline] - pub fn save_blueprint_component( + pub fn save_blueprint_archetype( &self, entity_path: &EntityPath, - components: &dyn ComponentBatch, + components: &dyn AsComponents, ) { self.viewer_ctx - .save_blueprint_component(entity_path, components); + .save_blueprint_archetype(entity_path, components); } #[inline] - pub fn save_blueprint_data_cell(&self, entity_path: &EntityPath, data_cell: DataCell) { + pub fn save_blueprint_component( + &self, + entity_path: &EntityPath, + components: &dyn ComponentBatch, + ) { self.viewer_ctx - .save_blueprint_data_cell(entity_path, data_cell); + .save_blueprint_component(entity_path, components); } #[inline] diff --git a/crates/re_viewer_context/src/space_view/view_context_system.rs b/crates/re_viewer_context/src/space_view/view_context_system.rs index f0e6f899b5d4..e6b8ce8ed926 100644 --- a/crates/re_viewer_context/src/space_view/view_context_system.rs +++ b/crates/re_viewer_context/src/space_view/view_context_system.rs @@ -21,7 +21,7 @@ pub trait ViewContextSystem: Send + Sync { /// do so, see [`crate::SpaceViewSystemRegistrator`]. fn compatible_component_sets(&self) -> Vec; - /// Queries the data store and performs data conversions to make it ready for consumption by scene elements. + /// Queries the chunk store and performs data conversions to make it ready for consumption by scene elements. fn execute(&mut self, ctx: &ViewContext<'_>, query: &ViewQuery<'_>); /// Converts itself to a reference of [`std::any::Any`], which enables downcasting to concrete types. diff --git a/crates/re_viewer_context/src/space_view/view_query.rs b/crates/re_viewer_context/src/space_view/view_query.rs index c1d3b4f97269..aaab6eb65a16 100644 --- a/crates/re_viewer_context/src/space_view/view_query.rs +++ b/crates/re_viewer_context/src/space_view/view_query.rs @@ -4,7 +4,7 @@ use itertools::Itertools; use nohash_hasher::IntMap; use smallvec::SmallVec; -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::{EntityPath, TimeInt, Timeline}; use re_log_types::StoreKind; use re_types::ComponentName; @@ -17,7 +17,7 @@ use crate::{ /// Path to a specific entity in a specific store used for overrides. #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct OverridePath { - // NOTE: StoreKind is easier to work with than a `StoreId`` or full `DataStore` but + // NOTE: StoreKind is easier to work with than a `StoreId`` or full `ChunkStore` but // might still be ambiguous when we have multiple stores active at a time. pub store_kind: StoreKind, pub path: EntityPath, @@ -52,7 +52,7 @@ pub struct PropertyOverrides { /// for properties that apply to the individual entity only. pub individual_override_path: EntityPath, - /// What range is queried on the data store. + /// What range is queried on the chunk store. pub query_range: QueryRange, } @@ -313,7 +313,7 @@ impl DataResult { /// Shorthand for checking for visibility on data overrides. 
/// - /// Note that this won't check if the data store has visibility logged. + /// Note that this won't check if the chunk store has visibility logged. // TODO(#6541): Check the datastore. #[inline] pub fn is_visible(&self, ctx: &ViewerContext<'_>) -> bool { @@ -324,7 +324,7 @@ impl DataResult { /// Shorthand for checking for interactivity on data overrides. /// - /// Note that this won't check if the data store has interactivity logged. + /// Note that this won't check if the chunk store has interactivity logged. // TODO(#6541): Check the datastore. #[inline] pub fn is_interactive(&self, ctx: &ViewerContext<'_>) -> bool { diff --git a/crates/re_viewer_context/src/space_view/visualizer_entity_subscriber.rs b/crates/re_viewer_context/src/space_view/visualizer_entity_subscriber.rs index 3b83a37ddaee..f5891a56f705 100644 --- a/crates/re_viewer_context/src/space_view/visualizer_entity_subscriber.rs +++ b/crates/re_viewer_context/src/space_view/visualizer_entity_subscriber.rs @@ -1,9 +1,8 @@ use ahash::HashMap; use bit_vec::BitVec; -use itertools::Itertools; use nohash_hasher::IntMap; -use re_data_store::StoreSubscriber; +use re_chunk_store::{ChunkStoreDiffKind, ChunkStoreEvent, ChunkStoreSubscriber}; use re_log_types::{EntityPathHash, StoreId}; use re_types::{ComponentName, ComponentNameSet}; @@ -51,14 +50,14 @@ pub trait VisualizerAdditionalApplicabilityFilter: Send + Sync { /// **This implies that the filter does not _need_ to be stateful.** /// It is perfectly fine to return `true` only if something in the diff is regarded as applicable and false otherwise. /// (However, if necessary, the applicability filter *can* keep track of state.) - fn update_applicability(&mut self, _event: &re_data_store::StoreEvent) -> bool; + fn update_applicability(&mut self, _event: &ChunkStoreEvent) -> bool; } struct DefaultVisualizerApplicabilityFilter; impl VisualizerAdditionalApplicabilityFilter for DefaultVisualizerApplicabilityFilter { #[inline] - fn update_applicability(&mut self, _event: &re_data_store::StoreEvent) -> bool { + fn update_applicability(&mut self, _event: &ChunkStoreEvent) -> bool { true } } @@ -126,7 +125,7 @@ impl VisualizerEntitySubscriber { } } -impl StoreSubscriber for VisualizerEntitySubscriber { +impl ChunkStoreSubscriber for VisualizerEntitySubscriber { #[inline] fn name(&self) -> String { self.visualizer.as_str().to_owned() @@ -142,13 +141,13 @@ impl StoreSubscriber for VisualizerEntitySubscriber { self } - fn on_events(&mut self, events: &[re_data_store::StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { re_tracing::profile_function!(self.visualizer); // TODO(andreas): Need to react to store removals as well. As of writing doesn't exist yet. for event in events { - if event.diff.kind != re_data_store::StoreDiffKind::Addition { + if event.diff.kind != ChunkStoreDiffKind::Addition { // Applicability is only additive, don't care about removals. 
continue; } @@ -158,14 +157,13 @@ impl StoreSubscriber for VisualizerEntitySubscriber { .entry(event.store_id.clone()) .or_default(); - let entity_path = &event.diff.entity_path; + let entity_path = event.diff.chunk.entity_path(); // Update indicator component tracking: if self.indicator_components.is_empty() - || self - .indicator_components - .iter() - .any(|component_name| event.diff.cells.keys().contains(component_name)) + || self.indicator_components.iter().any(|component_name| { + event.diff.chunk.components().contains_key(component_name) + }) { store_mapping .indicated_entities @@ -186,8 +184,8 @@ impl StoreSubscriber for VisualizerEntitySubscriber { continue; } - for component_name in event.diff.cells.keys() { - if let Some(index) = self.required_components_indices.get(component_name) { + for component_name in event.diff.chunk.component_names() { + if let Some(index) = self.required_components_indices.get(&component_name) { required_components_bitmap.set(*index, true); } } diff --git a/crates/re_viewer_context/src/space_view/visualizer_system.rs b/crates/re_viewer_context/src/space_view/visualizer_system.rs index 36d30f53dc9b..a563e6fdcb92 100644 --- a/crates/re_viewer_context/src/space_view/visualizer_system.rs +++ b/crates/re_viewer_context/src/space_view/visualizer_system.rs @@ -106,7 +106,7 @@ pub trait VisualizerSystem: Send + Sync + ComponentFallbackProvider + 'static { None } - /// Queries the data store and performs data conversions to make it ready for display. + /// Queries the chunk store and performs data conversions to make it ready for display. /// /// Mustn't query any data outside of the archetype. fn execute( @@ -116,7 +116,7 @@ pub trait VisualizerSystem: Send + Sync + ComponentFallbackProvider + 'static { context_systems: &ViewContextCollection, ) -> Result, SpaceViewSystemExecutionError>; - /// Optionally retrieves a data store reference from the scene element. + /// Optionally retrieves a chunk store reference from the scene element. /// /// This is useful for retrieving data that is common to several visualizers of a [`crate::SpaceViewClass`]. /// For example, if most visualizers produce ui elements, a concrete [`crate::SpaceViewClass`] diff --git a/crates/re_viewer_context/src/store_hub.rs b/crates/re_viewer_context/src/store_hub.rs index ae27b30f4d81..848c5805dce4 100644 --- a/crates/re_viewer_context/src/store_hub.rs +++ b/crates/re_viewer_context/src/store_hub.rs @@ -3,8 +3,7 @@ use ahash::{HashMap, HashMapExt}; use anyhow::Context as _; use itertools::Itertools as _; -use re_data_store::StoreGeneration; -use re_data_store::{DataStoreConfig, DataStoreStats}; +use re_chunk_store::{ChunkStoreConfig, ChunkStoreGeneration, ChunkStoreStats}; use re_entity_db::{EntityDb, StoreBundle}; use re_log_types::{ApplicationId, StoreId, StoreKind}; use re_query::CachesStats; @@ -43,11 +42,11 @@ pub struct StoreHub { active_blueprint_by_app_id: HashMap, store_bundle: StoreBundle, - /// The [`StoreGeneration`] from when the [`EntityDb`] was last saved - blueprint_last_save: HashMap, + /// The [`ChunkStoreGeneration`] from when the [`EntityDb`] was last saved + blueprint_last_save: HashMap, - /// The [`StoreGeneration`] from when the [`EntityDb`] was last garbage collected - blueprint_last_gc: HashMap, + /// The [`ChunkStoreGeneration`] from when the [`EntityDb`] was last garbage collected + blueprint_last_gc: HashMap, } /// Load a blueprint from persisted storage, e.g. disk. 
@@ -72,12 +71,13 @@ pub struct BlueprintPersistence { /// Convenient information used for `MemoryPanel` #[derive(Default)] pub struct StoreHubStats { - pub blueprint_stats: DataStoreStats, - pub blueprint_config: DataStoreConfig, + pub blueprint_stats: ChunkStoreStats, + pub blueprint_cached_stats: CachesStats, + pub blueprint_config: ChunkStoreConfig, - pub recording_stats: DataStoreStats, + pub recording_stats2: ChunkStoreStats, pub recording_cached_stats: CachesStats, - pub recording_config: DataStoreConfig, + pub recording_config2: ChunkStoreConfig, } impl StoreHub { @@ -561,11 +561,9 @@ impl StoreHub { return; // unreachable }; - let store_size_before = - entity_db.store().static_size_bytes() + entity_db.store().temporal_size_bytes(); + let store_size_before = entity_db.store().stats().total().total_size_bytes; entity_db.purge_fraction_of_ram(fraction_to_purge); - let store_size_after = - entity_db.store().static_size_bytes() + entity_db.store().temporal_size_bytes(); + let store_size_after = entity_db.store().stats().total().total_size_bytes; // No point keeping an empty recording around. if entity_db.is_empty() { @@ -734,7 +732,11 @@ impl StoreHub { .and_then(|blueprint_id| self.store_bundle.get(blueprint_id)); let blueprint_stats = blueprint - .map(|entity_db| DataStoreStats::from_store(entity_db.store())) + .map(|entity_db| entity_db.store().stats()) + .unwrap_or_default(); + + let blueprint_cached_stats = blueprint + .map(|entity_db| entity_db.query_caches().stats()) .unwrap_or_default(); let blueprint_config = blueprint @@ -746,24 +748,26 @@ impl StoreHub { .as_ref() .and_then(|rec_id| self.store_bundle.get(rec_id)); - let recording_stats = recording - .map(|entity_db| DataStoreStats::from_store(entity_db.store())) + let recording_stats2 = recording + .map(|entity_db| entity_db.store().stats()) .unwrap_or_default(); let recording_cached_stats = recording .map(|entity_db| entity_db.query_caches().stats()) .unwrap_or_default(); - let recording_config = recording + let recording_config2 = recording .map(|entity_db| entity_db.store().config().clone()) .unwrap_or_default(); StoreHubStats { blueprint_stats, + blueprint_cached_stats, blueprint_config, - recording_stats, + + recording_stats2, recording_cached_stats, - recording_config, + recording_config2, } } } diff --git a/crates/re_viewer_context/src/tensor/tensor_decode_cache.rs b/crates/re_viewer_context/src/tensor/tensor_decode_cache.rs index fc66c5e25d5a..9e70743bdb45 100644 --- a/crates/re_viewer_context/src/tensor/tensor_decode_cache.rs +++ b/crates/re_viewer_context/src/tensor/tensor_decode_cache.rs @@ -1,4 +1,4 @@ -use re_log_types::RowId; +use re_chunk::RowId; use re_types::{ datatypes::TensorData, tensor_data::{DecodedTensor, TensorImageLoadError}, diff --git a/crates/re_viewer_context/src/tensor/tensor_stats_cache.rs b/crates/re_viewer_context/src/tensor/tensor_stats_cache.rs index 11ea8410580d..deb091147ef8 100644 --- a/crates/re_viewer_context/src/tensor/tensor_stats_cache.rs +++ b/crates/re_viewer_context/src/tensor/tensor_stats_cache.rs @@ -1,4 +1,4 @@ -use re_log_types::RowId; +use re_chunk::RowId; use re_types::datatypes::TensorData; use super::TensorStats; diff --git a/crates/re_viewer_context/src/test_context.rs b/crates/re_viewer_context/src/test_context.rs index e011d76eb159..13c1f23a6cd5 100644 --- a/crates/re_viewer_context/src/test_context.rs +++ b/crates/re_viewer_context/src/test_context.rs @@ -3,7 +3,7 @@ use crate::{ SpaceViewClassRegistry, StoreContext, ViewerContext, }; -use re_data_store::LatestAtQuery; 
+use re_chunk_store::LatestAtQuery; use re_entity_db::EntityDb; use re_log_types::{StoreId, StoreKind, Timeline}; diff --git a/crates/re_viewer_context/src/time_control.rs b/crates/re_viewer_context/src/time_control.rs index 0b828bf1a039..1d5a09935a9e 100644 --- a/crates/re_viewer_context/src/time_control.rs +++ b/crates/re_viewer_context/src/time_control.rs @@ -468,8 +468,8 @@ impl TimeControl { } /// Query for latest value at the currently selected time on the currently selected timeline. - pub fn current_query(&self) -> re_data_store::LatestAtQuery { - re_data_store::LatestAtQuery::new( + pub fn current_query(&self) -> re_chunk_store::LatestAtQuery { + re_chunk_store::LatestAtQuery::new( *self.timeline, self.time().map_or(TimeInt::MAX, |t| t.floor()), ) diff --git a/crates/re_viewer_context/src/viewer_context.rs b/crates/re_viewer_context/src/viewer_context.rs index 064037f0328f..7ee6fdc99dc3 100644 --- a/crates/re_viewer_context/src/viewer_context.rs +++ b/crates/re_viewer_context/src/viewer_context.rs @@ -1,7 +1,7 @@ use ahash::HashMap; use parking_lot::RwLock; -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::entity_db::EntityDb; use crate::{ @@ -87,18 +87,18 @@ impl<'a> ViewerContext<'a> { self.store_context.recording } - /// The data store of the active recording. - #[inline] - pub fn recording_store(&self) -> &re_data_store::DataStore { - self.store_context.recording.store() - } - /// The active blueprint. #[inline] pub fn blueprint_db(&self) -> &re_entity_db::EntityDb { self.store_context.blueprint } + /// The chunk store of the active recording. + #[inline] + pub fn recording_store(&self) -> &re_chunk_store::ChunkStore { + self.store_context.recording.store() + } + /// The `StoreId` of the active recording. #[inline] pub fn recording_id(&self) -> &re_log_types::StoreId { @@ -120,7 +120,7 @@ impl<'a> ViewerContext<'a> { } /// The current time query, based on the current time control. 
- pub fn current_query(&self) -> re_data_store::LatestAtQuery { + pub fn current_query(&self) -> re_chunk_store::LatestAtQuery { self.rec_cfg.time_ctrl.read().current_query() } diff --git a/crates/re_viewport_blueprint/Cargo.toml b/crates/re_viewport_blueprint/Cargo.toml index b2c31844090e..90c981d08e66 100644 --- a/crates/re_viewport_blueprint/Cargo.toml +++ b/crates/re_viewport_blueprint/Cargo.toml @@ -19,10 +19,11 @@ workspace = true all-features = true [dependencies] -re_data_store.workspace = true +re_chunk.workspace = true +re_chunk_store.workspace = true re_entity_db.workspace = true -re_log_types.workspace = true re_log.workspace = true +re_log_types.workspace = true re_tracing.workspace = true re_types_blueprint.workspace = true re_types_core.workspace = true diff --git a/crates/re_viewport_blueprint/src/container.rs b/crates/re_viewport_blueprint/src/container.rs index 108c2a157572..08745901d3ae 100644 --- a/crates/re_viewport_blueprint/src/container.rs +++ b/crates/re_viewport_blueprint/src/container.rs @@ -1,10 +1,10 @@ use ahash::HashMap; use egui_tiles::TileId; -use re_data_store::LatestAtQuery; +use re_chunk::{Chunk, LatestAtQuery, RowId}; use re_entity_db::EntityDb; use re_log::ResultExt; -use re_log_types::{DataRow, EntityPath, RowId}; +use re_log_types::EntityPath; use re_types::components::Name; use re_types::{blueprint::components::Visible, Archetype as _}; use re_types_blueprint::blueprint::archetypes as blueprint_archetypes; @@ -171,18 +171,15 @@ impl ContainerBlueprint { ctx.save_empty_blueprint_component::(&id.as_entity_path()); } - let mut deltas = vec![]; - - if let Some(row) = - DataRow::from_archetype(RowId::new(), timepoint.clone(), id.as_entity_path(), &arch) - .warn_on_err_once("Failed to create container blueprint.") + if let Some(chunk) = Chunk::builder(id.as_entity_path()) + .with_archetype(RowId::new(), timepoint.clone(), &arch) + .build() + .warn_on_err_once("Failed to create container blueprint.") { - deltas.push(row); - ctx.command_sender .send_system(SystemCommand::UpdateBlueprint( ctx.store_context.blueprint.store_id().clone(), - deltas, + vec![chunk], )); } } diff --git a/crates/re_viewport_blueprint/src/space_view.rs b/crates/re_viewport_blueprint/src/space_view.rs index 984c28e9dcc0..088a10efa8a5 100644 --- a/crates/re_viewport_blueprint/src/space_view.rs +++ b/crates/re_viewport_blueprint/src/space_view.rs @@ -5,10 +5,10 @@ use itertools::{FoldWhile, Itertools}; use parking_lot::Mutex; use re_types::SpaceViewClassIdentifier; -use crate::{SpaceViewContents, ViewProperty}; -use re_data_store::LatestAtQuery; +use re_chunk::{Chunk, RowId}; +use re_chunk_store::LatestAtQuery; use re_entity_db::{EntityDb, EntityPath}; -use re_log_types::{DataRow, EntityPathSubs, RowId, Timeline}; +use re_log_types::{EntityPathSubs, Timeline}; use re_types::{ blueprint::{ archetypes::{self as blueprint_archetypes}, @@ -24,6 +24,8 @@ use re_viewer_context::{ SystemCommandSender as _, ViewContext, ViewStates, ViewerContext, VisualizerCollection, }; +use crate::{SpaceViewContents, ViewProperty}; + /// A view of a space. /// /// Note: [`SpaceViewBlueprint`] doesn't implement Clone because it stores an internal @@ -53,7 +55,7 @@ pub struct SpaceViewBlueprint { pub defaults_path: EntityPath, /// Pending blueprint writes for nested components from duplicate. 
- pending_writes: Vec, + pending_writes: Vec, } impl SpaceViewBlueprint { @@ -219,10 +221,11 @@ impl SpaceViewBlueprint { let mut deltas = pending_writes.clone(); // Add all the additional components from the archetype - if let Ok(row) = - DataRow::from_archetype(RowId::new(), timepoint.clone(), id.as_entity_path(), &arch) + if let Ok(chunk) = Chunk::builder(id.as_entity_path()) + .with_archetype(RowId::new(), timepoint.clone(), &arch) + .build() { - deltas.push(row); + deltas.push(chunk); } contents.save_to_blueprint_store(ctx); @@ -255,29 +258,35 @@ impl SpaceViewBlueprint { .cloned() .collect(); - if let Ok(row) = DataRow::from_cells( - RowId::new(), - store_context.blueprint_timepoint_for_writes(), - sub_path, - info.components - .keys() - // It's important that we don't include the SpaceViewBlueprint's components - // since those will be updated separately and may contain different data. - .filter(|component| { - *path != current_path - || !blueprint_archetypes::SpaceViewBlueprint::all_components() - .contains(component) - }) - .filter_map(|component| { - blueprint - .store() - .latest_at(query, path, *component, &[*component]) - .and_then(|(_, _, cells)| cells[0].clone()) - }), - ) { - if row.num_cells() > 0 { - pending_writes.push(row); - } + let chunk = Chunk::builder(sub_path) + .with_row( + RowId::new(), + store_context.blueprint_timepoint_for_writes(), + info.components + .keys() + // It's important that we don't include the SpaceViewBlueprint's components + // since those will be updated separately and may contain different data. + .filter(|component| { + *path != current_path + || !blueprint_archetypes::SpaceViewBlueprint::all_components() + .contains(component) + }) + .filter_map(|&component_name| { + let results = blueprint.query_caches().latest_at( + blueprint.store(), + query, + path, + [component_name], + ); + let results = results.get(component_name)?; + let array = results.raw(blueprint.resolver(), component_name); + array.map(|array| (component_name, array)) + }), + ) + .build(); + + if let Ok(chunk) = chunk { + pending_writes.push(chunk); } }); } @@ -387,7 +396,7 @@ impl SpaceViewBlueprint { // Visual time range works with regular overrides for the most part but it's a bit special: // * we need it for all entities unconditionally // * default does not vary per visualizer - // * can't be specified in the data store + // * can't be specified in the chunk store // Here, we query the visual time range that serves as the default for all entities in this space. 
let property = ViewProperty::from_archetype::( @@ -465,17 +474,19 @@ impl SpaceViewBlueprint { #[cfg(test)] mod tests { + use std::collections::HashMap; + + use re_chunk::RowId; use re_entity_db::EntityDb; use re_log_types::{ example_components::{MyColor, MyLabel, MyPoint}, - DataRow, RowId, StoreId, StoreKind, TimePoint, + StoreId, StoreKind, TimePoint, }; use re_types::{ComponentBatch, ComponentName, Loggable as _}; use re_viewer_context::{ test_context::TestContext, ApplicableEntities, DataResult, IndicatedEntities, OverridePath, PerVisualizer, StoreContext, VisualizableEntities, }; - use std::collections::HashMap; use crate::space_view_contents::DataQueryPropertyResolver; @@ -494,14 +505,19 @@ mod tests { .map(Into::into) .collect(); for entity_path in &entity_paths { - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::default(), - entity_path.clone(), - [&[MyPoint::new(1.0, 2.0)] as _], - ) - .unwrap(); - test_ctx.recording_store.add_data_row(row).unwrap(); + let chunk = Chunk::builder(entity_path.clone()) + .with_component_batches( + RowId::new(), + TimePoint::default(), + [&[MyPoint::new(1.0, 2.0)] as _], + ) + .build() + .unwrap(); + + test_ctx + .recording_store + .add_chunk(&Arc::new(chunk)) + .unwrap(); } // All of them are visualizable with some arbitrary visualizer. @@ -712,14 +728,15 @@ mod tests { test_ctx.blueprint_store = EntityDb::new(StoreId::random(StoreKind::Blueprint)); let mut add_to_blueprint = |path: &EntityPath, batch: &dyn ComponentBatch| { - let row = DataRow::from_component_batches( - RowId::new(), - TimePoint::default(), - path.clone(), - std::iter::once(batch), - ) - .unwrap(); - test_ctx.blueprint_store.add_data_row(row).unwrap(); + let chunk = Chunk::builder(path.clone()) + .with_component_batch(RowId::new(), TimePoint::default(), batch as _) + .build() + .unwrap(); + + test_ctx + .blueprint_store + .add_chunk(&Arc::new(chunk)) + .unwrap(); }; // log individual and override components as instructed. diff --git a/crates/re_viewport_blueprint/src/space_view_contents.rs b/crates/re_viewport_blueprint/src/space_view_contents.rs index 9ed12d65fca6..cf4d918e3aeb 100644 --- a/crates/re_viewport_blueprint/src/space_view_contents.rs +++ b/crates/re_viewport_blueprint/src/space_view_contents.rs @@ -3,7 +3,7 @@ use re_types_blueprint::blueprint::components::VisualizerOverrides; use slotmap::SlotMap; use smallvec::SmallVec; -use re_entity_db::{external::re_data_store::LatestAtQuery, EntityDb, EntityTree}; +use re_entity_db::{external::re_chunk_store::LatestAtQuery, EntityDb, EntityTree}; use re_log_types::{ path::RuleEffect, EntityPath, EntityPathFilter, EntityPathRule, EntityPathSubs, Timeline, }; @@ -135,7 +135,7 @@ impl SpaceViewContents { /// update directly to the store. 
pub fn save_to_blueprint_store(&self, ctx: &ViewerContext<'_>) { ctx.save_blueprint_archetype( - self.blueprint_entity_path.clone(), + &self.blueprint_entity_path, &blueprint_archetypes::SpaceViewContents::new( self.entity_path_filter.iter_expressions(), ), @@ -425,20 +425,20 @@ impl DataQueryPropertyResolver<'_> { if let Some(recursive_override_subtree) = blueprint.tree().subtree(&recursive_override_path) { - for component in recursive_override_subtree.entity.components.keys() { - if let Some(component_data) = blueprint - .store() - .latest_at( - blueprint_query, - &recursive_override_path, - *component, - &[*component], - ) - .and_then(|(_, _, cells)| cells[0].clone()) + for &component_name in recursive_override_subtree.entity.components.keys() { + let results = blueprint.query_caches().latest_at( + blueprint.store(), + blueprint_query, + &recursive_override_path, + [component_name], + ); + if let Some(component_data) = results + .get(component_name) + .and_then(|results| results.raw(blueprint.resolver(), component_name)) { if !component_data.is_empty() { recursive_property_overrides.to_mut().insert( - *component, + component_name, OverridePath::blueprint_path(recursive_override_path.clone()), ); } @@ -452,20 +452,20 @@ impl DataQueryPropertyResolver<'_> { if let Some(individual_override_subtree) = blueprint.tree().subtree(&individual_override_path) { - for component in individual_override_subtree.entity.components.keys() { - if let Some(component_data) = blueprint - .store() - .latest_at( - blueprint_query, - &individual_override_path, - *component, - &[*component], - ) - .and_then(|(_, _, cells)| cells[0].clone()) + for &component_name in individual_override_subtree.entity.components.keys() { + let results = blueprint.query_caches().latest_at( + blueprint.store(), + blueprint_query, + &individual_override_path, + [component_name], + ); + if let Some(component_data) = results + .get(component_name) + .and_then(|results| results.raw(blueprint.resolver(), component_name)) { if !component_data.is_empty() { resolved_component_overrides.insert( - *component, + component_name, OverridePath::blueprint_path(individual_override_path.clone()), ); } @@ -552,8 +552,11 @@ impl DataQueryPropertyResolver<'_> { #[cfg(test)] mod tests { + use std::sync::Arc; + + use re_chunk::{Chunk, RowId}; use re_entity_db::EntityDb; - use re_log_types::{example_components::MyPoint, DataRow, RowId, StoreId, TimePoint, Timeline}; + use re_log_types::{example_components::MyPoint, StoreId, TimePoint, Timeline}; use re_viewer_context::{StoreContext, StoreHub, VisualizableEntities}; use super::*; @@ -572,15 +575,12 @@ mod tests { for entity_path in ["parent", "parent/skipped/child1", "parent/skipped/child2"] { let row_id = RowId::new(); let point = MyPoint::new(1.0, 2.0); - let row = DataRow::from_component_batches( - row_id, - timepoint.clone(), - entity_path.into(), - [&[point] as _], - ) - .unwrap(); + let chunk = Chunk::builder(entity_path.into()) + .with_component_batch(row_id, timepoint.clone(), &[point] as _) + .build() + .unwrap(); - recording.add_data_row(row).unwrap(); + recording.add_chunk(&Arc::new(chunk)).unwrap(); } let mut visualizable_entities_for_visualizer_systems = diff --git a/crates/re_viewport_blueprint/src/view_properties.rs b/crates/re_viewport_blueprint/src/view_properties.rs index 4d5e1f5051dd..bb4bddd41299 100644 --- a/crates/re_viewport_blueprint/src/view_properties.rs +++ b/crates/re_viewport_blueprint/src/view_properties.rs @@ -1,4 +1,4 @@ -use re_data_store::LatestAtQuery; +use 
re_chunk_store::LatestAtQuery; use re_entity_db::{external::re_query::LatestAtResults, EntityDb}; use re_log_types::EntityPath; use re_types::{ diff --git a/crates/re_viewport_blueprint/src/viewport_blueprint.rs b/crates/re_viewport_blueprint/src/viewport_blueprint.rs index 888137b6c822..93afa4352511 100644 --- a/crates/re_viewport_blueprint/src/viewport_blueprint.rs +++ b/crates/re_viewport_blueprint/src/viewport_blueprint.rs @@ -7,8 +7,7 @@ use nohash_hasher::IntSet; use re_types::{Archetype as _, SpaceViewClassIdentifier}; use smallvec::SmallVec; -use crate::SpaceViewBlueprint; -use re_data_store::LatestAtQuery; +use re_chunk_store::LatestAtQuery; use re_entity_db::EntityPath; use re_types::blueprint::components::ViewerRecommendationHash; use re_types_blueprint::blueprint::archetypes as blueprint_archetypes; @@ -19,7 +18,7 @@ use re_viewer_context::{ blueprint_id_to_tile_id, ContainerId, Contents, Item, SpaceViewId, ViewerContext, }; -use crate::{container::ContainerBlueprint, TreeAction, VIEWPORT_PATH}; +use crate::{container::ContainerBlueprint, SpaceViewBlueprint, TreeAction, VIEWPORT_PATH}; // ---------------------------------------------------------------------------- diff --git a/crates/rerun/Cargo.toml b/crates/rerun/Cargo.toml index 03ee3a3c5646..fe33618a01e2 100644 --- a/crates/rerun/Cargo.toml +++ b/crates/rerun/Cargo.toml @@ -105,6 +105,7 @@ web_viewer = ["server", "dep:re_web_viewer_server", "re_sdk?/web_viewer"] [dependencies] re_build_info.workspace = true +re_chunk.workspace = true re_crash_handler.workspace = true re_entity_db.workspace = true re_format.workspace = true @@ -117,6 +118,7 @@ re_tracing.workspace = true anyhow.workspace = true document-features.workspace = true itertools.workspace = true +similar-asserts.workspace = true # Optional dependencies: re_analytics = { workspace = true, optional = true } diff --git a/crates/rerun/src/lib.rs b/crates/rerun/src/lib.rs index ce0f8ecf70c9..786c31107d70 100644 --- a/crates/rerun/src/lib.rs +++ b/crates/rerun/src/lib.rs @@ -136,9 +136,10 @@ pub use run::{run, CallSource}; #[cfg(feature = "sdk")] pub use sdk::*; -/// Everything needed to build custom `StoreSubscriber`s. -pub use re_entity_db::external::re_data_store::{ - DataStore, StoreDiff, StoreDiffKind, StoreEvent, StoreGeneration, StoreSubscriber, +/// Everything needed to build custom `ChunkStoreSubscriber`s. +pub use re_entity_db::external::re_chunk_store::{ + ChunkStore, ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent, ChunkStoreGeneration, + ChunkStoreSubscriber, }; /// To register a new external data loader, simply add an executable in your $PATH whose name diff --git a/crates/rerun/src/run.rs b/crates/rerun/src/run.rs index 66e6f895cc33..ae8c04b6e683 100644 --- a/crates/rerun/src/run.rs +++ b/crates/rerun/src/run.rs @@ -1,11 +1,15 @@ -use std::path::{Path, PathBuf}; +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; use anyhow::Context as _; use clap::Subcommand; -use itertools::Itertools; +use itertools::{izip, Itertools}; use re_data_source::DataSource; -use re_log_types::{DataTable, LogMsg, SetStoreInfo}; +use re_log_types::{LogMsg, SetStoreInfo}; +use re_sdk::log::Chunk; use re_smart_channel::{ReceiveSet, Receiver, SmartMessagePayload}; #[cfg(feature = "web_viewer")] @@ -443,13 +447,13 @@ fn initialize_thread_pool(threads_args: i32) { /// /// Returns `Ok(())` if they match, or an error containing a detailed diff otherwise. 
fn run_compare(path_to_rrd1: &Path, path_to_rrd2: &Path, full_dump: bool) -> anyhow::Result<()> { - /// Given a path to an rrd file, builds up a `DataStore` and returns its contents as one big - /// `DataTable`. + /// Given a path to an rrd file, builds up a `ChunkStore` and returns its contents a stream of + /// `Chunk`s. /// /// Fails if there are more than one data recordings present in the rrd file. fn compute_uber_table( path_to_rrd: &Path, - ) -> anyhow::Result<(re_log_types::ApplicationId, re_log_types::DataTable)> { + ) -> anyhow::Result<(re_log_types::ApplicationId, Vec>)> { use re_entity_db::EntityDb; use re_log_types::StoreId; @@ -486,21 +490,25 @@ fn run_compare(path_to_rrd1: &Path, path_to_rrd2: &Path, full_dump: bool) -> any .app_id() .cloned() .unwrap_or_else(re_log_types::ApplicationId::unknown), - store.store().to_data_table()?, + store.store().iter_chunks().map(Arc::clone).collect_vec(), )) } - let (app_id1, table1) = + let (app_id1, chunks1) = compute_uber_table(path_to_rrd1).with_context(|| format!("path: {path_to_rrd1:?}"))?; - let (app_id2, table2) = + let (app_id2, chunks2) = compute_uber_table(path_to_rrd2).with_context(|| format!("path: {path_to_rrd2:?}"))?; if full_dump { println!("{app_id1}"); - println!("{table1}"); + for chunk in &chunks1 { + println!("{chunk}"); + } println!("{app_id2}"); - println!("{table2}"); + for chunk in &chunks2 { + println!("{chunk}"); + } } anyhow::ensure!( @@ -508,7 +516,27 @@ fn run_compare(path_to_rrd1: &Path, path_to_rrd2: &Path, full_dump: bool) -> any "Application IDs do not match: '{app_id1}' vs. '{app_id2}'" ); - re_log_types::DataTable::similar(&table1, &table2) + anyhow::ensure!( + chunks1.len() == chunks2.len(), + "Number of Chunks does not match: '{}' vs. '{}'", + re_format::format_uint(chunks1.len()), + re_format::format_uint(chunks2.len()), + ); + + for (chunk1, chunk2) in izip!(chunks1, chunks2) { + anyhow::ensure!( + re_chunk::Chunk::are_similar(&chunk1, &chunk2), + "Chunks do not match:\n{}", + similar_asserts::SimpleDiff::from_str( + &format!("{chunk1}"), + &format!("{chunk2}"), + "got", + "expected", + ), + ); + } + + Ok(()) } impl PrintCommand { @@ -537,27 +565,27 @@ impl PrintCommand { } LogMsg::ArrowMsg(_row_id, arrow_msg) => { - let mut table = - DataTable::from_arrow_msg(&arrow_msg).context("Decode arrow message")?; + let chunk = match Chunk::from_arrow_msg(&arrow_msg) { + Ok(chunk) => chunk, + Err(err) => { + eprintln!("discarding broken chunk: {err}"); + continue; + } + }; if *verbose { - println!("{table}"); + println!("{chunk}"); } else { - table.compute_all_size_bytes(); - - let column_names = - table.columns.keys().map(|name| name.short_name()).join(" "); - - let entity_paths = if table.col_entity_path.len() == 1 { - format!("{:?}", table.col_entity_path[0]) - } else { - format!("{} different entity paths", table.col_entity_path.len()) - }; + let column_names = chunk + .component_names() + .map(|name| name.short_name()) + .join(" "); println!( - "Table with {} rows ({}) - {entity_paths} - columns: [{column_names}]", - table.num_rows(), - re_format::format_bytes(table.heap_size_bytes() as _), + "Chunk with {} rows ({}) - {:?} - columns: [{column_names}]", + chunk.num_rows(), + re_format::format_bytes(chunk.total_size_bytes() as _), + chunk.entity_path(), ); } } @@ -841,7 +869,6 @@ fn assert_receive_into_entity_db( if let Some(err) = err { anyhow::bail!("data source has disconnected unexpectedly: {err}") } else if let Some(db) = rec { - db.store().sanity_check()?; anyhow::ensure!(0 < num_messages, "No 
messages received"); re_log::info!("Successfully ingested {num_messages} messages."); return Ok(db); diff --git a/docs/content/reference/types/datatypes.md b/docs/content/reference/types/datatypes.md index faf7660ed9a8..24fbb617e855 100644 --- a/docs/content/reference/types/datatypes.md +++ b/docs/content/reference/types/datatypes.md @@ -13,7 +13,7 @@ Data types are the lowest layer of the data model hierarchy. They are re-usable * [`ClassDescription`](datatypes/class_description.md): The description of a semantic Class. * [`ClassDescriptionMapElem`](datatypes/class_description_map_elem.md): A helper type for mapping class IDs to class descriptions. * [`ClassId`](datatypes/class_id.md): A 16-bit ID representing a type of semantic class. -* [`EntityPath`](datatypes/entity_path.md): A path to an entity in the `DataStore`. +* [`EntityPath`](datatypes/entity_path.md): A path to an entity in the `ChunkStore`. * [`Float32`](datatypes/float32.md): A single-precision 32-bit IEEE 754 floating point number. * [`KeypointId`](datatypes/keypoint_id.md): A 16-bit ID representing a type of semantic keypoint within a class. * [`KeypointPair`](datatypes/keypoint_pair.md): A connection between two `Keypoints`. diff --git a/docs/content/reference/types/datatypes/entity_path.md b/docs/content/reference/types/datatypes/entity_path.md index a519dbcf3cc0..c80d5f05a294 100644 --- a/docs/content/reference/types/datatypes/entity_path.md +++ b/docs/content/reference/types/datatypes/entity_path.md @@ -3,7 +3,7 @@ title: "EntityPath" --- -A path to an entity in the `DataStore`. +A path to an entity in the `ChunkStore`. ## Fields diff --git a/examples/rust/custom_data_loader/src/main.rs b/examples/rust/custom_data_loader/src/main.rs index ccdc0216cecf..1faed5d57e64 100644 --- a/examples/rust/custom_data_loader/src/main.rs +++ b/examples/rust/custom_data_loader/src/main.rs @@ -8,7 +8,7 @@ use rerun::{ external::{anyhow, re_build_info, re_data_loader, re_log}, - log::{DataRow, RowId}, + log::{Chunk, RowId}, EntityPath, TimePoint, }; @@ -72,9 +72,11 @@ fn hash_and_log( let entity_path = EntityPath::from_file_path(filepath); let entity_path = format!("{entity_path}/hashed").into(); - let row = DataRow::from_archetype(RowId::new(), TimePoint::default(), entity_path, &doc)?; + let chunk = Chunk::builder(entity_path) + .with_archetype(RowId::new(), TimePoint::default(), &doc) + .build()?; - tx.send(row.into()).ok(); + tx.send(chunk.into()).ok(); Ok(()) } diff --git a/examples/rust/custom_store_subscriber/README.md b/examples/rust/custom_store_subscriber/README.md index 64f48f7fc690..006f763e09ae 100644 --- a/examples/rust/custom_store_subscriber/README.md +++ b/examples/rust/custom_store_subscriber/README.md @@ -12,7 +12,7 @@ tags = ["Store event", "Store diff", "Store subscriber"] -This example demonstrates how to use [`StoreSubscriber`]s and [`StoreEvent`]s to implement both custom secondary indices and trigger systems. +This example demonstrates how to use [`ChunkStoreSubscriber`]s and [`ChunkStoreEvent`]s to implement both custom secondary indices and trigger systems. Usage: ```sh diff --git a/examples/rust/custom_store_subscriber/src/main.rs b/examples/rust/custom_store_subscriber/src/main.rs index bbbf7f58503c..dd4ab8bdfcba 100644 --- a/examples/rust/custom_store_subscriber/src/main.rs +++ b/examples/rust/custom_store_subscriber/src/main.rs @@ -1,4 +1,4 @@ -//! This example demonstrates how to use [`StoreSubscriber`]s and [`StoreEvent`]s to implement both +//! 
This example demonstrates how to use [`ChunkStoreSubscriber`]s and [`ChunkStoreEvent`]s to implement both //! custom secondary indices and trigger systems. //! //! Usage: @@ -13,15 +13,15 @@ use std::collections::BTreeMap; use rerun::{ - external::{anyhow, re_build_info, re_data_store, re_log, re_log_types::ResolvedTimeRange}, + external::{anyhow, re_build_info, re_chunk_store, re_log, re_log_types::ResolvedTimeRange}, time::TimeInt, - ComponentName, EntityPath, StoreEvent, StoreId, StoreSubscriber, Timeline, + ChunkStoreEvent, ChunkStoreSubscriber, ComponentName, EntityPath, StoreId, Timeline, }; fn main() -> anyhow::Result { re_log::setup_logging(); - let _handle = re_data_store::DataStore::register_subscriber(Box::::default()); + let _handle = re_chunk_store::ChunkStore::register_subscriber(Box::::default()); // Could use the returned handle to get a reference to the view if needed. let build_info = re_build_info::build_info!(); @@ -31,19 +31,19 @@ fn main() -> anyhow::Result { // --- -/// A meta [`StoreSubscriber`] that distributes work to our other views. +/// A meta [`ChunkStoreSubscriber`] that distributes work to our other views. /// /// The order is which registered views are executed is undefined: if you rely on a specific order /// of execution between your views, orchestrate it yourself! /// -/// Clears the terminal and resets the cursor for every new batch of [`StoreEvent`]s. +/// Clears the terminal and resets the cursor for every new batch of [`ChunkStoreEvent`]s. #[derive(Default)] struct Orchestrator { components_per_recording: ComponentsPerRecording, time_ranges_per_entity: TimeRangesPerEntity, } -impl StoreSubscriber for Orchestrator { +impl ChunkStoreSubscriber for Orchestrator { fn name(&self) -> String { "rerun.store_subscriber.ScreenClearer".into() } @@ -56,7 +56,7 @@ impl StoreSubscriber for Orchestrator { self } - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { print!("\x1B[2J\x1B[1;1H"); // terminal clear + cursor reset self.components_per_recording.on_events(events); @@ -66,19 +66,19 @@ impl StoreSubscriber for Orchestrator { // --- -/// A [`StoreSubscriber`] that maintains a secondary index that keeps count of the number of occurrences -/// of each component in each [`rerun::DataStore`]. +/// A [`ChunkStoreSubscriber`] that maintains a secondary index that keeps count of the number of occurrences +/// of each component in each [`rerun::ChunkStore`]. /// /// It also implements a trigger that prints to the console each time a component is first introduced /// and retired. /// -/// For every [`StoreEvent`], it displays the state of the secondary index to the terminal. +/// For every [`ChunkStoreEvent`], it displays the state of the secondary index to the terminal. 
#[derive(Default, Debug, PartialEq, Eq)] struct ComponentsPerRecording { counters: BTreeMap>, } -impl StoreSubscriber for ComponentsPerRecording { +impl ChunkStoreSubscriber for ComponentsPerRecording { fn name(&self) -> String { "rerun.store_subscriber.ComponentsPerRecording".into() } @@ -91,11 +91,11 @@ impl StoreSubscriber for ComponentsPerRecording { self } - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { for event in events { // update counters let per_component = self.counters.entry(event.store_id.clone()).or_default(); - for &component_name in event.cells.keys() { + for component_name in event.chunk.component_names() { let count = per_component.entry(component_name).or_default(); // if first occurrence, speak! @@ -135,16 +135,16 @@ impl StoreSubscriber for ComponentsPerRecording { // --- -/// A [`StoreSubscriber`] that maintains a secondary index of the time ranges covered by each entity, -/// on every timeline, across all recordings (i.e. [`rerun::DataStore`]s). +/// A [`ChunkStoreSubscriber`] that maintains a secondary index of the time ranges covered by each entity, +/// on every timeline, across all recordings (i.e. [`rerun::ChunkStore`]s). /// -/// For every [`StoreEvent`], it displays the state of the secondary index to the terminal. +/// For every [`ChunkStoreEvent`], it displays the state of the secondary index to the terminal. #[derive(Default, Debug, PartialEq, Eq)] struct TimeRangesPerEntity { times: BTreeMap>>, } -impl StoreSubscriber for TimeRangesPerEntity { +impl ChunkStoreSubscriber for TimeRangesPerEntity { fn name(&self) -> String { "rerun.store_subscriber.TimeRangesPerEntity".into() } @@ -157,18 +157,23 @@ impl StoreSubscriber for TimeRangesPerEntity { self } - fn on_events(&mut self, events: &[StoreEvent]) { + fn on_events(&mut self, events: &[ChunkStoreEvent]) { for event in events { - for &(timeline, time) in &event.times { - // update counters - let per_timeline = self.times.entry(event.entity_path.clone()).or_default(); - let per_time = per_timeline.entry(timeline).or_default(); - let count = per_time.entry(time).or_default(); - - *count = count.saturating_add_signed(event.delta()); - - if *count == 0 { - per_time.remove(&time); + for (timeline, time_chunk) in event.chunk.timelines() { + for time in time_chunk.times() { + // update counters + let per_timeline = self + .times + .entry(event.chunk.entity_path().clone()) + .or_default(); + let per_time = per_timeline.entry(*timeline).or_default(); + let count = per_time.entry(time).or_default(); + + *count = count.saturating_add_signed(event.delta()); + + if *count == 0 { + per_time.remove(&time); + } } } } diff --git a/examples/rust/dna/src/main.rs b/examples/rust/dna/src/main.rs index fcb65cbbc2b6..43f518f25075 100644 --- a/examples/rust/dna/src/main.rs +++ b/examples/rust/dna/src/main.rs @@ -12,7 +12,8 @@ use rerun::{ const NUM_POINTS: usize = 100; fn main() -> Result<(), Box> { - let rec = rerun::RecordingStreamBuilder::new("rerun_example_dna_abacus").spawn()?; + let rec = rerun::RecordingStreamBuilder::new("rerun_example_dna_abacus") + .save("/tmp/helix_chunks.rrd")?; let (points1, colors1) = color_spiral(NUM_POINTS, 2.0, 0.02, 0.0, 0.1); let (points2, colors2) = color_spiral(NUM_POINTS, 2.0, 0.02, TAU * 0.5, 0.1); diff --git a/examples/rust/extend_viewer_ui/src/main.rs b/examples/rust/extend_viewer_ui/src/main.rs index 06190693b9b0..a7eb54f07476 100644 --- a/examples/rust/extend_viewer_ui/src/main.rs +++ b/examples/rust/extend_viewer_ui/src/main.rs 
@@ -1,7 +1,7 @@ //! This example shows how to wrap the Rerun Viewer in your own GUI. use re_viewer::external::{ - arrow2, eframe, egui, re_data_store, re_entity_db, re_log, re_log_types, re_memory, re_types, + arrow2, eframe, egui, re_chunk_store, re_entity_db, re_log, re_log_types, re_memory, re_types, }; // By using `re_memory::AccountingAllocator` Rerun can keep track of exactly how much memory it is using, @@ -148,7 +148,7 @@ fn component_ui( ) { // You can query the data for any time point, but for now // just show the last value logged for each component: - let query = re_data_store::LatestAtQuery::latest(timeline); + let query = re_chunk_store::LatestAtQuery::latest(timeline); let results = entity_db.query_caches().latest_at( entity_db.store(), diff --git a/rerun_cpp/src/rerun/datatypes/entity_path.hpp b/rerun_cpp/src/rerun/datatypes/entity_path.hpp index 05e9f35e74dd..b57a5b68b13a 100644 --- a/rerun_cpp/src/rerun/datatypes/entity_path.hpp +++ b/rerun_cpp/src/rerun/datatypes/entity_path.hpp @@ -17,7 +17,7 @@ namespace arrow { } // namespace arrow namespace rerun::datatypes { - /// **Datatype**: A path to an entity in the `DataStore`. + /// **Datatype**: A path to an entity in the `ChunkStore`. struct EntityPath { std::string path; diff --git a/rerun_py/rerun_sdk/rerun/datatypes/entity_path.py b/rerun_py/rerun_sdk/rerun/datatypes/entity_path.py index 6b6da2e4dc3b..5b3669ab906c 100644 --- a/rerun_py/rerun_sdk/rerun/datatypes/entity_path.py +++ b/rerun_py/rerun_sdk/rerun/datatypes/entity_path.py @@ -20,7 +20,7 @@ @define(init=False) class EntityPath: - """**Datatype**: A path to an entity in the `DataStore`.""" + """**Datatype**: A path to an entity in the `ChunkStore`.""" def __init__(self: Any, path: EntityPathLike): """Create a new instance of the EntityPath datatype.""" diff --git a/rerun_py/src/arrow.rs b/rerun_py/src/arrow.rs index 88a805db9908..0d7641c02d13 100644 --- a/rerun_py/src/arrow.rs +++ b/rerun_py/src/arrow.rs @@ -5,8 +5,8 @@ use pyo3::{ exceptions::PyValueError, ffi::Py_uintptr_t, types::PyDict, types::PyString, PyAny, PyResult, }; -use re_chunk::PendingRow; -use re_log_types::{RowId, TimePoint}; +use re_chunk::{PendingRow, RowId}; +use re_log_types::TimePoint; /// Perform conversion between a pyarrow array to arrow2 types. ///
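For readers skimming this migration, here is a minimal, illustrative sketch (not part of the patch) of the chunk-based ingestion pattern the hunks above converge on: build a `Chunk` with `Chunk::builder`, then hand it to the store via `add_chunk`. The entity path, component values, and helper function name are made up for illustration; the API calls mirror the ones visible in the diffs above (`with_component_batch`, `build`, `add_chunk`).

```rust
use std::sync::Arc;

use re_chunk::{Chunk, RowId};
use re_entity_db::EntityDb;
use re_log_types::{example_components::MyPoint, StoreId, StoreKind, TimePoint};

/// Illustrative only: logs one point the "chunk way" instead of via the old `DataRow` path.
fn log_one_point(db: &mut EntityDb) {
    // One row, one component batch, keyed by a fresh `RowId` -- this replaces the old
    // `DataRow::from_component_batches` + `add_data_row` pattern seen in the removed lines.
    let chunk = Chunk::builder("illustrative/points".into())
        .with_component_batch(
            RowId::new(),
            TimePoint::default(),
            &[MyPoint::new(1.0, 2.0)] as _,
        )
        .build()
        .unwrap();

    // The store now ingests whole chunks rather than individual rows.
    db.add_chunk(&Arc::new(chunk)).unwrap();
}

fn main() {
    let mut db = EntityDb::new(StoreId::random(StoreKind::Recording));
    log_one_point(&mut db);
}
```

The same builder also offers `with_archetype` and `with_row`, as used in the blueprint hunks above, so callers can batch several rows into a single chunk before ingestion.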