diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 5b7d0e5e7ac7..49eaecbb22c3 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -39,7 +39,7 @@ NOTE: `.rrd` files do not yet guarantee any backwards or forwards compatibility.
## Technologies we use
### Apache Arrow
-[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data. We use it to encode the log data when transmitting it over the network or storing it in an `.rrd` file. We also use it in our in-RAM data store, [`re_data_store`](crates/re_data_store/README.md).
+[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data. We use it to encode the log data when transmitting it over the network or storing it in an `.rrd` file. We also use it in our in-RAM data store, [`re_chunk_store`](crates/re_chunk_store/README.md).
In Rust, we use the [`arrow2` crate](https://crates.io/crates/arrow2).
@@ -88,11 +88,11 @@ Of course, this will only take us so far. In the future we plan on caching queri
Here is an overview of the crates included in the project:
@@ -160,7 +160,7 @@ Update instructions:
| Crate | Description |
|----------------------|--------------------------------------------------------------------------|
| re_entity_db | In-memory storage of Rerun entities |
-| re_query | Querying data in the re_data_store |
+| re_query | Querying data in the re_chunk_store |
| re_types | The built-in Rerun data types, component types, and archetypes. |
| re_types_blueprint | The core traits and types that power Rerun's Blueprint sub-system. |
| re_log_encoding | Helpers for encoding and transporting Rerun log messages |
@@ -171,7 +171,7 @@ Update instructions:
| Crate | Description |
|-----------------|-----------------------------------------------------------------------------------------------|
| re_chunk | A chunk of Rerun data, encoded using Arrow. Used for logging, transport, storage and compute. |
-| re_data_store | An in-memory time series database for Rerun log data, based on Apache Arrow. |
+| re_chunk_store | An in-memory time series database for Rerun log data, based on Apache Arrow. |
| re_log_types | The basic building blocks of the Rerun data types and tables. |
| re_types_core | The core traits and types that power Rerun's data model. |
| re_format_arrow | Formatting of Apache Arrow tables. |
diff --git a/Cargo.lock b/Cargo.lock
index 9502048a834c..ba77905d55b0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4305,12 +4305,46 @@ dependencies = [
"re_tracing",
"re_tuid",
"re_types_core",
+ "serde",
+ "serde_bytes",
"similar-asserts",
"smallvec",
"static_assertions",
"thiserror",
]
+[[package]]
+name = "re_chunk_store"
+version = "0.17.0-alpha.9"
+dependencies = [
+ "ahash",
+ "anyhow",
+ "criterion",
+ "document-features",
+ "indent",
+ "insta",
+ "itertools 0.13.0",
+ "mimalloc",
+ "nohash-hasher",
+ "once_cell",
+ "parking_lot",
+ "rand",
+ "re_arrow2",
+ "re_chunk",
+ "re_format",
+ "re_format_arrow",
+ "re_log",
+ "re_log_types",
+ "re_tracing",
+ "re_types",
+ "re_types_core",
+ "similar-asserts",
+ "smallvec",
+ "thiserror",
+ "tinyvec",
+ "web-time",
+]
+
[[package]]
name = "re_context_menu"
version = "0.17.0-alpha.9"
@@ -4356,6 +4390,7 @@ dependencies = [
"rayon",
"re_build_info",
"re_build_tools",
+ "re_chunk",
"re_log",
"re_log_encoding",
"re_log_types",
@@ -4383,37 +4418,6 @@ dependencies = [
"re_ws_comms",
]
-[[package]]
-name = "re_data_store"
-version = "0.17.0-alpha.9"
-dependencies = [
- "ahash",
- "anyhow",
- "criterion",
- "document-features",
- "indent",
- "insta",
- "itertools 0.13.0",
- "mimalloc",
- "nohash-hasher",
- "once_cell",
- "parking_lot",
- "rand",
- "re_arrow2",
- "re_format",
- "re_format_arrow",
- "re_log",
- "re_log_types",
- "re_tracing",
- "re_types",
- "re_types_core",
- "similar-asserts",
- "smallvec",
- "thiserror",
- "tinyvec",
- "web-time",
-]
-
[[package]]
name = "re_data_ui"
version = "0.17.0-alpha.9"
@@ -4426,7 +4430,7 @@ dependencies = [
"egui_plot",
"image",
"itertools 0.13.0",
- "re_data_store",
+ "re_chunk_store",
"re_entity_db",
"re_error",
"re_format",
@@ -4498,7 +4502,8 @@ dependencies = [
"parking_lot",
"rand",
"re_build_info",
- "re_data_store",
+ "re_chunk",
+ "re_chunk_store",
"re_format",
"re_int_histogram",
"re_log",
@@ -4574,6 +4579,7 @@ dependencies = [
"mimalloc",
"parking_lot",
"re_build_info",
+ "re_chunk",
"re_log",
"re_log_types",
"re_smart_channel",
@@ -4666,7 +4672,8 @@ dependencies = [
"paste",
"rand",
"re_arrow2",
- "re_data_store",
+ "re_chunk",
+ "re_chunk_store",
"re_error",
"re_format",
"re_log",
@@ -4770,8 +4777,8 @@ dependencies = [
"re_build_info",
"re_build_tools",
"re_chunk",
+ "re_chunk_store",
"re_data_loader",
- "re_data_store",
"re_log",
"re_log_encoding",
"re_log_types",
@@ -4811,8 +4818,9 @@ dependencies = [
"itertools 0.13.0",
"nohash-hasher",
"once_cell",
+ "re_chunk",
+ "re_chunk_store",
"re_context_menu",
- "re_data_store",
"re_data_ui",
"re_entity_db",
"re_log",
@@ -4850,7 +4858,7 @@ dependencies = [
"ahash",
"egui",
"nohash-hasher",
- "re_data_store",
+ "re_chunk_store",
"re_entity_db",
"re_log",
"re_log_types",
@@ -4868,7 +4876,7 @@ version = "0.17.0-alpha.9"
dependencies = [
"egui",
"egui_plot",
- "re_data_store",
+ "re_chunk_store",
"re_entity_db",
"re_log",
"re_log_types",
@@ -4887,7 +4895,7 @@ version = "0.17.0-alpha.9"
dependencies = [
"egui",
"egui_extras",
- "re_data_store",
+ "re_chunk_store",
"re_data_ui",
"re_entity_db",
"re_log_types",
@@ -4914,7 +4922,7 @@ dependencies = [
"mimalloc",
"nohash-hasher",
"once_cell",
- "re_data_store",
+ "re_chunk_store",
"re_data_ui",
"re_entity_db",
"re_error",
@@ -4944,7 +4952,7 @@ dependencies = [
"egui",
"half 2.3.1",
"ndarray",
- "re_data_store",
+ "re_chunk_store",
"re_data_ui",
"re_entity_db",
"re_log",
@@ -4967,7 +4975,7 @@ version = "0.17.0-alpha.9"
dependencies = [
"egui",
"egui_commonmark",
- "re_data_store",
+ "re_chunk_store",
"re_renderer",
"re_space_view",
"re_tracing",
@@ -4982,7 +4990,7 @@ version = "0.17.0-alpha.9"
dependencies = [
"egui",
"egui_extras",
- "re_data_store",
+ "re_chunk_store",
"re_data_ui",
"re_entity_db",
"re_log",
@@ -5004,7 +5012,7 @@ dependencies = [
"egui_plot",
"itertools 0.13.0",
"rayon",
- "re_data_store",
+ "re_chunk_store",
"re_format",
"re_log",
"re_log_types",
@@ -5036,8 +5044,8 @@ version = "0.17.0-alpha.9"
dependencies = [
"egui",
"itertools 0.13.0",
+ "re_chunk_store",
"re_context_menu",
- "re_data_store",
"re_data_ui",
"re_entity_db",
"re_format",
@@ -5220,9 +5228,10 @@ dependencies = [
"re_blueprint_tree",
"re_build_info",
"re_build_tools",
+ "re_chunk",
+ "re_chunk_store",
"re_data_loader",
"re_data_source",
- "re_data_store",
"re_data_ui",
"re_edit_ui",
"re_entity_db",
@@ -5294,8 +5303,9 @@ dependencies = [
"nohash-hasher",
"once_cell",
"parking_lot",
+ "re_chunk",
+ "re_chunk_store",
"re_data_source",
- "re_data_store",
"re_entity_db",
"re_error",
"re_format",
@@ -5354,7 +5364,8 @@ dependencies = [
"nohash-hasher",
"once_cell",
"parking_lot",
- "re_data_store",
+ "re_chunk",
+ "re_chunk_store",
"re_entity_db",
"re_log",
"re_log_types",
@@ -5482,6 +5493,7 @@ dependencies = [
"re_analytics",
"re_build_info",
"re_build_tools",
+ "re_chunk",
"re_crash_handler",
"re_data_source",
"re_entity_db",
@@ -5498,6 +5510,7 @@ dependencies = [
"re_viewer",
"re_web_viewer_server",
"re_ws_comms",
+ "similar-asserts",
]
[[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 542bfa83b1c5..822b1d8c1a5c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ re_build_info = { path = "crates/re_build_info", version = "=0.17.0-alpha.9", de
re_build_tools = { path = "crates/re_build_tools", version = "=0.17.0-alpha.9", default-features = false }
re_case = { path = "crates/re_case", version = "=0.17.0-alpha.9", default-features = false }
re_chunk = { path = "crates/re_chunk", version = "=0.17.0-alpha.9", default-features = false }
+re_chunk_store = { path = "crates/re_chunk_store", version = "=0.17.0-alpha.9", default-features = false }
re_context_menu = { path = "crates/re_context_menu", version = "=0.17.0-alpha.9", default-features = false }
re_crash_handler = { path = "crates/re_crash_handler", version = "=0.17.0-alpha.9", default-features = false }
re_data_loader = { path = "crates/re_data_loader", version = "=0.17.0-alpha.9", default-features = false }
diff --git a/crates/re_analytics/src/event.rs b/crates/re_analytics/src/event.rs
index f6e7fe520896..ad3ee04b08f5 100644
--- a/crates/re_analytics/src/event.rs
+++ b/crates/re_analytics/src/event.rs
@@ -75,7 +75,7 @@ pub struct OpenRecording {
pub data_source: Option<&'static str>,
}
-/// Basic information about a recording's data store.
+/// Basic information about a recording's chunk store.
pub struct StoreInfo {
/// Name of the application.
///
diff --git a/crates/re_chunk/Cargo.toml b/crates/re_chunk/Cargo.toml
index e9f1c56ac4c9..f32fb079e859 100644
--- a/crates/re_chunk/Cargo.toml
+++ b/crates/re_chunk/Cargo.toml
@@ -22,6 +22,16 @@ all-features = true
[features]
default = []
+## Enable (de)serialization using serde.
+serde = [
+ "dep:serde",
+ "dep:serde_bytes",
+ "re_log_types/serde",
+ "re_string_interner/serde",
+ "re_tuid/serde",
+ "re_types_core/serde",
+]
+
[dependencies]
@@ -40,10 +50,8 @@ re_types_core.workspace = true
ahash.workspace = true
anyhow.workspace = true
arrow2 = { workspace = true, features = [
- "io_ipc",
- "io_print",
- "compute_comparison",
"compute_concatenate",
+ "compute_filter",
] }
backtrace.workspace = true
document-features.workspace = true
@@ -55,12 +63,17 @@ smallvec.workspace = true
static_assertions.workspace = true
thiserror.workspace = true
+# Optional dependencies:
+serde = { workspace = true, optional = true, features = ["derive", "rc"] }
+serde_bytes = { workspace = true, optional = true }
+
# Native dependencies:
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
crossbeam.workspace = true
[dev-dependencies]
+re_log = { workspace = true, features = ["setup"] }
criterion.workspace = true
mimalloc.workspace = true
similar-asserts.workspace = true
diff --git a/crates/re_chunk/examples/latest_at.rs b/crates/re_chunk/examples/latest_at.rs
new file mode 100644
index 000000000000..2aed6de34767
--- /dev/null
+++ b/crates/re_chunk/examples/latest_at.rs
@@ -0,0 +1,73 @@
+use re_chunk::{Chunk, LatestAtQuery, RowId, Timeline};
+use re_log_types::example_components::{MyColor, MyLabel, MyPoint};
+use re_types_core::Loggable as _;
+
+// ---
+
+fn main() -> anyhow::Result<()> {
+ let chunk = create_chunk()?;
+
+ eprintln!("Data:\n{chunk}");
+
+ let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 4);
+
+ // Find all relevant data for a query:
+ let chunk = chunk.latest_at(&query, MyPoint::name());
+ eprintln!("{:?} @ {query:?}:\n{chunk}", MyPoint::name());
+
+ // And then slice it as appropriate:
+ let chunk = chunk
+ .timeline_sliced(Timeline::log_time())
+ .component_sliced(MyPoint::name());
+ eprintln!("Sliced down to specific timeline and component:\n{chunk}");
+
+ Ok(())
+}
+
+fn create_chunk() -> anyhow::Result<Chunk> {
+ let mut chunk = Chunk::builder("my/entity".into())
+ .with_component_batches(
+ RowId::new(),
+ [
+ (Timeline::log_time(), 1000),
+ (Timeline::new_sequence("frame"), 1),
+ ],
+ [
+ &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)] as _, //
+ ],
+ )
+ .with_component_batches(
+ RowId::new(),
+ [
+ (Timeline::log_time(), 1032),
+ (Timeline::new_sequence("frame"), 3),
+ ],
+ [
+ &[MyColor::from_rgb(1, 1, 1)] as _, //
+ &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ] as _, //
+ ],
+ )
+ .with_component_batches(
+ RowId::new(),
+ [
+ (Timeline::log_time(), 1064),
+ (Timeline::new_sequence("frame"), 5),
+ ],
+ [
+ &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ] as _, //
+ ],
+ )
+ .build()?;
+
+ chunk.sort_if_unsorted();
+
+ Ok(chunk)
+}
diff --git a/crates/re_chunk/examples/range.rs b/crates/re_chunk/examples/range.rs
new file mode 100644
index 000000000000..822d1901c399
--- /dev/null
+++ b/crates/re_chunk/examples/range.rs
@@ -0,0 +1,79 @@
+use re_chunk::{Chunk, RangeQuery, RowId, Timeline};
+use re_log_types::{
+ example_components::{MyColor, MyLabel, MyPoint},
+ ResolvedTimeRange,
+};
+use re_types_core::Loggable as _;
+
+// ---
+
+fn main() -> anyhow::Result<()> {
+ let chunk = create_chunk()?;
+
+ eprintln!("Data:\n{chunk}");
+
+ let query = RangeQuery::new(
+ Timeline::new_sequence("frame"),
+ ResolvedTimeRange::EVERYTHING,
+ );
+
+ // Find all relevant data for a query:
+ let chunk = chunk.range(&query, MyPoint::name());
+ eprintln!("{:?} @ {query:?}:\n{chunk}", MyPoint::name());
+
+ // And then slice it as appropriate:
+ let chunk = chunk
+ .timeline_sliced(Timeline::log_time())
+ .component_sliced(MyPoint::name());
+ eprintln!("Sliced down to specific timeline and component:\n{chunk}");
+
+ Ok(())
+}
+
+fn create_chunk() -> anyhow::Result<Chunk> {
+ let mut chunk = Chunk::builder("my/entity".into())
+ .with_component_batches(
+ RowId::new(),
+ [
+ (Timeline::log_time(), 1000),
+ (Timeline::new_sequence("frame"), 1),
+ ],
+ [
+ &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)] as _, //
+ ],
+ )
+ .with_component_batches(
+ RowId::new(),
+ [
+ (Timeline::log_time(), 1032),
+ (Timeline::new_sequence("frame"), 3),
+ ],
+ [
+ &[MyColor::from_rgb(1, 1, 1)] as _, //
+ &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ] as _, //
+ ],
+ )
+ .with_component_batches(
+ RowId::new(),
+ [
+ (Timeline::log_time(), 1064),
+ (Timeline::new_sequence("frame"), 5),
+ ],
+ [
+ &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ] as _, //
+ ],
+ )
+ .build()?;
+
+ chunk.sort_if_unsorted();
+
+ Ok(chunk)
+}
diff --git a/crates/re_chunk/src/batcher.rs b/crates/re_chunk/src/batcher.rs
index 284eb5800b41..182a799891a3 100644
--- a/crates/re_chunk/src/batcher.rs
+++ b/crates/re_chunk/src/batcher.rs
@@ -5,14 +5,14 @@ use std::{
time::{Duration, Instant},
};
-use arrow2::array::Array as ArrowArray;
+use arrow2::array::{Array as ArrowArray, PrimitiveArray as ArrowPrimitiveArray};
use crossbeam::channel::{Receiver, Sender};
use nohash_hasher::IntMap;
-use re_log_types::{EntityPath, RowId, TimePoint, Timeline};
+use re_log_types::{EntityPath, ResolvedTimeRange, TimeInt, TimePoint, Timeline};
use re_types_core::{ComponentName, SizeBytes as _};
-use crate::{arrays_to_list_array, chunk::ChunkResult, Chunk, ChunkId, ChunkTimeline};
+use crate::{Chunk, ChunkId, ChunkResult, ChunkTimeline, RowId};
// ---
@@ -551,6 +551,10 @@ fn batching_thread(config: ChunkBatcherConfig, rx_cmd: Receiver<Command>, tx_chu
re_format::format_bytes(config.flush_num_bytes as _),
);
+ // Set to `true` when a flush is triggered for a reason other than hitting the time threshold,
+ // so that the next tick will not unnecessarily fire early.
+ let mut skip_next_tick = false;
+
use crossbeam::select;
loop {
select! {
@@ -574,12 +578,15 @@ fn batching_thread(config: ChunkBatcherConfig, rx_cmd: Receiver<Command>, tx_chu
if acc.pending_rows.len() as u64 >= config.flush_num_rows {
do_flush_all(acc, &tx_chunk, "rows", config.max_chunk_rows_if_unsorted);
+ skip_next_tick = true;
} else if acc.pending_num_bytes >= config.flush_num_bytes {
do_flush_all(acc, &tx_chunk, "bytes", config.max_chunk_rows_if_unsorted);
+ skip_next_tick = true;
}
},
Command::Flush(oneshot) => {
+ skip_next_tick = true;
for acc in accs.values_mut() {
do_flush_all(acc, &tx_chunk, "manual", config.max_chunk_rows_if_unsorted);
}
@@ -591,9 +598,13 @@ fn batching_thread(config: ChunkBatcherConfig, rx_cmd: Receiver, tx_chu
},
recv(rx_tick) -> _ => {
- // TODO(cmc): It would probably be better to have a ticker per entity path. Maybe. At some point.
- for acc in accs.values_mut() {
- do_flush_all(acc, &tx_chunk, "tick", config.max_chunk_rows_if_unsorted);
+ if skip_next_tick {
+ skip_next_tick = false;
+ } else {
+ // TODO(cmc): It would probably be better to have a ticker per entity path. Maybe. At some point.
+ for acc in accs.values_mut() {
+ do_flush_all(acc, &tx_chunk, "tick", config.max_chunk_rows_if_unsorted);
+ }
}
},
};
@@ -678,23 +689,26 @@ impl PendingRow {
let timelines = timepoint
.into_iter()
- .filter_map(|(timeline, time)| {
- ChunkTimeline::new(Some(true), vec![time]).map(|time_chunk| (timeline, time_chunk))
+ .map(|(timeline, time)| {
+ let times = ArrowPrimitiveArray::<i64>::from_vec(vec![time.as_i64()]);
+ let time_chunk = ChunkTimeline::new(Some(true), timeline, times);
+ (timeline, time_chunk)
})
.collect();
let components = components
.into_iter()
.filter_map(|(component_name, array)| {
- arrays_to_list_array(&[Some(&*array as _)]).map(|array| (component_name, array))
+ crate::util::arrays_to_list_array_opt(&[Some(&*array as _)])
+ .map(|array| (component_name, array))
})
.collect();
- Chunk::new(
+ Chunk::from_native_row_ids(
ChunkId::new(),
entity_path,
Some(true),
- vec![row_id],
+ &[row_id],
timelines,
components,
)
@@ -772,7 +786,7 @@ impl PendingRow {
re_tracing::profile_scope!("iterate per datatype set");
let mut row_ids: Vec<RowId> = Vec::with_capacity(rows.len());
- let mut timelines: BTreeMap<Timeline, ChunkTimeline> = BTreeMap::default();
+ let mut timelines: BTreeMap<Timeline, PendingChunkTimeline> = BTreeMap::default();
// Create all the logical list arrays that we're going to need, accounting for the
// possibility of sparse components in the data.
@@ -798,22 +812,27 @@ impl PendingRow {
// the pre-configured `max_chunk_rows_if_unsorted` threshold, then split _even_
// further!
for (&timeline, _) in row_timepoint {
- let time_chunk = timelines.entry(timeline).or_default();
+ let time_chunk = timelines
+ .entry(timeline)
+ .or_insert_with(|| PendingChunkTimeline::new(timeline));
if !row_ids.is_empty() // just being extra cautious
&& row_ids.len() as u64 >= max_chunk_rows_if_unsorted
- && !time_chunk.is_sorted()
+ && !time_chunk.is_sorted
{
- chunks.push(Chunk::new(
+ chunks.push(Chunk::from_native_row_ids(
ChunkId::new(),
entity_path.clone(),
Some(true),
- std::mem::take(&mut row_ids),
- std::mem::take(&mut timelines),
+ &std::mem::take(&mut row_ids),
+ std::mem::take(&mut timelines)
+ .into_iter()
+ .map(|(timeline, time_chunk)| (timeline, time_chunk.finish()))
+ .collect(),
std::mem::take(&mut components)
.into_iter()
.filter_map(|(component_name, arrays)| {
- arrays_to_list_array(&arrays)
+ crate::util::arrays_to_list_array_opt(&arrays)
.map(|list_array| (component_name, list_array))
})
.collect(),
@@ -826,7 +845,9 @@ impl PendingRow {
row_ids.push(*row_id);
for (&timeline, &time) in row_timepoint {
- let time_chunk = timelines.entry(timeline).or_default();
+ let time_chunk = timelines
+ .entry(timeline)
+ .or_insert_with(|| PendingChunkTimeline::new(timeline));
time_chunk.push(time);
}
@@ -841,16 +862,19 @@ impl PendingRow {
}
}
- chunks.push(Chunk::new(
+ chunks.push(Chunk::from_native_row_ids(
ChunkId::new(),
entity_path.clone(),
Some(true),
- row_ids,
- timelines,
+ &std::mem::take(&mut row_ids),
+ timelines
+ .into_iter()
+ .map(|(timeline, time_chunk)| (timeline, time_chunk.finish()))
+ .collect(),
components
.into_iter()
.filter_map(|(component_name, arrays)| {
- arrays_to_list_array(&arrays)
+ crate::util::arrays_to_list_array_opt(&arrays)
.map(|list_array| (component_name, list_array))
})
.collect(),
@@ -862,6 +886,58 @@ impl PendingRow {
}
}
+/// Helper class used to buffer time data.
+///
+/// See [`PendingRow::many_into_chunks`] for usage.
+struct PendingChunkTimeline {
+ timeline: Timeline,
+ times: Vec<i64>,
+ is_sorted: bool,
+ time_range: ResolvedTimeRange,
+}
+
+impl PendingChunkTimeline {
+ fn new(timeline: Timeline) -> Self {
+ Self {
+ timeline,
+ times: Default::default(),
+ is_sorted: true,
+ time_range: ResolvedTimeRange::EMPTY,
+ }
+ }
+
+ /// Push a single time value at the end of this chunk.
+ fn push(&mut self, time: TimeInt) {
+ let Self {
+ timeline: _,
+ times,
+ is_sorted,
+ time_range,
+ } = self;
+
+ *is_sorted &= times.last().copied().unwrap_or(TimeInt::MIN.as_i64()) <= time.as_i64();
+ time_range.set_min(TimeInt::min(time_range.min(), time));
+ time_range.set_max(TimeInt::max(time_range.max(), time));
+ times.push(time.as_i64());
+ }
+
+ fn finish(self) -> ChunkTimeline {
+ let Self {
+ timeline,
+ times,
+ is_sorted,
+ time_range,
+ } = self;
+
+ ChunkTimeline {
+ timeline,
+ times: ArrowPrimitiveArray::<i64>::from_vec(times).to(timeline.datatype()),
+ is_sorted,
+ time_range,
+ }
+ }
+}
+
// ---
// NOTE:
@@ -871,12 +947,8 @@ impl PendingRow {
#[cfg(test)]
mod tests {
use crossbeam::channel::TryRecvError;
- use itertools::Itertools as _;
- use re_log_types::{
- example_components::{MyPoint, MyPoint64},
- TimeInt,
- };
+ use re_log_types::example_components::{MyPoint, MyPoint64};
use re_types_core::Loggable as _;
use super::*;
@@ -922,7 +994,7 @@ mod tests {
chunks.push(chunk);
}
- chunks.sort_by_key(|chunk| chunk.row_id_range().0);
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
// Make the programmer's life easier if this test fails.
eprintln!("Chunks:");
@@ -938,22 +1010,92 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- [42, 43, 44]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![42, 43, 44]),
+ ),
+ )];
+ let expected_components = [(
+ MyPoint::name(),
+ crate::util::arrays_to_list_array_opt(&[&*points1, &*points2, &*points3].map(Some))
+ .unwrap(),
)];
+ let expected_chunk = Chunk::from_native_row_ids(
+ chunks[0].id,
+ entity_path1.clone(),
+ None,
+ &expected_row_ids,
+ expected_timelines.into_iter().collect(),
+ expected_components.into_iter().collect(),
+ )?;
+
+ eprintln!("Expected:\n{expected_chunk}");
+ eprintln!("Got:\n{}", chunks[0]);
+ assert_eq!(expected_chunk, chunks[0]);
+ }
+
+ Ok(())
+ }
+
+ /// A bunch of rows that don't fit any of the split conditions should end up together.
+ #[test]
+ fn simple_static() -> anyhow::Result<()> {
+ let batcher = ChunkBatcher::new(ChunkBatcherConfig::NEVER)?;
+
+ let timeless = TimePoint::default();
+
+ let points1 = MyPoint::to_arrow([MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)])?;
+ let points2 = MyPoint::to_arrow([MyPoint::new(10.0, 20.0), MyPoint::new(30.0, 40.0)])?;
+ let points3 = MyPoint::to_arrow([MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)])?;
+
+ let components1 = [(MyPoint::name(), points1.clone())];
+ let components2 = [(MyPoint::name(), points2.clone())];
+ let components3 = [(MyPoint::name(), points3.clone())];
+
+ let row1 = PendingRow::new(timeless.clone(), components1.into());
+ let row2 = PendingRow::new(timeless.clone(), components2.into());
+ let row3 = PendingRow::new(timeless.clone(), components3.into());
+
+ let entity_path1: EntityPath = "a/b/c".into();
+ batcher.push_row(entity_path1.clone(), row1.clone());
+ batcher.push_row(entity_path1.clone(), row2.clone());
+ batcher.push_row(entity_path1.clone(), row3.clone());
+
+ let chunks_rx = batcher.chunks();
+ drop(batcher); // flush and close
+
+ let mut chunks = Vec::new();
+ loop {
+ let chunk = match chunks_rx.try_recv() {
+ Ok(chunk) => chunk,
+ Err(TryRecvError::Empty) => panic!("expected chunk, got none"),
+ Err(TryRecvError::Disconnected) => break,
+ };
+ chunks.push(chunk);
+ }
+
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
+
+ // Make the programmer's life easier if this test fails.
+ eprintln!("Chunks:");
+ for chunk in &chunks {
+ eprintln!("{chunk}");
+ }
+
+ assert_eq!(1, chunks.len());
+
+ {
+ let expected_row_ids = vec![row1.row_id, row2.row_id, row3.row_id];
+ let expected_timelines = [];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points1, &*points2, &*points3].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points1, &*points2, &*points3].map(Some))
+ .unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[0].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1008,7 +1150,7 @@ mod tests {
chunks.push(chunk);
}
- chunks.sort_by_key(|chunk| chunk.row_id_range().0);
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
// Make the programmer's life easier if this test fails.
eprintln!("Chunks:");
@@ -1024,22 +1166,19 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- [42, 44]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![42, 44]),
+ ),
)];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points1, &*points3].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points1, &*points3].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[0].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1055,19 +1194,19 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- std::iter::once(43).map(TimeInt::new_temporal).collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![43]),
+ ),
)];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points2].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points2].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[1].id,
entity_path2.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1126,7 +1265,7 @@ mod tests {
chunks.push(chunk);
}
- chunks.sort_by_key(|chunk| chunk.row_id_range().0);
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
// Make the programmer's life easier if this test fails.
eprintln!("Chunks:");
@@ -1142,19 +1281,19 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- std::iter::once(42).map(TimeInt::new_temporal).collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![42]),
+ ),
)];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points1].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points1].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[0].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1171,34 +1310,28 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- [43, 44]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![43, 44]),
+ ),
),
(
timeline2,
ChunkTimeline::new(
Some(true),
- [1000, 1001]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline2,
+ ArrowPrimitiveArray::from_vec(vec![1000, 1001]),
+ ),
),
];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points2, &*points3].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points2, &*points3].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[1].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1253,7 +1386,7 @@ mod tests {
chunks.push(chunk);
}
- chunks.sort_by_key(|chunk| chunk.row_id_range().0);
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
// Make the programmer's life easier if this test fails.
eprintln!("Chunks:");
@@ -1269,22 +1402,19 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- [42, 44]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![42, 44]),
+ ),
)];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points1, &*points3].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points1, &*points3].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[0].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1300,19 +1430,19 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- std::iter::once(43).map(TimeInt::new_temporal).collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![43]),
+ ),
)];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points2].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points2].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[1].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1385,7 +1515,7 @@ mod tests {
chunks.push(chunk);
}
- chunks.sort_by_key(|chunk| chunk.row_id_range().0);
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
// Make the programmer's life easier if this test fails.
eprintln!("Chunks:");
@@ -1402,35 +1532,31 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(false),
- [45, 42, 43, 44]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![45, 42, 43, 44]),
+ ),
),
(
timeline2,
ChunkTimeline::new(
Some(false),
- [1003, 1000, 1001, 1002]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline2,
+ ArrowPrimitiveArray::from_vec(vec![1003, 1000, 1001, 1002]),
+ ),
),
];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points1, &*points2, &*points3, &*points4].map(Some))
- .unwrap(),
+ crate::util::arrays_to_list_array_opt(
+ &[&*points1, &*points2, &*points3, &*points4].map(Some),
+ )
+ .unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[0].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1503,7 +1629,7 @@ mod tests {
chunks.push(chunk);
}
- chunks.sort_by_key(|chunk| chunk.row_id_range().0);
+ chunks.sort_by_key(|chunk| chunk.row_id_range().unwrap().0);
// Make the programmer's life easier if this test fails.
eprintln!("Chunks:");
@@ -1520,34 +1646,29 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(false),
- [45, 42, 43]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![45, 42, 43]),
+ ),
),
(
timeline2,
ChunkTimeline::new(
Some(false),
- [1003, 1000, 1001]
- .into_iter()
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline2,
+ ArrowPrimitiveArray::from_vec(vec![1003, 1000, 1001]),
+ ),
),
];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points1, &*points2, &*points3].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points1, &*points2, &*points3].map(Some))
+ .unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[0].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
@@ -1564,30 +1685,28 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- std::iter::once(44).map(TimeInt::new_temporal).collect_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::from_vec(vec![44]),
+ ),
),
(
timeline2,
ChunkTimeline::new(
Some(true),
- std::iter::once(1002)
- .map(TimeInt::new_temporal)
- .collect_vec(),
- )
- .unwrap(),
+ timeline2,
+ ArrowPrimitiveArray::from_vec(vec![1002]),
+ ),
),
];
let expected_components = [(
MyPoint::name(),
- arrays_to_list_array(&[&*points4].map(Some)).unwrap(),
+ crate::util::arrays_to_list_array_opt(&[&*points4].map(Some)).unwrap(),
)];
- let expected_chunk = Chunk::new(
+ let expected_chunk = Chunk::from_native_row_ids(
chunks[1].id,
entity_path1.clone(),
None,
- expected_row_ids,
+ &expected_row_ids,
expected_timelines.into_iter().collect(),
expected_components.into_iter().collect(),
)?;
diff --git a/crates/re_chunk/src/builder.rs b/crates/re_chunk/src/builder.rs
new file mode 100644
index 000000000000..496f59cc2633
--- /dev/null
+++ b/crates/re_chunk/src/builder.rs
@@ -0,0 +1,345 @@
+use std::collections::BTreeMap;
+
+use arrow2::{
+ array::{Array as ArrowArray, PrimitiveArray as ArrowPrimitiveArray},
+ datatypes::DataType as ArrowDatatype,
+};
+use itertools::Itertools;
+
+use nohash_hasher::IntMap;
+use re_log_types::{EntityPath, TimeInt, TimePoint, Timeline};
+use re_types_core::{AsComponents, ComponentBatch, ComponentName};
+
+use crate::{Chunk, ChunkId, ChunkResult, ChunkTimeline, RowId};
+
+// ---
+
+/// Helper to incrementally build a [`Chunk`].
+///
+/// Can be created using [`Chunk::builder`].
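+///
+/// A minimal usage sketch (illustrative only; it mirrors `examples/latest_at.rs` and assumes the
+/// test-only `MyPoint` component from `re_log_types::example_components`):
+///
+/// ```ignore
+/// let chunk = Chunk::builder("my/entity".into())
+///     .with_component_batches(
+///         RowId::new(),
+///         [(Timeline::new_sequence("frame"), 1)],
+///         [&[MyPoint::new(1.0, 1.0)] as _],
+///     )
+///     .build()?;
+/// ```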
+pub struct ChunkBuilder {
+ id: ChunkId,
+ entity_path: EntityPath,
+
+ row_ids: Vec<RowId>,
+ timelines: BTreeMap<Timeline, ChunkTimelineBuilder>,
+ components: BTreeMap<ComponentName, Vec<Option<Box<dyn ArrowArray>>>>,
+}
+
+impl Chunk {
+ /// Initializes a new [`ChunkBuilder`].
+ #[inline]
+ pub fn builder(entity_path: EntityPath) -> ChunkBuilder {
+ ChunkBuilder::new(ChunkId::new(), entity_path)
+ }
+
+ /// Initializes a new [`ChunkBuilder`].
+ ///
+ /// The final [`Chunk`] will have the specified `id`.
+ #[inline]
+ pub fn builder_with_id(id: ChunkId, entity_path: EntityPath) -> ChunkBuilder {
+ ChunkBuilder::new(id, entity_path)
+ }
+}
+
+impl ChunkBuilder {
+ /// Initializes a new [`ChunkBuilder`].
+ ///
+ /// See also [`Chunk::builder`].
+ #[inline]
+ pub fn new(id: ChunkId, entity_path: EntityPath) -> Self {
+ Self {
+ id,
+ entity_path,
+
+ row_ids: Vec::new(),
+ timelines: BTreeMap::new(),
+ components: BTreeMap::new(),
+ }
+ }
+
+ /// Add a row's worth of data using the given sparse component data.
+ pub fn with_sparse_row(
+ mut self,
+ row_id: RowId,
+ timepoint: impl Into<TimePoint>,
+ components: impl IntoIterator<Item = (ComponentName, Option<Box<dyn ArrowArray>>)>,
+ ) -> Self {
+ let components = components.into_iter().collect_vec();
+
+ // Align all columns by appending null values for rows where we don't have data.
+ for (component_name, _) in &components {
+ let arrays = self.components.entry(*component_name).or_default();
+ arrays.extend(
+ std::iter::repeat(None).take(self.row_ids.len().saturating_sub(arrays.len())),
+ );
+ }
+
+ self.row_ids.push(row_id);
+
+ for (timeline, time) in timepoint.into() {
+ self.timelines
+ .entry(timeline)
+ .or_insert_with(|| ChunkTimeline::builder(timeline))
+ .with_row(time);
+ }
+
+ for (component_name, array) in components {
+ self.components
+ .entry(component_name)
+ .or_default()
+ .push(array);
+ }
+
+ // Align all columns by appending null values for rows where we don't have data.
+ for arrays in self.components.values_mut() {
+ arrays.extend(
+ std::iter::repeat(None).take(self.row_ids.len().saturating_sub(arrays.len())),
+ );
+ }
+
+ self
+ }
+
+ /// Add a row's worth of data using the given component data.
+ #[inline]
+ pub fn with_row(
+ self,
+ row_id: RowId,
+ timepoint: impl Into<TimePoint>,
+ components: impl IntoIterator<Item = (ComponentName, Box<dyn ArrowArray>)>,
+ ) -> Self {
+ self.with_sparse_row(
+ row_id,
+ timepoint,
+ components
+ .into_iter()
+ .map(|(component_name, array)| (component_name, Some(array))),
+ )
+ }
+
+ /// Add a row's worth of data by destructuring an archetype into component columns.
+ #[inline]
+ pub fn with_archetype(
+ self,
+ row_id: RowId,
+ timepoint: impl Into<TimePoint>,
+ as_components: &dyn AsComponents,
+ ) -> Self {
+ let batches = as_components.as_component_batches();
+ self.with_component_batches(
+ row_id,
+ timepoint,
+ batches.iter().map(|batch| batch.as_ref()),
+ )
+ }
+
+ /// Add a row's worth of data by serializing a single [`ComponentBatch`].
+ #[inline]
+ pub fn with_component_batch(
+ self,
+ row_id: RowId,
+ timepoint: impl Into<TimePoint>,
+ component_batch: &dyn ComponentBatch,
+ ) -> Self {
+ self.with_row(
+ row_id,
+ timepoint,
+ component_batch
+ .to_arrow()
+ .ok()
+ .map(|array| (component_batch.name(), array)),
+ )
+ }
+
+ /// Add a row's worth of data by serializing many [`ComponentBatch`]es.
+ #[inline]
+ pub fn with_component_batches<'a>(
+ self,
+ row_id: RowId,
+ timepoint: impl Into<TimePoint>,
+ component_batches: impl IntoIterator<Item = &'a dyn ComponentBatch>,
+ ) -> Self {
+ self.with_row(
+ row_id,
+ timepoint,
+ component_batches.into_iter().filter_map(|component_batch| {
+ component_batch
+ .to_arrow()
+ .ok()
+ .map(|array| (component_batch.name(), array))
+ }),
+ )
+ }
+
+ /// Add a row's worth of data by serializing many sparse [`ComponentBatch`]es.
+ #[inline]
+ pub fn with_sparse_component_batches<'a>(
+ self,
+ row_id: RowId,
+ timepoint: impl Into<TimePoint>,
+ component_batches: impl IntoIterator<Item = (ComponentName, Option<&'a dyn ComponentBatch>)>,
+ ) -> Self {
+ self.with_sparse_row(
+ row_id,
+ timepoint,
+ component_batches
+ .into_iter()
+ .map(|(component_name, component_batch)| {
+ (
+ component_name,
+ component_batch.and_then(|batch| batch.to_arrow().ok()),
+ )
+ }),
+ )
+ }
+
+ /// Builds and returns the final [`Chunk`].
+ ///
+ /// The arrow datatype of each individual column will be guessed by inspecting the data.
+ ///
+ /// If any component column turns out to be fully sparse (i.e. only null values), that column
+ /// will be stripped out (how could we guess its datatype without any single value to inspect?).
+ ///
+ /// This is generally the desired behavior but, if you want to make sure to keep fully sparse
+ /// columns (can be useful e.g. for testing purposes), see [`ChunkBuilder::build_with_datatypes`]
+ /// instead.
+ ///
+ /// This returns an error if the chunk fails to `sanity_check`.
+ #[inline]
+ pub fn build(self) -> ChunkResult<Chunk> {
+ let Self {
+ id,
+ entity_path,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ Chunk::from_native_row_ids(
+ id,
+ entity_path,
+ None,
+ &row_ids,
+ timelines
+ .into_iter()
+ .map(|(timeline, time_chunk)| (timeline, time_chunk.build()))
+ .collect(),
+ components
+ .into_iter()
+ .filter_map(|(component_name, arrays)| {
+ let arrays = arrays.iter().map(|array| array.as_deref()).collect_vec();
+ crate::util::arrays_to_list_array_opt(&arrays)
+ .map(|list_array| (component_name, list_array))
+ })
+ .collect(),
+ )
+ }
+
+ /// Builds and returns the final [`Chunk`].
+ ///
+ /// The arrow datatype of each individual column will be guessed by inspecting the data.
+ ///
+ /// If any component column turns out to be fully sparse (i.e. only null values), `datatypes`
+ /// will be used as a fallback.
+ ///
+ /// If any component column turns out to be fully sparse (i.e. only null values) _and_ doesn't
+ /// have an explicit datatype passed in, that column will be stripped out (how could we guess
+ /// its datatype without any single value to inspect?).
+ ///
+ /// You should rarely want to keep fully sparse columns around outside of testing scenarios.
+ /// See [`Self::build`].
+ ///
+ /// This returns an error if the chunk fails to `sanity_check`.
+ #[inline]
+ pub fn build_with_datatypes(
+ self,
+ datatypes: &IntMap<ComponentName, ArrowDatatype>,
+ ) -> ChunkResult<Chunk> {
+ let Self {
+ id,
+ entity_path,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ Chunk::from_native_row_ids(
+ id,
+ entity_path,
+ None,
+ &row_ids,
+ timelines
+ .into_iter()
+ .map(|(timeline, time_chunk)| (timeline, time_chunk.build()))
+ .collect(),
+ components
+ .into_iter()
+ .filter_map(|(component_name, arrays)| {
+ let arrays = arrays.iter().map(|array| array.as_deref()).collect_vec();
+
+ // If we know the datatype in advance, we're able to keep even fully sparse
+ // columns around.
+ if let Some(datatype) = datatypes.get(&component_name) {
+ crate::util::arrays_to_list_array(datatype.clone(), &arrays)
+ .map(|list_array| (component_name, list_array))
+ } else {
+ crate::util::arrays_to_list_array_opt(&arrays)
+ .map(|list_array| (component_name, list_array))
+ }
+ })
+ .collect(),
+ )
+ }
+}
+
+// ---
+
+/// Helper to incrementally build a [`ChunkTimeline`].
+///
+/// Can be created using [`ChunkTimeline::builder`].
+pub struct ChunkTimelineBuilder {
+ timeline: Timeline,
+
+ times: Vec<i64>,
+}
+
+impl ChunkTimeline {
+ /// Initializes a new [`ChunkTimelineBuilder`].
+ #[inline]
+ pub fn builder(timeline: Timeline) -> ChunkTimelineBuilder {
+ ChunkTimelineBuilder::new(timeline)
+ }
+}
+
+impl ChunkTimelineBuilder {
+ /// Initializes a new [`ChunkTimelineBuilder`].
+ ///
+ /// See also [`ChunkTimeline::builder`].
+ #[inline]
+ pub fn new(timeline: Timeline) -> Self {
+ Self {
+ timeline,
+ times: Vec::new(),
+ }
+ }
+
+ /// Add a row's worth of time data using the given timestamp.
+ #[inline]
+ pub fn with_row(&mut self, time: TimeInt) -> &mut Self {
+ let Self { timeline: _, times } = self;
+
+ times.push(time.as_i64());
+
+ self
+ }
+
+ /// Builds and returns the final [`ChunkTimeline`].
+ #[inline]
+ pub fn build(self) -> ChunkTimeline {
+ let Self { timeline, times } = self;
+
+ let times = ArrowPrimitiveArray::<i64>::from_vec(times).to(timeline.datatype());
+ ChunkTimeline::new(None, timeline, times)
+ }
+}
diff --git a/crates/re_chunk/src/chunk.rs b/crates/re_chunk/src/chunk.rs
index 4d0b84b00e5c..651ec3af60e0 100644
--- a/crates/re_chunk/src/chunk.rs
+++ b/crates/re_chunk/src/chunk.rs
@@ -1,9 +1,18 @@
-use std::collections::BTreeMap;
+use std::{
+ collections::BTreeMap,
+ sync::atomic::{AtomicU64, Ordering},
+};
-use arrow2::array::Array as ArrowArray;
+use arrow2::array::{
+ Array as ArrowArray, ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray,
+ StructArray as ArrowStructArray,
+};
-use re_log_types::{EntityPath, ResolvedTimeRange, RowId, TimeInt, TimePoint, Timeline};
-use re_types_core::{ComponentName, SerializationError};
+use itertools::{izip, Itertools};
+use re_log_types::{EntityPath, ResolvedTimeRange, TimeInt, TimePoint, Timeline};
+use re_types_core::{ComponentName, Loggable, LoggableBatch, SerializationError, SizeBytes};
+
+use crate::{ChunkId, RowId};
// ---
@@ -16,18 +25,12 @@ pub enum ChunkError {
#[error(transparent)]
Serialization(#[from] SerializationError),
-
- #[error("Chunks cannot be empty")]
- Empty,
}
pub type ChunkResult<T> = Result<T, ChunkError>;
// ---
-/// Unique identifier for a [`Chunk`], using a [`re_tuid::Tuid`].
-pub type ChunkId = re_tuid::Tuid;
-
/// Dense arrow-based storage of N rows of multi-component multi-temporal data for a specific entity.
///
/// This is our core datastructure for logging, storing, querying and transporting data around.
@@ -37,16 +40,23 @@ pub type ChunkId = re_tuid::Tuid;
///
/// This is the in-memory representation of a chunk, optimized for efficient manipulation of the
/// data within. For transport, see [`crate::TransportChunk`] instead.
-#[derive(Debug, Clone)]
+#[derive(Debug)]
pub struct Chunk {
pub(crate) id: ChunkId,
+
pub(crate) entity_path: EntityPath,
+ /// The heap size of this chunk in bytes.
+ ///
+ /// Must be cached as it is very costly to compute, and needs to be computed repeatedly on the
+ /// hot path (e.g. during garbage collection).
+ pub(crate) heap_size_bytes: AtomicU64,
+
/// Is the chunk as a whole sorted by [`RowId`]?
pub(crate) is_sorted: bool,
/// The respective [`RowId`]s for each row of data.
- pub(crate) row_ids: Vec<RowId>,
+ pub(crate) row_ids: ArrowStructArray,
/// The time columns.
///
@@ -60,92 +70,238 @@ pub struct Chunk {
/// Each `ListArray` must be the same length as `row_ids`.
///
/// Sparse so that we can e.g. log a `Position` at one timestamp but not a `Color`.
- pub(crate) components: BTreeMap<ComponentName, Box<dyn ArrowArray>>,
+ //
+ // TODO(#6576): support non-list based columns?
+ pub(crate) components: BTreeMap<ComponentName, ArrowListArray<i32>>,
}
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ChunkTimeline {
- /// Every single timestamp for this timeline.
- ///
- /// * This might or might not be sorted, depending on how the data was logged.
- /// * This is guaranteed to always be dense, because chunks are split anytime a timeline is
- /// added or removed.
- /// * This can never contain `TimeInt::STATIC`, since static data doesn't even have timelines.
- //
- // TODO(cmc): maybe this would be better as raw i64s so getting time columns in and out of
- // chunks is just a blind memcpy… it's probably not worth the hassle for now though.
- // We'll see how things evolve as we start putting chunks in the backend.
- pub(crate) times: Vec<TimeInt>,
+impl PartialEq for Chunk {
+ #[inline]
+ fn eq(&self, other: &Self) -> bool {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
- /// Is [`Self::times`] sorted?
- ///
- /// This is completely independent of [`Chunk::is_sorted`]: a timeline doesn't necessarily
- /// follow the global [`RowId`]-based order, although it does in most cases (happy path).
- pub(crate) is_sorted: bool,
+ *id == other.id
+ && *entity_path == other.entity_path
+ && *is_sorted == other.is_sorted
+ && *row_ids == other.row_ids
+ && *timelines == other.timelines
+ && *components == other.components
+ }
+}
- /// The time range covered by [`Self::times`].
+impl Chunk {
+ /// Returns `true` if two [`Chunk`]s are similar, although not byte-for-byte equal.
///
- /// Not necessarily contiguous! Just the min and max value found in [`Self::times`].
- pub(crate) time_range: ResolvedTimeRange,
+ /// In particular, this ignores chunk IDs and row IDs, as well as temporal timestamps.
+ ///
+ /// Useful for tests.
+ pub fn are_similar(lhs: &Self, rhs: &Self) -> bool {
+ let Self {
+ id: _,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted: _,
+ row_ids: _,
+ timelines,
+ components,
+ } = lhs;
+
+ *entity_path == rhs.entity_path
+ && timelines.keys().collect_vec() == rhs.timelines.keys().collect_vec()
+ && {
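+ // NOTE: Only sequence-like timelines are compared by value below; wall-clock (`Time`)
+ // timelines are filtered out, since their timestamps differ from run to run.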
+ let timelines: BTreeMap<_, _> = timelines
+ .iter()
+ .filter(|(timeline, _time_chunk)| {
+ timeline.typ() != re_log_types::TimeType::Time
+ })
+ .collect();
+ let rhs_timelines: BTreeMap<_, _> = rhs
+ .timelines
+ .iter()
+ .filter(|(timeline, _time_chunk)| {
+ timeline.typ() != re_log_types::TimeType::Time
+ })
+ .collect();
+ timelines == rhs_timelines
+ }
+ && *components == rhs.components
+ }
}
-impl Default for ChunkTimeline {
+impl Clone for Chunk {
#[inline]
- fn default() -> Self {
+ fn clone(&self) -> Self {
Self {
- times: Default::default(),
- is_sorted: true,
- time_range: ResolvedTimeRange::EMPTY,
+ id: self.id,
+ entity_path: self.entity_path.clone(),
+ heap_size_bytes: AtomicU64::new(self.heap_size_bytes.load(Ordering::Relaxed)),
+ is_sorted: self.is_sorted,
+ row_ids: self.row_ids.clone(),
+ timelines: self.timelines.clone(),
+ components: self.components.clone(),
}
}
}
-#[cfg(test)] // do not ever use this outside internal testing, it's extremely slow and hackish
-impl PartialEq for Chunk {
+impl Chunk {
+ /// Clones the chunk and assign new IDs to the resulting chunk and its rows.
+ ///
+ /// `first_row_id` will become the [`RowId`] of the first row in the duplicated chunk.
+ /// Each row after that will be monotonically increasing.
#[inline]
- fn eq(&self, rhs: &Self) -> bool {
- let Self {
- id: _, // we're comparing the contents
- entity_path,
- is_sorted,
+ pub fn clone_as(&self, id: ChunkId, first_row_id: RowId) -> Self {
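+ // Generate one new `RowId` per existing row, monotonically increasing from `first_row_id`.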
+ let row_ids = std::iter::from_fn({
+ let mut row_id = first_row_id;
+ move || {
+ let yielded = row_id;
+ row_id = row_id.next();
+ Some(yielded)
+ }
+ })
+ .take(self.row_ids.len())
+ .collect_vec();
+
+ #[allow(clippy::unwrap_used)]
+ let row_ids = <RowId as Loggable>::to_arrow(&row_ids)
+ // Unwrap: native RowIds cannot fail to serialize.
+ .unwrap()
+ .as_any()
+ .downcast_ref::<ArrowStructArray>()
+ // Unwrap: RowId schema is known in advance to be a struct array -- always.
+ .unwrap()
+ .clone();
+
+ Self {
+ id,
row_ids,
- timelines,
- components,
- } = self;
+ ..self.clone()
+ }
+ }
- use itertools::Itertools as _;
+ /// Clones the chunk into a new chunk without any time data.
+ #[inline]
+ pub fn into_static(mut self) -> Self {
+ self.timelines.clear();
+ self
+ }
- *entity_path == rhs.entity_path
- && *is_sorted == rhs.is_sorted
- && *row_ids == rhs.row_ids
- && *timelines == rhs.timelines
- && components.keys().collect_vec() == rhs.components.keys().collect_vec()
- && components.iter().all(|(component_name, list_array)| {
- let Some(rhs_list_array) = rhs
- .components
- .get(component_name)
- .map(|list_array| &**list_array)
- else {
- return false;
- };
-
- // `arrow2::compute::comparison` has very limited support for the different arrow
- // types, so we just do our best here.
- // This is just a testing/debugging tool.
- if arrow2::compute::comparison::can_eq(list_array.data_type()) {
- arrow2::compute::comparison::eq(&**list_array, rhs_list_array)
- .values_iter()
- .all(|v| v)
- } else {
- list_array.data_type() == rhs_list_array.data_type()
- && list_array.len() == rhs_list_array.len()
- }
+ /// Computes the time range covered by each individual component column on each timeline.
+ ///
+ /// This is different from the time range covered by the [`Chunk`] as a whole because component
+ /// columns are potentially sparse.
+ ///
+ /// This is crucial for indexing and queries to work properly.
+ //
+ // TODO(cmc): This needs to be stored in chunk metadata and transported across IPC.
+ #[inline]
+ pub fn time_range_per_component(
+ &self,
+ ) -> BTreeMap<Timeline, BTreeMap<ComponentName, ResolvedTimeRange>> {
+ re_tracing::profile_function!();
+
+ self.timelines
+ .iter()
+ .map(|(&timeline, time_chunk)| {
+ (
+ timeline,
+ time_chunk.time_range_per_component(&self.components),
+ )
})
+ .collect()
+ }
+
+ /// Computes the `RowId` range covered by each individual component column on each timeline.
+ ///
+ /// This is different from the `RowId` range covered by the [`Chunk`] as a whole because component
+ /// columns are potentially sparse.
+ ///
+ /// This is crucial for indexing and queries to work properly.
+ //
+ // TODO(cmc): This needs to be stored in chunk metadata and transported across IPC.
+ pub fn row_id_range_per_component(&self) -> BTreeMap<ComponentName, (RowId, RowId)> {
+ re_tracing::profile_function!();
+
+ let row_ids = self.row_ids().collect_vec();
+
+ if self.is_sorted() {
+ self.components
+ .iter()
+ .filter_map(|(component_name, list_array)| {
+ let mut row_id_min = None;
+ let mut row_id_max = None;
+
+ for (i, &row_id) in row_ids.iter().enumerate() {
+ if list_array.is_valid(i) {
+ row_id_min = Some(row_id);
+ }
+ }
+ for (i, &row_id) in row_ids.iter().enumerate().rev() {
+ if list_array.is_valid(i) {
+ row_id_max = Some(row_id);
+ }
+ }
+
+ Some((*component_name, (row_id_min?, row_id_max?)))
+ })
+ .collect()
+ } else {
+ self.components
+ .iter()
+ .filter_map(|(component_name, list_array)| {
+ let mut row_id_min = Some(RowId::MAX);
+ let mut row_id_max = Some(RowId::ZERO);
+
+ for (i, &row_id) in row_ids.iter().enumerate() {
+ if list_array.is_valid(i) && Some(row_id) > row_id_min {
+ row_id_min = Some(row_id);
+ }
+ }
+ for (i, &row_id) in row_ids.iter().enumerate().rev() {
+ if list_array.is_valid(i) && Some(row_id) < row_id_max {
+ row_id_max = Some(row_id);
+ }
+ }
+
+ Some((*component_name, (row_id_min?, row_id_max?)))
+ })
+ .collect()
+ }
}
}
-#[cfg(test)] // do not ever use this outside internal testing, it's extremely slow and hackish
-impl Eq for Chunk {}
+// ---
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct ChunkTimeline {
+ pub(crate) timeline: Timeline,
+
+ /// Every single timestamp for this timeline.
+ ///
+ /// * This might or might not be sorted, depending on how the data was logged.
+ /// * This is guaranteed to always be dense, because chunks are split anytime a timeline is
+ /// added or removed.
+ /// * This cannot ever contain `TimeInt::STATIC`, since static data doesn't even have timelines.
+ pub(crate) times: ArrowPrimitiveArray<i64>,
+
+ /// Is [`Self::times`] sorted?
+ ///
+ /// This is completely independent of [`Chunk::is_sorted`]: a timeline doesn't necessarily
+ /// follow the global [`RowId`]-based order, although it does in most cases (happy path).
+ pub(crate) is_sorted: bool,
+
+ /// The time range covered by [`Self::times`].
+ ///
+ /// Not necessarily contiguous! Just the min and max value found in [`Self::times`].
+ pub(crate) time_range: ResolvedTimeRange,
+}
impl Chunk {
/// Creates a new [`Chunk`].
@@ -155,27 +311,23 @@ impl Chunk {
///
/// Iff you know for sure whether the data is already appropriately sorted or not, specify `is_sorted`.
/// When left unspecified (`None`), it will be computed in O(n) time.
+ ///
+ /// For a row-oriented constructor, see [`Self::builder`].
pub fn new(
id: ChunkId,
entity_path: EntityPath,
is_sorted: Option<bool>,
- row_ids: Vec<RowId>,
+ row_ids: ArrowStructArray,
timelines: BTreeMap<Timeline, ChunkTimeline>,
- components: BTreeMap<ComponentName, Box<dyn ArrowArray>>,
+ components: BTreeMap<ComponentName, ArrowListArray<i32>>,
) -> ChunkResult<Self> {
- if row_ids.is_empty() {
- return Err(ChunkError::Empty);
- }
-
let mut chunk = Self {
id,
entity_path,
+ heap_size_bytes: AtomicU64::new(0),
is_sorted: false,
row_ids,
- timelines: timelines
- .into_iter()
- .filter(|(_, time_chunk)| !time_chunk.times.is_empty())
- .collect(),
+ timelines,
components,
};
@@ -186,14 +338,50 @@ impl Chunk {
Ok(chunk)
}
+ /// Creates a new [`Chunk`].
+ ///
+ /// This will fail if the passed in data is malformed in any way -- see [`Self::sanity_check`]
+ /// for details.
+ ///
+ /// Iff you know for sure whether the data is already appropriately sorted or not, specify `is_sorted`.
+ /// When left unspecified (`None`), it will be computed in O(n) time.
+ ///
+ /// For a row-oriented constructor, see [`Self::builder`].
+ pub fn from_native_row_ids(
+ id: ChunkId,
+ entity_path: EntityPath,
+ is_sorted: Option<bool>,
+ row_ids: &[RowId],
+ timelines: BTreeMap<Timeline, ChunkTimeline>,
+ components: BTreeMap<ComponentName, ArrowListArray<i32>>,
+ ) -> ChunkResult<Self> {
+ let row_ids = row_ids
+ .to_arrow()
+ // NOTE: impossible, but better safe than sorry.
+ .map_err(|err| ChunkError::Malformed {
+ reason: format!("RowIds failed to serialize: {err}"),
+ })?
+ .as_any()
+ .downcast_ref::<ArrowStructArray>()
+ // NOTE: impossible, but better safe than sorry.
+ .ok_or_else(|| ChunkError::Malformed {
+ reason: "RowIds failed to downcast".to_owned(),
+ })?
+ .clone();
+
+ Self::new(id, entity_path, is_sorted, row_ids, timelines, components)
+ }
+
/// Simple helper for [`Self::new`] for static data.
+ ///
+ /// For a row-oriented constructor, see [`Self::builder`].
#[inline]
pub fn new_static(
id: ChunkId,
entity_path: EntityPath,
is_sorted: Option<bool>,
- row_ids: Vec<RowId>,
- components: BTreeMap<ComponentName, Box<dyn ArrowArray>>,
+ row_ids: ArrowStructArray,
+ components: BTreeMap<ComponentName, ArrowListArray<i32>>,
) -> ChunkResult<Self> {
Self::new(
id,
@@ -204,58 +392,74 @@ impl Chunk {
components,
)
}
+
+ #[inline]
+ pub fn empty(id: ChunkId, entity_path: EntityPath) -> Self {
+ Self {
+ id,
+ entity_path,
+ heap_size_bytes: Default::default(),
+ is_sorted: true,
+ row_ids: ArrowStructArray::new_empty(RowId::arrow_datatype()),
+ timelines: Default::default(),
+ components: Default::default(),
+ }
+ }
}
impl ChunkTimeline {
/// Creates a new [`ChunkTimeline`].
///
- /// Returns `None` if `times` is empty.
- ///
/// Iff you know for sure whether the data is already appropriately sorted or not, specify `is_sorted`.
/// When left unspecified (`None`), it will be computed in O(n) time.
- pub fn new(is_sorted: Option<bool>, times: Vec<TimeInt>) -> Option<Self> {
+ ///
+ /// For a row-oriented constructor, see [`Self::builder`].
+ pub fn new(
+ is_sorted: Option<bool>,
+ timeline: Timeline,
+ times: ArrowPrimitiveArray<i64>,
+ ) -> Self {
re_tracing::profile_function!(format!("{} times", times.len()));
- if times.is_empty() {
- return None;
- }
+ let times = times.to(timeline.datatype());
+ let time_slice = times.values().as_slice();
let is_sorted =
- is_sorted.unwrap_or_else(|| times.windows(2).all(|times| times[0] <= times[1]));
+ is_sorted.unwrap_or_else(|| time_slice.windows(2).all(|times| times[0] <= times[1]));
let time_range = if is_sorted {
- // NOTE: The 'or' in 'unwrap_or' is never hit, but better safe than sorry.
- let min_time = times.first().copied().unwrap_or(TimeInt::MIN);
- let max_time = times.last().copied().unwrap_or(TimeInt::MAX);
+ // NOTE: The 'or' in 'map_or' is never hit, but better safe than sorry.
+ let min_time = time_slice
+ .first()
+ .copied()
+ .map_or(TimeInt::MIN, TimeInt::new_temporal);
+ let max_time = time_slice
+ .last()
+ .copied()
+ .map_or(TimeInt::MAX, TimeInt::new_temporal);
ResolvedTimeRange::new(min_time, max_time)
} else {
// NOTE: Do the iteration multiple times in a cache-friendly way rather than the opposite.
// NOTE: The 'or' in 'unwrap_or' is never hit, but better safe than sorry.
- let min_time = times.iter().min().copied().unwrap_or(TimeInt::MIN);
- let max_time = times.iter().max().copied().unwrap_or(TimeInt::MAX);
+ let min_time = time_slice
+ .iter()
+ .min()
+ .copied()
+ .map_or(TimeInt::MIN, TimeInt::new_temporal);
+ let max_time = time_slice
+ .iter()
+ .max()
+ .copied()
+ .map_or(TimeInt::MAX, TimeInt::new_temporal);
ResolvedTimeRange::new(min_time, max_time)
};
- Some(Self {
- times,
- is_sorted,
- time_range,
- })
- }
-
- /// Push a single time value at the end of this chunk.
- #[inline]
- pub fn push(&mut self, time: TimeInt) {
- let Self {
+ Self {
+ timeline,
times,
is_sorted,
time_range,
- } = self;
-
- *is_sorted &= times.last().copied().unwrap_or(TimeInt::MIN) <= time;
- time_range.set_min(TimeInt::min(time_range.min(), time));
- time_range.set_max(TimeInt::max(time_range.max(), time));
- times.push(time);
+ }
}
}
@@ -278,6 +482,7 @@ impl Chunk {
let Self {
id: _,
entity_path: _, // not an actual column
+ heap_size_bytes: _,
is_sorted: _,
row_ids: _,
timelines,
@@ -308,23 +513,106 @@ impl Chunk {
self.row_ids.len()
}
- /// Returns the [`RowId`]-range in this [`Chunk`].
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.num_rows() == 0
+ }
+
+ /// Returns the [`RowId`]s in their raw-est form: a tuple of (times, counters) arrays.
+ #[inline]
+ pub fn row_ids_raw(&self) -> (&ArrowPrimitiveArray<u64>, &ArrowPrimitiveArray<u64>) {
+ let [times, counters] = self.row_ids.values() else {
+ panic!("RowIds are corrupt -- this should be impossible (sanity checked)");
+ };
+
+ #[allow(clippy::unwrap_used)]
+ let times = times
+ .as_any()
+ .downcast_ref::<ArrowPrimitiveArray<u64>>()
+ .unwrap(); // sanity checked
+
+ #[allow(clippy::unwrap_used)]
+ let counters = counters
+ .as_any()
+ .downcast_ref::<ArrowPrimitiveArray<u64>>()
+ .unwrap(); // sanity checked
+
+ (times, counters)
+ }
+
+ #[inline]
+ pub fn row_ids(&self) -> impl Iterator<Item = RowId> + '_ {
+ let (times, counters) = self.row_ids_raw();
+ izip!(times.values().as_slice(), counters.values().as_slice())
+ .map(|(&time, &counter)| RowId::from_u128((time as u128) << 64 | (counter as u128)))
+ }
+
+ /// Returns the [`RowId`]-range covered by this [`Chunk`].
+ ///
+ /// `None` if the chunk `is_empty`.
///
/// This is O(1) if the chunk is sorted, O(n) otherwise.
#[inline]
- pub fn row_id_range(&self) -> (RowId, RowId) {
- #[allow(clippy::unwrap_used)] // cannot create empty chunks
- if self.is_sorted() {
+ pub fn row_id_range(&self) -> Option<(RowId, RowId)> {
+ if self.is_empty() {
+ return None;
+ }
+
+ let (times, counters) = self.row_ids_raw();
+ let (times, counters) = (times.values().as_slice(), counters.values().as_slice());
+
+ #[allow(clippy::unwrap_used)] // checked above
+ let (index_min, index_max) = if self.is_sorted() {
(
- self.row_ids.first().copied().unwrap(),
- self.row_ids.last().copied().unwrap(),
+ (
+ times.first().copied().unwrap(),
+ counters.first().copied().unwrap(),
+ ),
+ (
+ times.last().copied().unwrap(),
+ counters.last().copied().unwrap(),
+ ),
)
} else {
(
- self.row_ids.iter().min().copied().unwrap(),
- self.row_ids.iter().max().copied().unwrap(),
+ (
+ times.iter().min().copied().unwrap(),
+ counters.iter().min().copied().unwrap(),
+ ),
+ (
+ times.iter().max().copied().unwrap(),
+ counters.iter().max().copied().unwrap(),
+ ),
)
- }
+ };
+
+ let (time_min, counter_min) = index_min;
+ let (time_max, counter_max) = index_max;
+
+ Some((
+ RowId::from_u128((time_min as u128) << 64 | (counter_min as u128)),
+ RowId::from_u128((time_max as u128) << 64 | (counter_max as u128)),
+ ))
+ }
+
+ #[inline]
+ pub fn is_static(&self) -> bool {
+ self.timelines.is_empty()
+ }
+
+ #[inline]
+ pub fn timelines(&self) -> &BTreeMap<Timeline, ChunkTimeline> {
+ &self.timelines
+ }
+
+ #[inline]
+ pub fn component_names(&self) -> impl Iterator<Item = ComponentName> + '_ {
+ self.components.keys().copied()
+ }
+
+ #[inline]
+ pub fn components(&self) -> &BTreeMap<ComponentName, ArrowListArray<i32>> {
+ &self.components
}
/// Computes the maximum value for each and every timeline present across this entire chunk,
@@ -349,7 +637,137 @@ impl std::fmt::Display for Chunk {
}
}
-// TODO(cmc): sizebytes impl + sizebytes caching + sizebytes in transport metadata
+impl ChunkTimeline {
+ #[inline]
+ pub fn time_range(&self) -> ResolvedTimeRange {
+ self.time_range
+ }
+
+ #[inline]
+ pub fn times(&self) -> impl DoubleEndedIterator<Item = TimeInt> + '_ {
+ self.times
+ .values()
+ .as_slice()
+ .iter()
+ .copied()
+ .map(TimeInt::new_temporal)
+ }
+
+ #[inline]
+ pub fn times_raw(&self) -> &[i64] {
+ self.times.values().as_slice()
+ }
+
+ #[inline]
+ pub fn num_rows(&self) -> usize {
+ self.times.len()
+ }
+
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.num_rows() == 0
+ }
+
+ /// Computes the time range covered by each individual component column.
+ ///
+ /// This is different from the time range covered by the [`ChunkTimeline`] as a whole
+ /// because component columns are potentially sparse.
+ ///
+ /// This is crucial for indexing and queries to work properly.
+ //
+ // TODO(cmc): This needs to be stored in chunk metadata and transported across IPC.
+ pub fn time_range_per_component(
+ &self,
+ components: &BTreeMap<ComponentName, ArrowListArray<i32>>,
+ ) -> BTreeMap<ComponentName, ResolvedTimeRange> {
+ let times = self.times_raw();
+ components
+ .iter()
+ .filter_map(|(&component_name, list_array)| {
+ if let Some(validity) = list_array.validity() {
+ // _Potentially_ sparse
+
+ if validity.is_empty() {
+ return None;
+ }
+
+ let is_dense = validity.unset_bits() == 0;
+ if is_dense {
+ return Some((component_name, self.time_range));
+ }
+
+ let mut time_min = TimeInt::MAX;
+ for (i, time) in times.iter().copied().enumerate() {
+ if validity.get(i).unwrap_or(false) {
+ time_min = TimeInt::new_temporal(time);
+ break;
+ }
+ }
+
+ let mut time_max = TimeInt::MIN;
+ for (i, time) in times.iter().copied().enumerate().rev() {
+ if validity.get(i).unwrap_or(false) {
+ time_max = TimeInt::new_temporal(time);
+ break;
+ }
+ }
+
+ Some((component_name, ResolvedTimeRange::new(time_min, time_max)))
+ } else {
+ // Dense
+
+ Some((component_name, self.time_range))
+ }
+ })
+ .collect()
+ }
+}
+
+impl re_types_core::SizeBytes for Chunk {
+ #[inline]
+ fn heap_size_bytes(&self) -> u64 {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ let mut size_bytes = heap_size_bytes.load(Ordering::Relaxed);
+
+ if size_bytes == 0 {
+ size_bytes = id.heap_size_bytes()
+ + entity_path.heap_size_bytes()
+ + is_sorted.heap_size_bytes()
+ + row_ids.heap_size_bytes()
+ + timelines.heap_size_bytes()
+ + components.heap_size_bytes();
+ heap_size_bytes.store(size_bytes, Ordering::Relaxed);
+ }
+
+ size_bytes
+ }
+}
+
+impl re_types_core::SizeBytes for ChunkTimeline {
+ #[inline]
+ fn heap_size_bytes(&self) -> u64 {
+ let Self {
+ timeline,
+ times,
+ is_sorted,
+ time_range,
+ } = self;
+
+ timeline.heap_size_bytes()
+ + times.heap_size_bytes() // cheap
+ + is_sorted.heap_size_bytes()
+ + time_range.heap_size_bytes()
+ }
+}
// TODO(cmc): methods to merge chunks (compaction).
@@ -365,27 +783,51 @@ impl Chunk {
let Self {
id: _,
entity_path: _,
+ heap_size_bytes,
is_sorted,
row_ids,
timelines,
components,
} = self;
- if row_ids.is_empty() || components.is_empty() {
- return Err(ChunkError::Empty);
+ #[allow(clippy::collapsible_if)] // readability
+ if cfg!(debug_assertions) {
+ let measured = self.heap_size_bytes();
+ let advertised = heap_size_bytes.load(Ordering::Relaxed);
+ if advertised != measured {
+ return Err(ChunkError::Malformed {
+ reason: format!(
+ "Chunk advertises a heap size of {} but we measure {} instead",
+ re_format::format_bytes(advertised as _),
+ re_format::format_bytes(measured as _),
+ ),
+ });
+ }
}
// Row IDs
- #[allow(clippy::collapsible_if)] // readability
- if cfg!(debug_assertions) {
- if *is_sorted != self.is_sorted_uncached() {
+ {
+ if *row_ids.data_type().to_logical_type() != RowId::arrow_datatype() {
return Err(ChunkError::Malformed {
reason: format!(
- "Chunk is marked as {}sorted but isn't: {row_ids:?}",
- if *is_sorted { "" } else { "un" },
+ "RowId data has the wrong datatype: expected {:?} but got {:?} instead",
+ RowId::arrow_datatype(),
+ *row_ids.data_type(),
),
});
}
+
+ #[allow(clippy::collapsible_if)] // readability
+ if cfg!(debug_assertions) {
+ if *is_sorted != self.is_sorted_uncached() {
+ return Err(ChunkError::Malformed {
+ reason: format!(
+ "Chunk is marked as {}sorted but isn't: {row_ids:?}",
+ if *is_sorted { "" } else { "un" },
+ ),
+ });
+ }
+ }
}
// Timelines
@@ -432,6 +874,18 @@ impl Chunk {
),
});
}
+
+ let validity_is_empty = list_array
+ .validity()
+ .map_or(false, |validity| validity.is_empty());
+ if !self.is_empty() && validity_is_empty {
+ return Err(ChunkError::Malformed {
+ reason: format!(
+ "All component batches in a chunk must contain at least one non-null entry.\
+ Found a completely empty column for {component_name}",
+ ),
+ });
+ }
}
Ok(())
@@ -444,11 +898,25 @@ impl ChunkTimeline {
/// Costly checks are only run in debug builds.
pub fn sanity_check(&self) -> ChunkResult<()> {
let Self {
+ timeline,
times,
is_sorted,
time_range,
} = self;
+ if *times.data_type() != timeline.datatype() {
+ return Err(ChunkError::Malformed {
+ reason: format!(
+ "Time data for timeline {} has the wrong datatype: expected {:?} but got {:?} instead",
+ timeline.name(),
+ timeline.datatype(),
+ *times.data_type(),
+ ),
+ });
+ }
+
+ let times = times.values().as_slice();
+
#[allow(clippy::collapsible_if)] // readability
if cfg!(debug_assertions) {
if *is_sorted != times.windows(2).all(|times| times[0] <= times[1]) {
@@ -463,26 +931,27 @@ impl ChunkTimeline {
#[allow(clippy::collapsible_if)] // readability
if cfg!(debug_assertions) {
- let is_tight_bound = times.iter().any(|&time| time == time_range.min())
- && times.iter().any(|&time| time == time_range.max());
- if !is_tight_bound {
+ let is_tight_lower_bound = times.iter().any(|&time| time == time_range.min().as_i64());
+ let is_tight_upper_bound = times.iter().any(|&time| time == time_range.max().as_i64());
+ let is_tight_bound = is_tight_lower_bound && is_tight_upper_bound;
+
+ if !self.is_empty() && !is_tight_bound {
return Err(ChunkError::Malformed {
reason: "Chunk timeline's cached time range isn't a tight bound.".to_owned(),
});
}
for &time in times {
- if time < time_range.min() || time > time_range.max() {
+ if time < time_range.min().as_i64() || time > time_range.max().as_i64() {
return Err(ChunkError::Malformed {
reason: format!(
"Chunk timeline's cached time range is wrong.\
- Found a time value of {} while its time range is {time_range:?}",
- time.as_i64(),
+ Found a time value of {time} while its time range is {time_range:?}",
),
});
}
- if time.is_static() {
+ if time == TimeInt::STATIC.as_i64() {
return Err(ChunkError::Malformed {
reason: "A chunk's timeline should never contain a static time value."
.to_owned(),
diff --git a/crates/re_chunk/src/id.rs b/crates/re_chunk/src/id.rs
new file mode 100644
index 000000000000..c35b08f34717
--- /dev/null
+++ b/crates/re_chunk/src/id.rs
@@ -0,0 +1,249 @@
+/// A unique ID for a [`crate::Chunk`].
+///
+/// `Chunk`s are the atomic unit of ingestion, transport, storage, events and GC in Rerun.
+///
+/// Internally, a [`crate::Chunk`] is made up of rows, which are themselves uniquely identified by
+/// their [`RowId`].
+///
+/// There is no relationship whatsoever between a [`ChunkId`] and the [`RowId`]s within that chunk.
+///
+/// ### Uniqueness
+///
+/// [`ChunkId`]s are assumed unique within a single Recording.
+///
+/// The chunk store will treat two chunks with the same [`ChunkId`] as the same, and only keep one
+/// of them (which one is kept is an arbitrary and unstable implementation detail).
+///
+/// This makes it easy to build and maintain secondary indices around [`RowId`]s with few to no
+/// extraneous state tracking.
+///
+/// ### Garbage collection
+///
+/// Garbage collection is handled at the chunk level by first ordering the chunks based on the minimum
+/// [`RowId`] present in each of them.
+/// Garbage collection therefore happens (roughly) in the logger's wall-clock order.
+///
+/// This has very important implications when inserting data far into the past or into the future:
+/// think carefully about your `RowId`s in these cases.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
+pub struct ChunkId(pub(crate) re_tuid::Tuid);
+
+impl std::fmt::Display for ChunkId {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ self.0.fmt(f)
+ }
+}
+
+impl ChunkId {
+ pub const ZERO: Self = Self(re_tuid::Tuid::ZERO);
+ pub const MAX: Self = Self(re_tuid::Tuid::MAX);
+
+ /// Create a new unique [`ChunkId`] based on the current time.
+ #[allow(clippy::new_without_default)]
+ #[inline]
+ pub fn new() -> Self {
+ Self(re_tuid::Tuid::new())
+ }
+
+ /// Returns the next logical [`ChunkId`].
+ ///
+ /// Beware: wrong usage can easily lead to conflicts.
+ /// Prefer [`ChunkId::new`] when unsure.
+ #[must_use]
+ #[inline]
+ pub fn next(&self) -> Self {
+ Self(self.0.next())
+ }
+
+ /// Returns the `n`-next logical [`ChunkId`].
+ ///
+ /// This is equivalent to calling [`ChunkId::next`] `n` times.
+ /// Wraps the monotonically increasing value back to zero on overflow.
+ ///
+ /// Beware: wrong usage can easily lead to conflicts.
+ /// Prefer [`ChunkId::new`] when unsure.
+ #[must_use]
+ #[inline]
+ pub fn incremented_by(&self, n: u64) -> Self {
+ Self(self.0.incremented_by(n))
+ }
+
+ /// When the `ChunkId` was created, in nanoseconds since unix epoch.
+ #[inline]
+ pub fn nanoseconds_since_epoch(&self) -> u64 {
+ self.0.nanoseconds_since_epoch()
+ }
+
+ #[inline]
+ pub fn from_u128(id: u128) -> Self {
+ Self(re_tuid::Tuid::from_u128(id))
+ }
+
+ #[inline]
+ pub fn as_u128(&self) -> u128 {
+ self.0.as_u128()
+ }
+}
+
+impl re_types_core::SizeBytes for ChunkId {
+ #[inline]
+ fn heap_size_bytes(&self) -> u64 {
+ 0
+ }
+
+ #[inline]
+ fn is_pod() -> bool {
+ true
+ }
+}
+
+impl std::ops::Deref for ChunkId {
+ type Target = re_tuid::Tuid;
+
+ #[inline]
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl std::ops::DerefMut for ChunkId {
+ #[inline]
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ &mut self.0
+ }
+}
+
+re_types_core::delegate_arrow_tuid!(ChunkId as "rerun.controls.ChunkId");
+
+// ---
+
+/// A unique ID for a row's worth of data within a chunk.
+///
+/// There is no relationship whatsoever between a [`ChunkId`] and the [`RowId`]s within that chunk.
+///
+/// ### Uniqueness
+///
+/// Duplicated [`RowId`]s within a single recording are considered undefined behavior.
+///
+/// While it is benign in most cases, care has to be taken when manually crafting [`RowId`]s.
+/// Ideally: don't do so and stick to [`RowId::new`] instead to avoid bad surprises.
+///
+/// This makes it easy to build and maintain secondary indices around [`RowId`]s with few to no
+/// extraneous state tracking.
+///
+/// ### Query
+///
+/// Queries (both latest-at & range semantics) will defer to `RowId` order as a tie-breaker when
+/// looking at several rows worth of data that rest at the exact same timestamp.
+///
+/// In pseudo-code:
+/// ```text
+/// rr.set_time_sequence("frame", 10)
+///
+/// rr.log("my_entity", point1, row_id=#1)
+/// rr.log("my_entity", point2, row_id=#0)
+///
+/// rr.query("my_entity", at=("frame", 10)) # returns `point1`
+/// ```
+///
+/// Think carefully about your `RowId`s when logging a lot of data at the same timestamp.
+///
+/// ### Garbage collection
+///
+/// Garbage collection is handled at the chunk level by first ordering the chunks based on the minimum
+/// [`RowId`] present in each of them.
+/// Garbage collection therefore happens (roughly) in the logger's wall-clock order.
+///
+/// This has very important implications when inserting data far into the past or into the future:
+/// think carefully about your `RowId`s in these cases.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
+pub struct RowId(pub(crate) re_tuid::Tuid);
+
+impl std::fmt::Display for RowId {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ self.0.fmt(f)
+ }
+}
+
+impl RowId {
+ pub const ZERO: Self = Self(re_tuid::Tuid::ZERO);
+ pub const MAX: Self = Self(re_tuid::Tuid::MAX);
+
+ /// Create a new unique [`RowId`] based on the current time.
+ #[allow(clippy::new_without_default)]
+ #[inline]
+ pub fn new() -> Self {
+ Self(re_tuid::Tuid::new())
+ }
+
+ /// Returns the next logical [`RowId`].
+ ///
+ /// Beware: wrong usage can easily lead to conflicts.
+ /// Prefer [`RowId::new`] when unsure.
+ #[must_use]
+ #[inline]
+ pub fn next(&self) -> Self {
+ Self(self.0.next())
+ }
+
+ /// Returns the `n`-next logical [`RowId`].
+ ///
+ /// This is equivalent to calling [`RowId::next`] `n` times.
+ /// Wraps the monotonically increasing value back to zero on overflow.
+ ///
+ /// Beware: wrong usage can easily lead to conflicts.
+ /// Prefer [`RowId::new`] when unsure.
+ #[must_use]
+ #[inline]
+ pub fn incremented_by(&self, n: u64) -> Self {
+ Self(self.0.incremented_by(n))
+ }
+
+ /// When the `RowId` was created, in nanoseconds since unix epoch.
+ #[inline]
+ pub fn nanoseconds_since_epoch(&self) -> u64 {
+ self.0.nanoseconds_since_epoch()
+ }
+
+ #[inline]
+ pub fn from_u128(id: u128) -> Self {
+ Self(re_tuid::Tuid::from_u128(id))
+ }
+
+ #[inline]
+ pub fn as_u128(&self) -> u128 {
+ self.0.as_u128()
+ }
+}
+
+impl re_types_core::SizeBytes for RowId {
+ #[inline]
+ fn heap_size_bytes(&self) -> u64 {
+ 0
+ }
+
+ #[inline]
+ fn is_pod() -> bool {
+ true
+ }
+}
+
+impl std::ops::Deref for RowId {
+ type Target = re_tuid::Tuid;
+
+ #[inline]
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl std::ops::DerefMut for RowId {
+ #[inline]
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ &mut self.0
+ }
+}
+
+re_types_core::delegate_arrow_tuid!(RowId as "rerun.controls.RowId");
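For illustration only, and not part of the patch: a minimal sketch of the ordering guarantees the `RowId`/`ChunkId` docs above lean on, assuming IDs are minted within a single process (where `Tuid`s are monotonically increasing).

```rust
use re_chunk::RowId;

fn main() {
    // Two RowIds minted back-to-back compare in creation order -- the property
    // that latest-at tie-breaking and GC ordering (documented above) rely on.
    let a = RowId::new();
    let b = RowId::new();
    assert!(a < b);

    // `incremented_by(n)` is documented as n applications of `next()`.
    assert_eq!(a.next().next(), a.incremented_by(2));
}
```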
diff --git a/crates/re_chunk/src/iter.rs b/crates/re_chunk/src/iter.rs
new file mode 100644
index 000000000000..6c0345fec79a
--- /dev/null
+++ b/crates/re_chunk/src/iter.rs
@@ -0,0 +1,55 @@
+use arrow2::array::Array as ArrowArray;
+
+use re_log_types::{TimeInt, Timeline};
+use re_types_core::ComponentName;
+
+use crate::{Chunk, RowId};
+
+// ---
+
+impl Chunk {
+ /// Returns an iterator over the rows of the [`Chunk`].
+ ///
+ /// Each yielded item is a component batch with its associated index ([`RowId`] + data time).
+ ///
+ /// Iterating a [`Chunk`] on a row basis is very wasteful, performance-wise.
+ /// Prefer columnar access when possible.
+ //
+ // TODO(cmc): a row-based iterator is obviously not what we want -- one of the benefits of
+ // chunks is to amortize the cost of downcasting & "deserialization".
+ // But at the moment we still need to run with the native deserialization cache, which expects
+ // row-based data.
+ // As soon as we remove the native cache and start exposing `Chunk`s directly to downstream
+ // systems, we will look into ergonomic ways to do columnar access.
+ pub fn iter_rows(
+ &self,
+ timeline: &Timeline,
+ component_name: &ComponentName,
+ ) -> impl Iterator<Item = (TimeInt, RowId, Option<Box<dyn ArrowArray>>)> + '_ {
+ let Self {
+ id: _,
+ entity_path: _,
+ heap_size_bytes: _,
+ is_sorted: _,
+ row_ids: _,
+ timelines,
+ components,
+ } = self;
+
+ let row_ids = self.row_ids();
+
+ let data_times = timelines
+ .get(timeline)
+ .into_iter()
+ .flat_map(|time_chunk| time_chunk.times().collect::<Vec<_>>())
+ // If there's no time data, then the associated data time must be `TimeInt::STATIC`.
+ .chain(std::iter::repeat(TimeInt::STATIC));
+
+ let arrays = components
+ .get(component_name)
+ .into_iter()
+ .flat_map(|list_array| list_array.into_iter());
+
+ itertools::izip!(data_times, row_ids, arrays)
+ }
+}
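A hedged usage sketch (not part of the patch) of the row iterator above, assuming the `frame_nr` timeline and the `MyPoint` example component used in this crate's tests:

```rust
use re_chunk::{ArrowArray as _, Chunk, Timeline};
use re_log_types::example_components::MyPoint;
use re_types_core::Loggable as _;

// Walk a chunk row by row for one (timeline, component) pair. As the TODO in
// `iter_rows` notes, this is the slow path; prefer columnar access when possible.
fn print_points(chunk: &Chunk) {
    let frame_nr = Timeline::new_sequence("frame_nr");
    for (data_time, row_id, batch) in chunk.iter_rows(&frame_nr, &MyPoint::name()) {
        // `batch` is `None` wherever the component column is null (sparse data).
        println!("t={data_time:?} row={row_id} len={:?}", batch.map(|b| b.len()));
    }
}
```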
diff --git a/crates/re_chunk/src/latest_at.rs b/crates/re_chunk/src/latest_at.rs
new file mode 100644
index 000000000000..3940131bb5c8
--- /dev/null
+++ b/crates/re_chunk/src/latest_at.rs
@@ -0,0 +1,170 @@
+use arrow2::array::Array as ArrowArray;
+
+use re_log_types::{TimeInt, Timeline};
+use re_types_core::ComponentName;
+
+use crate::{Chunk, RowId};
+
+// ---
+
+/// A query at a given time, for a given timeline.
+///
+/// Get the latest version of the data available at this time.
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct LatestAtQuery {
+ timeline: Timeline,
+ at: TimeInt,
+}
+
+impl std::fmt::Debug for LatestAtQuery {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.write_fmt(format_args!(
+ "",
+ self.timeline.typ().format_utc(self.at),
+ self.timeline.name(),
+ ))
+ }
+}
+
+impl LatestAtQuery {
+ /// The returned query is guaranteed to never include [`TimeInt::STATIC`].
+ #[inline]
+ pub fn new(timeline: Timeline, at: impl TryInto<TimeInt>) -> Self {
+ let at = at.try_into().unwrap_or(TimeInt::MIN);
+ Self { timeline, at }
+ }
+
+ #[inline]
+ pub const fn latest(timeline: Timeline) -> Self {
+ Self {
+ timeline,
+ at: TimeInt::MAX,
+ }
+ }
+
+ #[inline]
+ pub fn timeline(&self) -> Timeline {
+ self.timeline
+ }
+
+ #[inline]
+ pub fn at(&self) -> TimeInt {
+ self.at
+ }
+}
+
+// ---
+
+impl Chunk {
+ /// Runs a [`LatestAtQuery`] filter on a [`Chunk`].
+ ///
+ /// This behaves as a row-based filter: the result is a new [`Chunk`] that is vertically
+ /// sliced to only contain the row relevant for the specified `query`.
+ ///
+ /// The resulting [`Chunk`] is guaranteed to contain all the same columns as the queried
+ /// chunk: there is no horizontal slicing going on.
+ ///
+ /// An empty [`Chunk`] (i.e. 0 rows, but N columns) is returned if the `query` yields nothing.
+ ///
+ /// Because the resulting chunk doesn't discard any column information, you can find extra relevant
+ /// information by inspecting the data, for example timestamps on other timelines.
+ /// See [`Self::timeline_sliced`] and [`Self::component_sliced`] if you do want to filter this
+ /// extra data.
+ pub fn latest_at(&self, query: &LatestAtQuery, component_name: ComponentName) -> Self {
+ if self.is_empty() {
+ return self.clone();
+ }
+
+ re_tracing::profile_function!(format!("{query:?}"));
+
+ let Some(component_list_array) = self.components.get(&component_name) else {
+ return self.emptied();
+ };
+
+ let mut index = None;
+
+ let is_static = self.is_static();
+ let is_sorted_by_row_id = self.is_sorted();
+
+ if is_static {
+ if is_sorted_by_row_id {
+ // Static, row-sorted chunk
+
+ for i in (0..self.num_rows()).rev() {
+ if !component_list_array.is_valid(i) {
+ continue;
+ }
+
+ index = Some(i);
+ break;
+ }
+ } else {
+ // Static, row-unsorted chunk
+
+ let mut closest_row_id = RowId::ZERO;
+
+ for (i, row_id) in self.row_ids().enumerate() {
+ if !component_list_array.is_valid(i) {
+ continue;
+ }
+
+ let is_closer_row_id = row_id > closest_row_id;
+
+ if is_closer_row_id {
+ closest_row_id = row_id;
+ index = Some(i);
+ }
+ }
+ }
+ } else {
+ let Some(time_chunk) = self.timelines.get(&query.timeline()) else {
+ return self.emptied();
+ };
+
+ let is_sorted_by_time = time_chunk.is_sorted();
+ let times = time_chunk.times_raw();
+
+ if is_sorted_by_time {
+ // Temporal, row-sorted, time-sorted chunk
+
+ let i = times
+ .partition_point(|&time| time <= query.at().as_i64())
+ .saturating_sub(1);
+
+ for i in (0..=i).rev() {
+ if !component_list_array.is_valid(i) {
+ continue;
+ }
+
+ index = Some(i);
+ break;
+ }
+ } else {
+ // Temporal, unsorted chunk
+
+ let mut closest_data_time = TimeInt::MIN;
+ let mut closest_row_id = RowId::ZERO;
+
+ for (i, row_id) in self.row_ids().enumerate() {
+ if !component_list_array.is_valid(i) {
+ continue;
+ }
+
+ let data_time = TimeInt::new_temporal(times[i]);
+
+ let is_closer_time = data_time > closest_data_time && data_time <= query.at();
+ let is_same_time_but_closer_row_id =
+ data_time == closest_data_time && row_id > closest_row_id;
+
+ if is_closer_time || is_same_time_but_closer_row_id {
+ closest_data_time = data_time;
+ closest_row_id = row_id;
+ index = Some(i);
+ }
+ }
+ }
+ }
+
+ index.map_or_else(|| self.emptied(), |i| self.row_sliced(i, 1))
+ }
+}
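A minimal sketch (not part of the patch) of how the new latest-at filter might be driven, again assuming the `frame_nr` timeline and the `MyPoint` example component from the tests:

```rust
use re_chunk::{Chunk, LatestAtQuery, Timeline};
use re_log_types::example_components::MyPoint;
use re_types_core::Loggable as _;

// Vertically slice `chunk` down to the single most recent MyPoint row at or
// before frame 43; the result keeps every column but holds at most one row.
fn latest_points(chunk: &Chunk) -> Chunk {
    let frame_nr = Timeline::new_sequence("frame_nr");
    let query = LatestAtQuery::new(frame_nr, 43);
    chunk.latest_at(&query, MyPoint::name())
}
```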
diff --git a/crates/re_chunk/src/lib.rs b/crates/re_chunk/src/lib.rs
index 85893096d22e..2f0719956c11 100644
--- a/crates/re_chunk/src/lib.rs
+++ b/crates/re_chunk/src/lib.rs
@@ -4,23 +4,46 @@
#![doc = document_features::document_features!()]
//!
+mod builder;
mod chunk;
+mod id;
+mod iter;
+mod latest_at;
+mod range;
mod shuffle;
+mod slice;
mod transport;
-mod util;
+pub mod util;
#[cfg(not(target_arch = "wasm32"))]
mod batcher;
-pub use self::chunk::{Chunk, ChunkError, ChunkId, ChunkResult, ChunkTimeline};
+pub use self::builder::{ChunkBuilder, ChunkTimelineBuilder};
+pub use self::chunk::{Chunk, ChunkError, ChunkResult, ChunkTimeline};
+pub use self::id::{ChunkId, RowId};
+pub use self::latest_at::LatestAtQuery;
+pub use self::range::RangeQuery;
pub use self::transport::TransportChunk;
-pub use self::util::arrays_to_list_array;
#[cfg(not(target_arch = "wasm32"))]
pub use self::batcher::{
ChunkBatcher, ChunkBatcherConfig, ChunkBatcherError, ChunkBatcherResult, PendingRow,
};
+// Re-exports
+
+#[doc(no_inline)]
+pub use arrow2::array::Array as ArrowArray;
+#[doc(no_inline)]
+pub use re_log_types::{EntityPath, TimeInt, TimePoint, Timeline, TimelineName};
+#[doc(no_inline)]
+pub use re_types_core::ComponentName;
+
pub mod external {
pub use arrow2;
+
+ pub use re_log_types;
+
+ #[cfg(not(target_arch = "wasm32"))]
+ pub use crossbeam;
}
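For context, a hedged sketch of the row-oriented `ChunkBuilder` exported above, mirroring the test code further down; the entity path, timeline, and values are illustrative only:

```rust
use re_chunk::{Chunk, ChunkResult, RowId, Timeline};
use re_log_types::example_components::MyPoint;
use re_types_core::Loggable as _;

// Assemble a two-row chunk row by row using the builder exported above.
fn build_points_chunk() -> ChunkResult<Chunk> {
    let frame_nr = Timeline::new_sequence("frame_nr");
    let points1 = vec![MyPoint::new(1.0, 2.0)];
    let points2 = vec![MyPoint::new(3.0, 4.0), MyPoint::new(5.0, 6.0)];
    Chunk::builder("points".into())
        .with_sparse_component_batches(
            RowId::new(),
            [(frame_nr, 42)],
            [(MyPoint::name(), Some(&points1 as _))],
        )
        .with_sparse_component_batches(
            RowId::new(),
            [(frame_nr, 43)],
            [(MyPoint::name(), Some(&points2 as _))],
        )
        .build()
}
```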
diff --git a/crates/re_chunk/src/range.rs b/crates/re_chunk/src/range.rs
new file mode 100644
index 000000000000..abe485e15178
--- /dev/null
+++ b/crates/re_chunk/src/range.rs
@@ -0,0 +1,128 @@
+use re_log_types::{ResolvedTimeRange, TimeInt, Timeline};
+use re_types_core::ComponentName;
+
+use crate::Chunk;
+
+// --- Range ---
+
+/// A query over a time range, for a given timeline.
+///
+/// Get all the data within this time interval, plus the latest one before the start of the
+/// interval.
+///
+/// Motivation: all data is considered alive until the next logging to the same component path.
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct RangeQuery {
+ pub timeline: Timeline,
+ pub range: ResolvedTimeRange,
+}
+
+impl std::fmt::Debug for RangeQuery {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.write_fmt(format_args!(
+ "<ranging {}..={} on {:?}>",
+ self.timeline.typ().format_utc(self.range.min()),
+ self.timeline.typ().format_utc(self.range.max()),
+ self.timeline.name(),
+ ))
+ }
+}
+
+impl RangeQuery {
+ #[inline]
+ pub const fn new(timeline: Timeline, range: ResolvedTimeRange) -> Self {
+ Self { timeline, range }
+ }
+
+ #[inline]
+ pub const fn everything(timeline: Timeline) -> Self {
+ Self {
+ timeline,
+ range: ResolvedTimeRange::EVERYTHING,
+ }
+ }
+
+ #[inline]
+ pub fn timeline(&self) -> Timeline {
+ self.timeline
+ }
+
+ #[inline]
+ pub fn range(&self) -> ResolvedTimeRange {
+ self.range
+ }
+}
+
+// ---
+
+impl Chunk {
+ /// Runs a [`RangeQuery`] filter on a [`Chunk`].
+ ///
+ /// This behaves as a row-based filter: the result is a new [`Chunk`] that is vertically
+ /// sliced, sorted and filtered in order to only contain the row(s) relevant for the
+ /// specified `query`.
+ ///
+ /// The resulting [`Chunk`] is guaranteed to contain all the same columns as the queried
+ /// chunk: there is no horizontal slicing going on.
+ ///
+ /// An empty [`Chunk`] (i.e. 0 rows, but N columns) is returned if the `query` yields nothing.
+ ///
+ /// Because the resulting chunk doesn't discard any column information, you can find extra relevant
+ /// information by inspecting the data, for example timestamps on other timelines.
+ /// See [`Self::timeline_sliced`] and [`Self::component_sliced`] if you do want to filter this
+ /// extra data.
+ //
+ // TODO(#3741): Since we don't have access to arrow's ListView yet, we must actually clone the
+ // data if the chunk requires sorting.
+ pub fn range(&self, query: &RangeQuery, component_name: ComponentName) -> Self {
+ if self.is_empty() {
+ return self.clone();
+ }
+
+ re_tracing::profile_function!(format!("{query:?}"));
+
+ let is_static = self.is_static();
+
+ if is_static {
+ // NOTE: A given component for a given entity can only have one static entry associated
+ // with it, and this entry overrides everything else, which means it is functionally
+ // equivalent to just running a latest-at query.
+ self.latest_at(
+ &crate::LatestAtQuery::new(query.timeline(), TimeInt::MAX),
+ component_name,
+ )
+ } else {
+ let Some(is_sorted_by_time) = self
+ .timelines
+ .get(&query.timeline())
+ .map(|time_chunk| time_chunk.is_sorted())
+ else {
+ return self.emptied();
+ };
+
+ let chunk = self.densified(component_name);
+
+ let chunk = if is_sorted_by_time {
+ // Temporal, row-sorted, time-sorted chunk
+ chunk
+ } else {
+ // Temporal, unsorted chunk
+ chunk.sorted_by_timeline_if_unsorted(&query.timeline())
+ };
+
+ let Some(times) = chunk
+ .timelines
+ .get(&query.timeline())
+ .map(|time_chunk| time_chunk.times_raw())
+ else {
+ return chunk.emptied();
+ };
+
+ let start_index = times.partition_point(|&time| time < query.range().min().as_i64());
+ let end_index = times.partition_point(|&time| time <= query.range().max().as_i64());
+
+ chunk.row_sliced(start_index, end_index.saturating_sub(start_index))
+ }
+ }
+}
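And a matching sketch (not part of the patch) for the range path; the timeline and bounds are illustrative, and the constructors used are the ones shown in this file and in `re_log_types`:

```rust
use re_chunk::{Chunk, RangeQuery, TimeInt, Timeline};
use re_log_types::{example_components::MyPoint, ResolvedTimeRange};
use re_types_core::Loggable as _;

// Keep only the rows whose frame number falls within 10..=20 and where MyPoint
// is actually present (the range path densifies on the queried component).
fn points_in_range(chunk: &Chunk) -> Chunk {
    let frame_nr = Timeline::new_sequence("frame_nr");
    let range = ResolvedTimeRange::new(TimeInt::new_temporal(10), TimeInt::new_temporal(20));
    let query = RangeQuery::new(frame_nr, range);
    chunk.range(&query, MyPoint::name())
}
```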
diff --git a/crates/re_chunk/src/shuffle.rs b/crates/re_chunk/src/shuffle.rs
index 7683fabf398a..636861b44d91 100644
--- a/crates/re_chunk/src/shuffle.rs
+++ b/crates/re_chunk/src/shuffle.rs
@@ -1,15 +1,19 @@
use arrow2::{
- array::{Array as ArrowArray, ListArray as ArrowListArray},
+ array::{
+ Array as ArrowArray, ListArray as ArrowListArray, PrimitiveArray as ArrowPrimitiveArray,
+ StructArray,
+ },
offset::Offsets as ArrowOffsets,
};
use itertools::Itertools as _;
+use re_log_types::Timeline;
use crate::{Chunk, ChunkTimeline};
// ---
impl Chunk {
- /// Is the chunk currently ascendingly sorted by [`re_log_types::RowId`]?
+ /// Is the chunk currently ascendingly sorted by [`crate::RowId`]?
///
/// This is O(1) (cached).
///
@@ -29,9 +33,9 @@ impl Chunk {
pub fn is_sorted_uncached(&self) -> bool {
re_tracing::profile_function!();
- self.row_ids
- .windows(2)
- .all(|row_ids| row_ids[0] <= row_ids[1])
+ self.row_ids()
+ .tuple_windows::<(_, _)>()
+ .all(|row_ids| row_ids.0 <= row_ids.1)
}
/// Sort the chunk, if needed.
@@ -45,17 +49,20 @@ impl Chunk {
re_tracing::profile_function!();
+ #[cfg(not(target_arch = "wasm32"))]
let now = std::time::Instant::now();
let swaps = {
re_tracing::profile_scope!("swaps");
- let mut swaps = (0..self.row_ids.len()).collect::<Vec<_>>();
- swaps.sort_by_key(|&i| self.row_ids[i]);
+ let row_ids = self.row_ids().collect_vec();
+ let mut swaps = (0..row_ids.len()).collect::<Vec<_>>();
+ swaps.sort_by_key(|&i| row_ids[i]);
swaps
};
self.shuffle_with(&swaps);
+ #[cfg(not(target_arch = "wasm32"))]
re_log::trace!(
entity_path = %self.entity_path,
num_rows = self.row_ids.len(),
@@ -68,6 +75,49 @@ impl Chunk {
self.sanity_check().unwrap();
}
+ /// Returns a new [`Chunk`] that is sorted by `(<timeline>, RowId)`.
+ ///
+ /// The underlying arrow data will be copied and shuffled in memory in order to make it contiguous.
+ ///
+ /// This is a no-op if the underlying timeline is already sorted appropriately (happy path).
+ pub fn sorted_by_timeline_if_unsorted(&self, timeline: &Timeline) -> Self {
+ re_tracing::profile_function!();
+
+ let mut chunk = self.clone();
+
+ let Some(time_chunk) = chunk.timelines.get(timeline) else {
+ return chunk;
+ };
+
+ #[cfg(not(target_arch = "wasm32"))]
+ let now = std::time::Instant::now();
+
+ let swaps = {
+ re_tracing::profile_scope!("swaps");
+ let row_ids = chunk.row_ids().collect_vec();
+ let times = time_chunk.times_raw().to_vec();
+ let mut swaps = (0..times.len()).collect::<Vec<_>>();
+ swaps.sort_by_key(|&i| (times[i], row_ids[i]));
+ swaps
+ };
+
+ chunk.shuffle_with(&swaps);
+
+ #[cfg(not(target_arch = "wasm32"))]
+ re_log::trace!(
+ entity_path = %chunk.entity_path,
+ num_rows = chunk.row_ids.len(),
+ elapsed = ?now.elapsed(),
+ "chunk sorted",
+ );
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // dev only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
/// Randomly shuffles the chunk using the given `seed`.
///
/// The underlying arrow data will be copied and shuffled in memory in order to make it contiguous.
@@ -75,6 +125,7 @@ impl Chunk {
pub fn shuffle_random(&mut self, seed: u64) {
re_tracing::profile_function!();
+ #[cfg(not(target_arch = "wasm32"))]
let now = std::time::Instant::now();
use rand::{seq::SliceRandom as _, SeedableRng as _};
@@ -89,6 +140,7 @@ impl Chunk {
self.shuffle_with(&swaps);
+ #[cfg(not(target_arch = "wasm32"))]
re_log::trace!(
entity_path = %self.entity_path,
num_rows = self.row_ids.len(),
@@ -108,42 +160,59 @@ impl Chunk {
pub(crate) fn shuffle_with(&mut self, swaps: &[usize]) {
re_tracing::profile_function!();
- let Self {
- id: _,
- entity_path: _,
- is_sorted: _,
- row_ids,
- timelines,
- components,
- } = self;
-
// Row IDs
{
re_tracing::profile_scope!("row ids");
- let original = row_ids.clone();
+ let (times, counters) = self.row_ids_raw();
+ let (times, counters) = (times.values(), counters.values());
+
+ let mut sorted_times = times.to_vec();
+ let mut sorted_counters = counters.to_vec();
for (to, from) in swaps.iter().copied().enumerate() {
- row_ids[to] = original[from];
+ sorted_times[to] = times[from];
+ sorted_counters[to] = counters[from];
}
+
+ let times = ArrowPrimitiveArray::<u64>::from_vec(sorted_times).boxed();
+ let counters = ArrowPrimitiveArray::<u64>::from_vec(sorted_counters).boxed();
+
+ self.row_ids = StructArray::new(
+ self.row_ids.data_type().clone(),
+ vec![times, counters],
+ None,
+ );
}
+ let Self {
+ id: _,
+ entity_path: _,
+ heap_size_bytes: _,
+ is_sorted: _,
+ row_ids: _,
+ timelines,
+ components,
+ } = self;
+
// Timelines
{
re_tracing::profile_scope!("timelines");
for info in timelines.values_mut() {
let ChunkTimeline {
+ timeline,
times,
is_sorted,
time_range: _,
} = info;
- let original = times.clone();
+ let mut sorted = times.values().to_vec();
for (to, from) in swaps.iter().copied().enumerate() {
- times[to] = original[from];
+ sorted[to] = times.values()[from];
}
- *is_sorted = times.windows(2).all(|times| times[0] <= times[1]);
+ *is_sorted = sorted.windows(2).all(|times| times[0] <= times[1]);
+ *times = ArrowPrimitiveArray::<i64>::from_vec(sorted).to(timeline.datatype());
}
}
@@ -153,16 +222,10 @@ impl Chunk {
re_tracing::profile_scope!("components (offsets & data)");
{
for original in components.values_mut() {
- #[allow(clippy::unwrap_used)] // a chunk's column is always a list array
- let original_list = original
- .as_any()
- .downcast_ref::<ArrowListArray<i32>>()
- .unwrap();
-
let sorted_arrays = swaps
.iter()
.copied()
- .map(|from| original_list.value(from))
+ .map(|from| original.value(from))
.collect_vec();
let sorted_arrays = sorted_arrays
.iter()
@@ -176,12 +239,11 @@ impl Chunk {
.unwrap();
#[allow(clippy::unwrap_used)] // these are slices of the same outer array
let values = arrow2::compute::concatenate::concatenate(&sorted_arrays).unwrap();
- let validity = original_list
+ let validity = original
.validity()
.map(|validity| swaps.iter().map(|&from| validity.get_bit(from)).collect());
- *original =
- ArrowListArray::<i32>::new(datatype, offsets.into(), values, validity).boxed();
+ *original = ArrowListArray::<i32>::new(datatype, offsets.into(), values, validity);
}
}
@@ -209,7 +271,9 @@ impl ChunkTimeline {
#[inline]
pub fn is_sorted_uncached(&self) -> bool {
re_tracing::profile_function!();
- self.times.windows(2).all(|times| times[0] <= times[1])
+ self.times_raw()
+ .windows(2)
+ .all(|times| times[0] <= times[1])
}
}
@@ -217,11 +281,11 @@ impl ChunkTimeline {
mod tests {
use re_log_types::{
example_components::{MyColor, MyPoint},
- EntityPath, RowId, TimeInt, Timeline,
+ EntityPath, Timeline,
};
use re_types_core::Loggable as _;
- use crate::{arrays_to_list_array, ChunkId};
+ use crate::{ChunkId, RowId};
use super::*;
@@ -230,72 +294,62 @@ mod tests {
let entity_path: EntityPath = "a/b/c".into();
let timeline1 = Timeline::new_temporal("log_time");
- let timeline2 = Timeline::new_temporal("frame_nr");
+ let timeline2 = Timeline::new_sequence("frame_nr");
- let points1 = MyPoint::to_arrow([
+ let points1 = vec![
MyPoint::new(1.0, 2.0),
MyPoint::new(3.0, 4.0),
MyPoint::new(5.0, 6.0),
- ])?;
- let points2 = None;
- let points3 = MyPoint::to_arrow([MyPoint::new(10.0, 20.0)])?;
- let points4 = MyPoint::to_arrow([MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)])?;
+ ];
+ let points3 = vec![MyPoint::new(10.0, 20.0)];
+ let points4 = vec![MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)];
- let colors1 = MyColor::to_arrow([
+ let colors1 = vec![
MyColor::from_rgb(1, 2, 3),
MyColor::from_rgb(4, 5, 6),
MyColor::from_rgb(7, 8, 9),
- ])?;
- let colors2 = MyColor::to_arrow([MyColor::from_rgb(10, 20, 30)])?;
- let colors3 = None;
- let colors4 = MyColor::to_arrow([
+ ];
+ let colors2 = vec![MyColor::from_rgb(10, 20, 30)];
+ let colors4 = vec![
MyColor::from_rgb(101, 102, 103),
MyColor::from_rgb(104, 105, 106),
- ])?;
-
- let timelines = [
- (
- timeline1,
- ChunkTimeline::new(
- Some(true),
- [1000, 1001, 1002, 1003].map(TimeInt::new_temporal).to_vec(),
- )
- .unwrap(),
- ),
- (
- timeline2,
- ChunkTimeline::new(
- Some(true),
- [42, 43, 44, 45].map(TimeInt::new_temporal).to_vec(),
- )
- .unwrap(),
- ),
];
- let components = [
- (
- MyPoint::name(),
- arrays_to_list_array(&[Some(&*points1), points2, Some(&*points3), Some(&*points4)])
- .unwrap(),
- ),
- (
- MyPoint::name(),
- arrays_to_list_array(&[Some(&*colors1), Some(&*colors2), colors3, Some(&*colors4)])
- .unwrap(),
- ),
- ];
-
- let row_ids = vec![RowId::new(), RowId::new(), RowId::new(), RowId::new()];
-
{
- let chunk_sorted = Chunk::new(
- ChunkId::new(),
- entity_path.clone(),
- Some(true),
- row_ids.clone(),
- timelines.clone().into_iter().collect(),
- components.clone().into_iter().collect(),
- )?;
+ let chunk_sorted = Chunk::builder(entity_path.clone())
+ .with_sparse_component_batches(
+ RowId::new(),
+ [(timeline1, 1000), (timeline2, 42)],
+ [
+ (MyPoint::name(), Some(&points1 as _)),
+ (MyColor::name(), Some(&colors1 as _)),
+ ],
+ )
+ .with_sparse_component_batches(
+ RowId::new(),
+ [(timeline1, 1001), (timeline2, 43)],
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(&colors2 as _)),
+ ],
+ )
+ .with_sparse_component_batches(
+ RowId::new(),
+ [(timeline1, 1002), (timeline2, 44)],
+ [
+ (MyPoint::name(), Some(&points3 as _)),
+ (MyColor::name(), None),
+ ],
+ )
+ .with_sparse_component_batches(
+ RowId::new(),
+ [(timeline1, 1003), (timeline2, 45)],
+ [
+ (MyPoint::name(), Some(&points4 as _)),
+ (MyColor::name(), Some(&colors4 as _)),
+ ],
+ )
+ .build()?;
eprintln!("{chunk_sorted}");
@@ -329,4 +383,183 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn sort_time() -> anyhow::Result<()> {
+ let entity_path: EntityPath = "a/b/c".into();
+
+ let timeline1 = Timeline::new_temporal("log_time");
+ let timeline2 = Timeline::new_sequence("frame_nr");
+
+ let chunk_id = ChunkId::new();
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+ let row_id4 = RowId::new();
+
+ let points1 = vec![
+ MyPoint::new(1.0, 2.0),
+ MyPoint::new(3.0, 4.0),
+ MyPoint::new(5.0, 6.0),
+ ];
+ let points3 = vec![MyPoint::new(10.0, 20.0)];
+ let points4 = vec![MyPoint::new(100.0, 200.0), MyPoint::new(300.0, 400.0)];
+
+ let colors1 = vec![
+ MyColor::from_rgb(1, 2, 3),
+ MyColor::from_rgb(4, 5, 6),
+ MyColor::from_rgb(7, 8, 9),
+ ];
+ let colors2 = vec![MyColor::from_rgb(10, 20, 30)];
+ let colors4 = vec![
+ MyColor::from_rgb(101, 102, 103),
+ MyColor::from_rgb(104, 105, 106),
+ ];
+
+ {
+ let chunk_unsorted_timeline2 = Chunk::builder_with_id(chunk_id, entity_path.clone())
+ .with_sparse_component_batches(
+ row_id1,
+ [(timeline1, 1000), (timeline2, 45)],
+ [
+ (MyPoint::name(), Some(&points1 as _)),
+ (MyColor::name(), Some(&colors1 as _)),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id2,
+ [(timeline1, 1001), (timeline2, 44)],
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(&colors2 as _)),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id3,
+ [(timeline1, 1002), (timeline2, 43)],
+ [
+ (MyPoint::name(), Some(&points3 as _)),
+ (MyColor::name(), None),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id4,
+ [(timeline1, 1003), (timeline2, 42)],
+ [
+ (MyPoint::name(), Some(&points4 as _)),
+ (MyColor::name(), Some(&colors4 as _)),
+ ],
+ )
+ .build()?;
+
+ eprintln!("unsorted:\n{chunk_unsorted_timeline2}");
+
+ assert!(chunk_unsorted_timeline2.is_sorted());
+ assert!(chunk_unsorted_timeline2.is_sorted_uncached());
+
+ assert!(chunk_unsorted_timeline2
+ .timelines()
+ .get(&timeline1)
+ .unwrap()
+ .is_sorted());
+ assert!(chunk_unsorted_timeline2
+ .timelines()
+ .get(&timeline1)
+ .unwrap()
+ .is_sorted_uncached());
+
+ assert!(!chunk_unsorted_timeline2
+ .timelines()
+ .get(&timeline2)
+ .unwrap()
+ .is_sorted());
+ assert!(!chunk_unsorted_timeline2
+ .timelines()
+ .get(&timeline2)
+ .unwrap()
+ .is_sorted_uncached());
+
+ let chunk_sorted_timeline2 =
+ chunk_unsorted_timeline2.sorted_by_timeline_if_unsorted(&timeline2);
+
+ eprintln!("sorted:\n{chunk_sorted_timeline2}");
+
+ assert!(!chunk_sorted_timeline2.is_sorted());
+ assert!(!chunk_sorted_timeline2.is_sorted_uncached());
+
+ assert!(!chunk_sorted_timeline2
+ .timelines()
+ .get(&timeline1)
+ .unwrap()
+ .is_sorted());
+ assert!(!chunk_sorted_timeline2
+ .timelines()
+ .get(&timeline1)
+ .unwrap()
+ .is_sorted_uncached());
+
+ assert!(chunk_sorted_timeline2
+ .timelines()
+ .get(&timeline2)
+ .unwrap()
+ .is_sorted());
+ assert!(chunk_sorted_timeline2
+ .timelines()
+ .get(&timeline2)
+ .unwrap()
+ .is_sorted_uncached());
+
+ let chunk_sorted_timeline2_expected =
+ Chunk::builder_with_id(chunk_id, entity_path.clone())
+ .with_sparse_component_batches(
+ row_id4,
+ [(timeline1, 1003), (timeline2, 42)],
+ [
+ (MyPoint::name(), Some(&points4 as _)),
+ (MyColor::name(), Some(&colors4 as _)),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id3,
+ [(timeline1, 1002), (timeline2, 43)],
+ [
+ (MyPoint::name(), Some(&points3 as _)),
+ (MyColor::name(), None),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id2,
+ [(timeline1, 1001), (timeline2, 44)],
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(&colors2 as _)),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id1,
+ [(timeline1, 1000), (timeline2, 45)],
+ [
+ (MyPoint::name(), Some(&points1 as _)),
+ (MyColor::name(), Some(&colors1 as _)),
+ ],
+ )
+ .build()?;
+
+ eprintln!("expected:\n{chunk_sorted_timeline2}");
+
+ assert_eq!(
+ chunk_sorted_timeline2_expected,
+ chunk_sorted_timeline2,
+ "{}",
+ similar_asserts::SimpleDiff::from_str(
+ &format!("{chunk_sorted_timeline2_expected}"),
+ &format!("{chunk_sorted_timeline2}"),
+ "got",
+ "expected",
+ ),
+ );
+ }
+
+ Ok(())
+ }
}
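A short sketch (not part of the patch) of the new time-sorting helper exercised by the test above; the timeline name is illustrative:

```rust
use re_chunk::{Chunk, Timeline};

// Ensure `(frame_nr, RowId)` ordering before a binary-search style scan, as
// `Chunk::range` does; this is effectively a no-op when already sorted.
fn time_sorted(chunk: &Chunk) -> Chunk {
    let frame_nr = Timeline::new_sequence("frame_nr");
    chunk.sorted_by_timeline_if_unsorted(&frame_nr)
}
```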
diff --git a/crates/re_chunk/src/slice.rs b/crates/re_chunk/src/slice.rs
new file mode 100644
index 000000000000..345b195efa41
--- /dev/null
+++ b/crates/re_chunk/src/slice.rs
@@ -0,0 +1,495 @@
+use arrow2::array::{
+ Array as ArrowArray, BooleanArray as ArrowBooleanArray, ListArray,
+ PrimitiveArray as ArrowPrimitiveArray, StructArray,
+};
+
+use itertools::Itertools;
+use nohash_hasher::IntSet;
+use re_log_types::Timeline;
+use re_types_core::ComponentName;
+
+use crate::{Chunk, ChunkTimeline};
+
+// ---
+
+// NOTE: Not worth writing tests for all of these, until some subtle bug comes up.
+// Most of them are indirectly stressed by our higher-level query tests anyhow.
+
+impl Chunk {
+ /// Slices the [`Chunk`] vertically.
+ ///
+ /// The result is a new [`Chunk`] with the same columns and (potentially) fewer rows.
+ ///
+ /// This cannot fail nor panic: `index` and `len` will be capped so that they cannot
+ /// run out of bounds.
+ /// This can result in an empty [`Chunk`] being returned if the slice is completely OOB.
+ #[inline]
+ pub fn row_sliced(&self, index: usize, len: usize) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ // NOTE: Bound checking costs are completely dwarfed by everything else, and preventing the
+ // viewer from crashing is more important than anything else in any case.
+
+ if index >= self.num_rows() {
+ return self.emptied();
+ }
+
+ let end_offset = usize::min(index.saturating_add(len), self.num_rows());
+ let len = end_offset.saturating_sub(index);
+
+ if len == 0 {
+ return self.emptied();
+ }
+
+ let is_sorted = *is_sorted || (len < 2);
+
+ let mut chunk = Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted,
+ row_ids: row_ids.clone().sliced(index, len),
+ timelines: timelines
+ .iter()
+ .map(|(timeline, time_chunk)| (*timeline, time_chunk.row_sliced(index, len)))
+ .collect(),
+ components: components
+ .iter()
+ .map(|(component_name, list_array)| {
+ (*component_name, list_array.clone().sliced(index, len))
+ })
+ .collect(),
+ };
+
+ // We can know for sure whether the resulting chunk is already sorted (see conditional
+ // above), but the reverse is not true.
+ //
+ // Consider e.g. slicing the following chunk on `(1..=3)`:
+ // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐
+ // │ frame ┆ example.MyColor ┆ example.MyPoint │
+ // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡
+ // │ 3 ┆ [4278255873] ┆ - │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 2 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 3 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │
+ // └──────────────┴───────────────────┴────────────────────────────────────────────┘
+ //
+ // The original chunk is unsorted, but the new sliced one actually ends up being sorted.
+ chunk.is_sorted = is_sorted || chunk.is_sorted_uncached();
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // debug-only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
+ /// Slices the [`Chunk`] horizontally by keeping only the selected `timeline`.
+ ///
+ /// The result is a new [`Chunk`] with the same rows and (at-most) one timeline column.
+ /// All non-timeline columns will be kept as-is.
+ ///
+ /// If `timeline` is not found within the [`Chunk`], the end result will be the same as the
+ /// current chunk but without any timeline column.
+ #[inline]
+ pub fn timeline_sliced(&self, timeline: Timeline) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ let chunk = Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted: *is_sorted,
+ row_ids: row_ids.clone(),
+ timelines: timelines
+ .get_key_value(&timeline)
+ .map(|(timeline, time_chunk)| (*timeline, time_chunk.clone()))
+ .into_iter()
+ .collect(),
+ components: components.clone(),
+ };
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // debug-only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
+ /// Slices the [`Chunk`] horizontally by keeping only the selected `component_name`.
+ ///
+ /// The result is a new [`Chunk`] with the same rows and (at-most) one component column.
+ /// All non-component columns will be kept as-is.
+ ///
+ /// If `component_name` is not found within the [`Chunk`], the end result will be the same as the
+ /// current chunk but without any component column.
+ #[inline]
+ pub fn component_sliced(&self, component_name: ComponentName) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ let chunk = Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted: *is_sorted,
+ row_ids: row_ids.clone(),
+ timelines: timelines.clone(),
+ components: components
+ .get_key_value(&component_name)
+ .map(|(component_name, list_array)| (*component_name, list_array.clone()))
+ .into_iter()
+ .collect(),
+ };
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // debug-only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
+ /// Slices the [`Chunk`] horizontally by keeping only the selected timelines.
+ ///
+ /// The result is a new [`Chunk`] with the same rows and (at-most) the selected timeline columns.
+ /// All non-timeline columns will be kept as-is.
+ ///
+ /// If none of the selected timelines exist in the [`Chunk`], the end result will be the same as the
+ /// current chunk but without any timeline column.
+ #[inline]
+ pub fn timelines_sliced(&self, timelines_to_keep: &IntSet<Timeline>) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ let chunk = Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted: *is_sorted,
+ row_ids: row_ids.clone(),
+ timelines: timelines
+ .iter()
+ .filter(|(timeline, _)| timelines_to_keep.contains(timeline))
+ .map(|(timeline, time_chunk)| (*timeline, time_chunk.clone()))
+ .collect(),
+ components: components.clone(),
+ };
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // debug-only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
+ /// Slices the [`Chunk`] horizontally by keeping only the selected `component_names`.
+ ///
+ /// The result is a new [`Chunk`] with the same rows and (at-most) the selected component columns.
+ /// All non-component columns will be kept as-is.
+ ///
+ /// If none of the `component_names` exist in the [`Chunk`], the end result will be the same as the
+ /// current chunk but without any component column.
+ #[inline]
+ pub fn components_sliced(&self, component_names: &IntSet<ComponentName>) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ let chunk = Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted: *is_sorted,
+ row_ids: row_ids.clone(),
+ timelines: timelines.clone(),
+ components: components
+ .iter()
+ .filter(|(component_name, _)| component_names.contains(component_name))
+ .map(|(component_name, list_array)| (*component_name, list_array.clone()))
+ .collect(),
+ };
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // debug-only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
+ /// Densifies the [`Chunk`] vertically based on the `component_name` column.
+ ///
+ /// Densifying here means dropping all rows where the associated value in the `component_name`
+ /// column is null.
+ ///
+ /// The result is a new [`Chunk`] where the `component_name` column is guaranteed to be dense.
+ ///
+ /// If `component_name` doesn't exist in this [`Chunk`], or if it is already dense, this method
+ /// is a no-op.
+ #[inline]
+ pub fn densified(&self, component_name: ComponentName) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ if self.is_empty() {
+ return self.clone();
+ }
+
+ let Some(component_list_array) = components.get(&component_name) else {
+ return self.clone();
+ };
+
+ let Some(validity) = component_list_array.validity() else {
+ return self.clone();
+ };
+
+ let mask = validity.iter().collect_vec();
+ let is_sorted = *is_sorted || (mask.iter().filter(|&&b| b).count() < 2);
+ let validity_filter = ArrowBooleanArray::from_slice(mask);
+
+ let mut chunk = Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted,
+ row_ids: crate::util::filter_array(row_ids, &validity_filter),
+ timelines: timelines
+ .iter()
+ .map(|(&timeline, time_chunk)| (timeline, time_chunk.filtered(&validity_filter)))
+ .collect(),
+ components: components
+ .iter()
+ .map(|(&component_name, list_array)| {
+ (
+ component_name,
+ crate::util::filter_array(list_array, &validity_filter),
+ )
+ })
+ .collect(),
+ };
+
+ // We can know for sure whether the resulting chunk is already sorted (see conditional
+ // above), but the reverse is not true.
+ //
+ // Consider e.g. densifying the following chunk on `example.MyPoint`:
+ // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐
+ // │ frame ┆ example.MyColor ┆ example.MyPoint │
+ // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡
+ // │ 3 ┆ [4278255873] ┆ - │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │
+ // └──────────────┴───────────────────┴────────────────────────────────────────────┘
+ //
+ // The original chunk is unsorted, but the new filtered one actually ends up being sorted.
+ chunk.is_sorted = is_sorted || chunk.is_sorted_uncached();
+
+ #[cfg(debug_assertions)]
+ #[allow(clippy::unwrap_used)] // debug-only
+ chunk.sanity_check().unwrap();
+
+ chunk
+ }
+
+ /// Empties the [`Chunk`] vertically.
+ ///
+ /// The result is a new [`Chunk`] with the same columns but zero rows.
+ #[inline]
+ pub fn emptied(&self) -> Self {
+ let Self {
+ id,
+ entity_path,
+ heap_size_bytes: _,
+ is_sorted: _,
+ row_ids,
+ timelines,
+ components,
+ } = self;
+
+ Self {
+ id: *id,
+ entity_path: entity_path.clone(),
+ heap_size_bytes: Default::default(),
+ is_sorted: true,
+ row_ids: StructArray::new_empty(row_ids.data_type().clone()),
+ timelines: timelines
+ .iter()
+ .map(|(&timeline, time_chunk)| (timeline, time_chunk.emptied()))
+ .collect(),
+ components: components
+ .iter()
+ .map(|(&component_name, list_array)| {
+ (
+ component_name,
+ ListArray::new_empty(list_array.data_type().clone()),
+ )
+ })
+ .collect(),
+ }
+ }
+}
+
+impl ChunkTimeline {
+ /// Slices the [`ChunkTimeline`] vertically.
+ ///
+ /// The result is a new [`ChunkTimeline`] with the same timeline and (potentially) fewer rows.
+ ///
+ /// This cannot fail nor panic: `index` and `len` will be capped so that they cannot
+ /// run out of bounds.
+ /// This can result in an empty [`ChunkTimeline`] being returned if the slice is completely OOB.
+ #[inline]
+ pub fn row_sliced(&self, index: usize, len: usize) -> Self {
+ let Self {
+ timeline,
+ times,
+ is_sorted,
+ time_range: _,
+ } = self;
+
+ // NOTE: Bound checking costs are completely dwarfed by everything else, and preventing the
+ // viewer from crashing is more important than anything else in any case.
+
+ if index >= self.num_rows() {
+ return self.emptied();
+ }
+
+ let end_offset = usize::min(index.saturating_add(len), self.num_rows());
+ let len = end_offset.saturating_sub(index);
+
+ if len == 0 {
+ return self.emptied();
+ }
+
+ let is_sorted = *is_sorted || (len < 2);
+
+ // We can know for sure whether the resulting chunk is already sorted (see conditional
+ // above), but the reverse is not true.
+ //
+ // Consider e.g. slicing the following chunk on `(1..=3)`:
+ // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐
+ // │ frame ┆ example.MyColor ┆ example.MyPoint │
+ // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡
+ // │ 3 ┆ [4278255873] ┆ - │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 2 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 3 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │
+ // └──────────────┴───────────────────┴────────────────────────────────────────────┘
+ //
+ // The original chunk is unsorted, but the new sliced one actually ends up being sorted.
+ let is_sorted_opt = is_sorted.then_some(is_sorted);
+
+ Self::new(
+ is_sorted_opt,
+ *timeline,
+ ArrowPrimitiveArray::sliced(times.clone(), index, len),
+ )
+ }
+
+ /// Empties the [`ChunkTimeline`] vertically.
+ ///
+ /// The result is a new [`ChunkTimeline`] with the same columns but zero rows.
+ #[inline]
+ pub fn emptied(&self) -> Self {
+ let Self {
+ timeline,
+ times,
+ is_sorted: _,
+ time_range: _,
+ } = self;
+
+ Self::new(
+ Some(true),
+ *timeline,
+ ArrowPrimitiveArray::new_empty(times.data_type().clone()),
+ )
+ }
+
+ /// Runs a filter compute kernel on the time data with the specified `mask`.
+ #[inline]
+ pub(crate) fn filtered(&self, filter: &ArrowBooleanArray) -> Self {
+ let Self {
+ timeline,
+ times,
+ is_sorted,
+ time_range: _,
+ } = self;
+
+ let is_sorted = *is_sorted || filter.values_iter().filter(|&b| b).count() < 2;
+
+ // We can know for sure whether the resulting chunk is already sorted (see conditional
+ // above), but the reverse is not true.
+ //
+ // Consider e.g. densifying the following chunk on `example.MyPoint`:
+ // ┌──────────────┬───────────────────┬────────────────────────────────────────────┐
+ // │ frame ┆ example.MyColor ┆ example.MyPoint │
+ // ╞══════════════╪═══════════════════╪════════════════════════════════════════════╡
+ // │ 3 ┆ [4278255873] ┆ - │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 1 ┆ - ┆ [{x: 1, y: 1}, {x: 2, y: 2}] │
+ // ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ // │ 5 ┆ - ┆ [{x: 3, y: 3}, {x: 4, y: 4}, {x: 5, y: 5}] │
+ // └──────────────┴───────────────────┴────────────────────────────────────────────┘
+ //
+ // The original chunk is unsorted, but the new filtered one actually ends up being sorted.
+ let is_sorted_opt = is_sorted.then_some(is_sorted);
+
+ Self::new(
+ is_sorted_opt,
+ *timeline,
+ crate::util::filter_array(times, filter),
+ )
+ }
+}
diff --git a/crates/re_chunk/src/transport.rs b/crates/re_chunk/src/transport.rs
index 977076a4081b..ca343bf04217 100644
--- a/crates/re_chunk/src/transport.rs
+++ b/crates/re_chunk/src/transport.rs
@@ -1,7 +1,10 @@
use std::collections::BTreeMap;
use arrow2::{
- array::{Array as ArrowArray, PrimitiveArray as ArrowPrimitiveArray},
+ array::{
+ Array as ArrowArray, ListArray, PrimitiveArray as ArrowPrimitiveArray,
+ StructArray as ArrowStructArray,
+ },
chunk::Chunk as ArrowChunk,
datatypes::{
DataType as ArrowDatatype, Field as ArrowField, Metadata as ArrowMetadata,
@@ -9,10 +12,10 @@ use arrow2::{
},
};
-use re_log_types::{EntityPath, RowId, TimeInt, Timeline};
-use re_types_core::Loggable as _;
+use re_log_types::{EntityPath, Timeline};
+use re_types_core::{Loggable as _, SizeBytes};
-use crate::{Chunk, ChunkError, ChunkId, ChunkResult, ChunkTimeline};
+use crate::{Chunk, ChunkError, ChunkId, ChunkResult, ChunkTimeline, RowId};
// ---
@@ -53,6 +56,8 @@ impl std::fmt::Display for TransportChunk {
}
}
+// TODO(#6572): Relying on Arrow's native schema metadata feature is bound to fail, we need to
+// switch to something more powerful asap.
impl TransportChunk {
/// The key used to identify a Rerun [`ChunkId`] in chunk-level [`ArrowSchema`] metadata.
pub const CHUNK_METADATA_KEY_ID: &'static str = "rerun.id";
@@ -60,6 +65,10 @@ impl TransportChunk {
/// The key used to identify a Rerun [`EntityPath`] in chunk-level [`ArrowSchema`] metadata.
pub const CHUNK_METADATA_KEY_ENTITY_PATH: &'static str = "rerun.entity_path";
+ /// The key used to identify the size in bytes of the data, once loaded in memory, in chunk-level
+ /// [`ArrowSchema`] metadata.
+ pub const CHUNK_METADATA_KEY_HEAP_SIZE_BYTES: &'static str = "rerun.heap_size_bytes";
+
/// The marker used to identify whether a chunk is sorted in chunk-level [`ArrowSchema`] metadata.
///
/// The associated value is irrelevant -- if this marker is present, then it is true.
@@ -103,6 +112,18 @@ impl TransportChunk {
.into()
}
+ /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for the in-memory size in bytes.
+ #[inline]
+ pub fn chunk_metadata_heap_size_bytes(heap_size_bytes: u64) -> ArrowMetadata {
+ [
+ (
+ Self::CHUNK_METADATA_KEY_HEAP_SIZE_BYTES.to_owned(),
+ heap_size_bytes.to_string(),
+ ), //
+ ]
+ .into()
+ }
+
/// Returns the appropriate chunk-level [`ArrowSchema`] metadata for a Rerun [`EntityPath`].
#[inline]
pub fn chunk_metadata_entity_path(entity_path: &EntityPath) -> ArrowMetadata {
@@ -179,19 +200,18 @@ impl TransportChunk {
impl TransportChunk {
#[inline]
pub fn id(&self) -> ChunkResult<ChunkId> {
- match self.schema.metadata.get(Self::CHUNK_METADATA_KEY_ID) {
- Some(id) => {
- let id = u128::from_str_radix(id, 16).map_err(|err| ChunkError::Malformed {
- reason: format!("cannot deserialize chunk id: {err}"),
- })?;
- Ok(ChunkId::from_u128(id))
- }
- None => Err(crate::ChunkError::Malformed {
+ if let Some(id) = self.schema.metadata.get(Self::CHUNK_METADATA_KEY_ID) {
+ let id = u128::from_str_radix(id, 16).map_err(|err| ChunkError::Malformed {
+ reason: format!("cannot deserialize chunk id: {err}"),
+ })?;
+ Ok(ChunkId::from_u128(id))
+ } else {
+ Err(crate::ChunkError::Malformed {
reason: format!(
"chunk id missing from metadata ({:?})",
self.schema.metadata
),
- }),
+ })
}
}
@@ -212,6 +232,14 @@ impl TransportChunk {
}
}
+ #[inline]
+ pub fn heap_size_bytes(&self) -> Option<u64> {
+ self.schema
+ .metadata
+ .get(Self::CHUNK_METADATA_KEY_HEAP_SIZE_BYTES)
+ .and_then(|s| s.parse::<u64>().ok())
+ }
+
/// Looks in the chunk metadata for the `IS_SORTED` marker.
///
/// It is possible that a chunk is sorted but didn't set that marker.
@@ -308,6 +336,7 @@ impl Chunk {
let Self {
id,
entity_path,
+ heap_size_bytes: _, // use the method instead because of lazy initialization
is_sorted,
row_ids,
timelines,
@@ -329,6 +358,12 @@ impl Chunk {
.metadata
.extend(TransportChunk::chunk_metadata_entity_path(entity_path));
+ schema
+ .metadata
+ .extend(TransportChunk::chunk_metadata_heap_size_bytes(
+ self.heap_size_bytes(),
+ ));
+
if *is_sorted {
schema
.metadata
@@ -340,7 +375,6 @@ impl Chunk {
{
re_tracing::profile_scope!("row ids");
- let row_ids = RowId::to_arrow(row_ids)?;
schema.fields.push(
ArrowField::new(
RowId::name().to_string(),
@@ -349,7 +383,7 @@ impl Chunk {
)
.with_metadata(TransportChunk::field_metadata_control_column()),
);
- columns.push(row_ids);
+ columns.push(row_ids.clone().boxed());
}
// Timelines
@@ -358,21 +392,12 @@ impl Chunk {
for (timeline, info) in timelines {
let ChunkTimeline {
+ timeline: _,
times,
is_sorted,
time_range: _,
} = info;
- let times = {
- let values = times.iter().map(|time| time.as_i64()).collect();
- ArrowPrimitiveArray::new(
- arrow2::types::PrimitiveType::Int64.into(),
- values,
- None,
- )
- .to(timeline.datatype())
- };
-
let field = ArrowField::new(
timeline.name().to_string(),
times.data_type().clone(),
@@ -387,7 +412,7 @@ impl Chunk {
});
schema.fields.push(field);
- columns.push(Box::new(times));
+ columns.push(times.clone().boxed() /* cheap */);
}
}
@@ -400,7 +425,7 @@ impl Chunk {
ArrowField::new(component_name.to_string(), data.data_type().clone(), true)
.with_metadata(TransportChunk::field_metadata_data_column()),
);
- columns.push(data.clone() /* refcounted (dyn Clone) */);
+ columns.push(data.clone().boxed());
}
}
@@ -410,34 +435,46 @@ impl Chunk {
})
}
- pub fn from_transport(chunk: &TransportChunk) -> ChunkResult<Self> {
+ pub fn from_transport(transport: &TransportChunk) -> ChunkResult<Self> {
re_tracing::profile_function!(format!(
"num_columns={} num_rows={}",
- chunk.num_columns(),
- chunk.num_rows()
+ transport.num_columns(),
+ transport.num_rows()
));
// Metadata
let (id, entity_path, is_sorted) = {
re_tracing::profile_scope!("metadata");
- (chunk.id()?, chunk.entity_path()?, chunk.is_sorted())
+ (
+ transport.id()?,
+ transport.entity_path()?,
+ transport.is_sorted(),
+ )
};
// Row IDs
let row_ids = {
re_tracing::profile_scope!("row ids");
- let Some(column) = chunk.controls().find_map(|(field, column)| {
+ let Some(row_ids) = transport.controls().find_map(|(field, column)| {
(field.name == RowId::name().as_str()).then_some(column)
}) else {
return Err(ChunkError::Malformed {
- reason: format!("missing row_id column ({:?})", chunk.schema),
+ reason: format!("missing row_id column ({:?})", transport.schema),
});
};
- RowId::from_arrow(&**column).map_err(|err| ChunkError::Malformed {
- reason: format!("row_id column is not deserializable: {err}"),
- })?
+ row_ids
+ .as_any()
+ .downcast_ref::<ArrowStructArray>()
+ .ok_or_else(|| ChunkError::Malformed {
+ reason: format!(
+ "RowId data has the wrong datatype: expected {:?} but got {:?} instead",
+ RowId::arrow_datatype(),
+ *row_ids.data_type(),
+ ),
+ })?
+ .clone()
};
// Timelines
@@ -446,7 +483,7 @@ impl Chunk {
let mut timelines = BTreeMap::default();
- for (field, column) in chunk.timelines() {
+ for (field, column) in transport.timelines() {
// See also [`Timeline::datatype`]
let timeline = match column.data_type().to_logical_type() {
ArrowDatatype::Int64 => Timeline::new_sequence(field.name.as_str()),
@@ -492,22 +529,16 @@ impl Chunk {
let time_chunk = ChunkTimeline::new(
is_sorted.then_some(true),
- times
- .values_iter()
- .copied()
- .map(TimeInt::new_temporal)
- .collect(),
+ timeline,
+ times.clone(), /* cheap */
);
-
- if let Some(time_chunk) = time_chunk {
- if timelines.insert(timeline, time_chunk).is_some() {
- return Err(ChunkError::Malformed {
- reason: format!(
- "time column '{}' was specified more than once",
- field.name,
- ),
- });
- }
+ if timelines.insert(timeline, time_chunk).is_some() {
+ return Err(ChunkError::Malformed {
+ reason: format!(
+ "time column '{}' was specified more than once",
+ field.name,
+ ),
+ });
}
}
@@ -518,16 +549,16 @@ impl Chunk {
let components = {
let mut components = BTreeMap::default();
- for (field, column) in chunk.components() {
- if !matches!(column.data_type(), ArrowDatatype::List(_)) {
- return Err(ChunkError::Malformed {
+ for (field, column) in transport.components() {
+ let column = column
+ .as_any()
+ .downcast_ref::<ListArray<i32>>()
+ .ok_or_else(|| ChunkError::Malformed {
reason: format!(
- "component column '{}' is not deserializable ({:?})",
- field.name,
- column.data_type()
+ "The outer array in a chunked component batch must be a sparse list, got {:?}",
+ column.data_type(),
),
- });
- }
+ })?;
if components
.insert(
@@ -548,14 +579,52 @@ impl Chunk {
components
};
- Self::new(
+ let mut res = Self::new(
id,
entity_path,
is_sorted.then_some(true),
row_ids,
timelines,
components,
- )
+ )?;
+
+ if let Some(heap_size_bytes) = transport.heap_size_bytes() {
+ res.heap_size_bytes = heap_size_bytes.into();
+ }
+
+ Ok(res)
+ }
+}
+
+impl Chunk {
+ #[inline]
+ pub fn from_arrow_msg(msg: &re_log_types::ArrowMsg) -> ChunkResult<Self> {
+ let re_log_types::ArrowMsg {
+ chunk_id: _,
+ timepoint_max: _,
+ schema,
+ chunk,
+ on_release: _,
+ } = msg;
+
+ Self::from_transport(&TransportChunk {
+ schema: schema.clone(),
+ data: chunk.clone(),
+ })
+ }
+
+ #[inline]
+ pub fn to_arrow_msg(&self) -> ChunkResult<re_log_types::ArrowMsg> {
+ self.sanity_check()?;
+
+ let transport = self.to_transport()?;
+ Ok(re_log_types::ArrowMsg {
+ chunk_id: re_tuid::Tuid::from_u128(self.id().as_u128()),
+ timepoint_max: self.timepoint_max(),
+ schema: transport.schema,
+ chunk: transport.data,
+ on_release: None,
+ })
}
}
@@ -563,11 +632,9 @@ impl Chunk {
mod tests {
use re_log_types::{
example_components::{MyColor, MyPoint},
- TimeInt, Timeline,
+ Timeline,
};
- use crate::arrays_to_list_array;
-
use super::*;
#[test]
@@ -579,9 +646,9 @@ mod tests {
timeline1,
ChunkTimeline::new(
Some(true),
- [42, 43, 44, 45].map(TimeInt::new_temporal).to_vec(),
- )
- .unwrap(),
+ timeline1,
+ ArrowPrimitiveArray::<i64>::from_vec(vec![42, 43, 44, 45]),
+ ),
))
.collect();
@@ -607,7 +674,7 @@ mod tests {
let components = [
(MyPoint::name(), {
- let list_array = arrays_to_list_array(&[
+ let list_array = crate::util::arrays_to_list_array_opt(&[
Some(&*points1),
points2,
Some(&*points3),
@@ -618,9 +685,13 @@ mod tests {
list_array
}),
(MyColor::name(), {
- let list_array =
- arrays_to_list_array(&[Some(&*colors1), Some(&*colors2), colors3, colors4])
- .unwrap();
+ let list_array = crate::util::arrays_to_list_array_opt(&[
+ Some(&*colors1),
+ Some(&*colors2),
+ colors3,
+ colors4,
+ ])
+ .unwrap();
assert_eq!(4, list_array.len());
list_array
}),
@@ -629,11 +700,11 @@ mod tests {
let row_ids = vec![RowId::new(), RowId::new(), RowId::new(), RowId::new()];
for timelines in [timelines1, timelines2] {
- let chunk_original = Chunk::new(
+ let chunk_original = Chunk::from_native_row_ids(
ChunkId::new(),
entity_path.clone(),
None,
- row_ids.clone(),
+ &row_ids,
timelines.clone(),
components.clone().into_iter().collect(),
)?;
@@ -651,6 +722,10 @@ mod tests {
chunk_in_transport.entity_path()?,
*chunk_after.entity_path()
);
+ assert_eq!(
+ chunk_in_transport.heap_size_bytes(),
+ Some(chunk_after.heap_size_bytes()),
+ );
assert_eq!(
chunk_in_transport.num_columns(),
chunk_original.num_columns()
@@ -684,9 +759,9 @@ mod tests {
chunk_after.num_components()
);
- // eprintln!("{chunk_before}");
+ eprintln!("{chunk_before}");
eprintln!("{chunk_in_transport}");
- // eprintln!("{chunk_after}");
+ eprintln!("{chunk_after}");
assert_eq!(chunk_before, chunk_after);
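For orientation, here is a hedged sketch of the new `ArrowMsg` round-trip exercised by the test above (it assumes `ChunkResult` is re-exported by `re_chunk`; the chunk itself would be built the same way as `chunk_original` in the test):

```rust
use re_chunk::{Chunk, ChunkResult};

/// Illustrative only: encode a chunk into a transport-level `ArrowMsg` and decode it back.
fn roundtrip_via_arrow_msg(chunk: &Chunk) -> ChunkResult<()> {
    let msg = chunk.to_arrow_msg()?; // Chunk -> re_log_types::ArrowMsg
    let decoded = Chunk::from_arrow_msg(&msg)?; // re_log_types::ArrowMsg -> Chunk
    assert_eq!(*chunk, decoded);
    Ok(())
}
```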
diff --git a/crates/re_chunk/src/util.rs b/crates/re_chunk/src/util.rs
index 1fe1356ffbd2..092d3ff8e943 100644
--- a/crates/re_chunk/src/util.rs
+++ b/crates/re_chunk/src/util.rs
@@ -1,6 +1,7 @@
use arrow2::{
- array::{Array as ArrowArray, ListArray as ArrowListArray},
+ array::{Array as ArrowArray, BooleanArray as ArrowBooleanArray, ListArray as ArrowListArray},
bitmap::Bitmap as ArrowBitmap,
+ datatypes::DataType as ArrowDataType,
offset::Offsets as ArrowOffsets,
};
use itertools::Itertools as _;
@@ -12,29 +13,41 @@ use itertools::Itertools as _;
/// All arrays must have the same datatype.
///
/// Returns `None` if `arrays` is empty.
-pub fn arrays_to_list_array(arrays: &[Option<&dyn ArrowArray>]) -> Option<Box<dyn ArrowArray>> {
+#[inline]
+pub fn arrays_to_list_array_opt(arrays: &[Option<&dyn ArrowArray>]) -> Option<ArrowListArray<i32>> {
+ let datatype = arrays
+ .iter()
+ .flatten()
+ .map(|array| array.data_type().clone())
+ .next()?;
+ arrays_to_list_array(datatype, arrays)
+}
+
+/// Create a sparse list-array out of an array of arrays.
+///
+/// Returns `None` if any of the specified `arrays` doesn't match the given `array_datatype`.
+///
+/// Returns an empty list if `arrays` is empty.
+pub fn arrays_to_list_array(
+ array_datatype: ArrowDataType,
+ arrays: &[Option<&dyn ArrowArray>],
+) -> Option<ArrowListArray<i32>> {
let arrays_dense = arrays.iter().flatten().copied().collect_vec();
- if arrays_dense.is_empty() {
- return None;
- }
+ let data = if arrays_dense.is_empty() {
+ arrow2::array::new_empty_array(array_datatype.clone())
+ } else {
+ arrow2::compute::concatenate::concatenate(&arrays_dense)
+ .map_err(|err| {
+ re_log::warn_once!("failed to concatenate arrays: {err}");
+ err
+ })
+ .ok()?
+ };
- let data = arrow2::compute::concatenate::concatenate(&arrays_dense)
- .map_err(|err| {
- re_log::warn_once!("failed to concatenate arrays: {err}");
- err
- })
- .ok()?;
-
- let datatype = arrays_dense
- .first()
- .map(|array| array.data_type().clone())?;
- debug_assert!(arrays_dense
- .iter()
- .all(|array| *array.data_type() == datatype));
- let datatype = ArrowListArray::<i32>::default_datatype(datatype);
+ let datatype = ArrowListArray::<i32>::default_datatype(array_datatype);
- #[allow(clippy::unwrap_used)] // yes, there are indeed lengths
+ #[allow(clippy::unwrap_used)] // yes, these are indeed lengths
let offsets = ArrowOffsets::try_from_lengths(
arrays
.iter()
@@ -45,5 +58,57 @@ pub fn arrays_to_list_array(arrays: &[Option<&dyn ArrowArray>]) -> Option<Box<dyn ArrowArray>> {
- Some(ArrowListArray::<i32>::new(datatype, offsets.into(), data, validity.into()).boxed())
+ Some(ArrowListArray::<i32>::new(
+ datatype,
+ offsets.into(),
+ data,
+ validity.into(),
+ ))
+}
+
+/// Given a sparse `ArrowListArray` (i.e. an array with a validity bitmap that contains at least
+/// one falsy value), returns a dense `ArrowListArray` that only contains the non-null values from
+/// the original list.
+///
+/// This is a no-op if the original array is already dense.
+pub fn sparse_list_array_to_dense_list_array(
+ list_array: &ArrowListArray<i32>,
+) -> ArrowListArray<i32> {
+ if list_array.is_empty() {
+ return list_array.clone();
+ }
+
+ let is_empty = list_array
+ .validity()
+ .map_or(false, |validity| validity.is_empty());
+ if is_empty {
+ return list_array.clone();
+ }
+
+ #[allow(clippy::unwrap_used)] // yes, these are indeed lengths
+ let offsets =
+ ArrowOffsets::try_from_lengths(list_array.iter().flatten().map(|array| array.len()))
+ .unwrap();
+
+ ArrowListArray::<i32>::new(
+ list_array.data_type().clone(),
+ offsets.into(),
+ list_array.values().clone(),
+ None,
+ )
+}
+
+/// Applies a filter kernel to the given `array`.
+///
+/// Takes care of up- and down-casting the data back and forth on behalf of the caller.
+pub fn filter_array<A: ArrowArray + Clone>(array: &A, filter: &ArrowBooleanArray) -> A {
+ #[allow(clippy::unwrap_used)]
+ arrow2::compute::filter::filter(array, filter)
+ // Unwrap: this literally cannot fail.
+ .unwrap()
+ .as_any()
+ .downcast_ref::<A>()
+ // Unwrap: that's the initial type that we got.
+ .unwrap()
+ .clone()
}
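A hedged sketch of how the new `util` helpers compose (it assumes the `util` module is publicly reachable from the caller; the data is made up):

```rust
use arrow2::array::{BooleanArray, Int64Array};
use re_chunk::util::{arrays_to_list_array_opt, filter_array};

fn demo() {
    // `filter_array` preserves the concrete array type across the filter kernel.
    let times = Int64Array::from_vec(vec![10, 20, 30, 40]);
    let mask = BooleanArray::from_slice([true, false, true, false]);
    assert_eq!(2, filter_array(&times, &mask).len());

    // `arrays_to_list_array_opt` wraps per-row arrays into one sparse list-array,
    // inferring the datatype from the first non-null entry; `None` rows become nulls.
    let list = arrays_to_list_array_opt(&[Some(&times as _), None, Some(&times as _)]);
    assert_eq!(Some(3), list.map(|list_array| list_array.len()));
}
```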
diff --git a/crates/re_chunk/tests/latest_at.rs b/crates/re_chunk/tests/latest_at.rs
new file mode 100644
index 000000000000..596dc21e3a5e
--- /dev/null
+++ b/crates/re_chunk/tests/latest_at.rs
@@ -0,0 +1,501 @@
+use arrow2::datatypes::DataType as ArrowDatatype;
+use nohash_hasher::IntMap;
+
+use re_chunk::{Chunk, ComponentName, LatestAtQuery, RowId, TimePoint, Timeline};
+use re_log_types::example_components::{MyColor, MyLabel, MyPoint};
+use re_types_core::Loggable;
+
+// ---
+
+const ENTITY_PATH: &str = "my/entity";
+
+fn datatypes() -> IntMap<ComponentName, ArrowDatatype> {
+ [
+ (MyPoint::name(), MyPoint::arrow_datatype()),
+ (MyColor::name(), MyColor::arrow_datatype()),
+ (MyLabel::name(), MyLabel::arrow_datatype()),
+ ]
+ .into_iter()
+ .collect()
+}
+
+#[test]
+fn temporal_sorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint1 = [
+ (Timeline::log_time(), 1000),
+ (Timeline::new_sequence("frame"), 1),
+ ];
+ let timepoint2 = [
+ (Timeline::log_time(), 1032),
+ (Timeline::new_sequence("frame"), 3),
+ ];
+ let timepoint3 = [
+ (Timeline::log_time(), 1064),
+ (Timeline::new_sequence("frame"), 5),
+ ];
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id1, timepoint1, [points1 as _])
+ .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _])
+ .with_component_batches(row_id3, timepoint3, [points3 as _])
+ .build()?;
+
+ {
+ let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 2);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id1,
+ timepoint1,
+ [
+ (MyPoint::name(), Some(points1 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = chunk.emptied();
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = chunk.emptied();
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+ {
+ let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 4);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id1,
+ timepoint1,
+ [
+ (MyPoint::name(), Some(points1 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+ {
+ let query = LatestAtQuery::new(Timeline::new_sequence("frame"), 6);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint3,
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+#[test]
+fn temporal_unsorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint1 = [
+ (Timeline::log_time(), 1000),
+ (Timeline::new_sequence("frame"), 1),
+ ];
+ let timepoint2 = [
+ (Timeline::log_time(), 1032),
+ (Timeline::new_sequence("frame"), 3),
+ ];
+ let timepoint3 = [
+ (Timeline::log_time(), 1064),
+ (Timeline::new_sequence("frame"), 5),
+ ];
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _])
+ .with_component_batches(row_id1, timepoint1, [points1 as _])
+ .with_component_batches(row_id3, timepoint3, [points3 as _])
+ .build()?;
+
+ {
+ let query = LatestAtQuery::new(Timeline::log_time(), 1000);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id1,
+ timepoint1,
+ [
+ (MyPoint::name(), Some(points1 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = chunk.emptied();
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = chunk.emptied();
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+ {
+ let query = LatestAtQuery::new(Timeline::log_time(), 1050);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id1,
+ timepoint1,
+ [
+ (MyPoint::name(), Some(points1 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+ {
+ let query = LatestAtQuery::new(Timeline::log_time(), 1100);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint3,
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+#[test]
+fn static_sorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint = TimePoint::default();
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id1, timepoint.clone(), [points1 as _])
+ .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _])
+ .with_component_batches(row_id3, timepoint.clone(), [points3 as _])
+ .build()?;
+
+ for frame_nr in [2, 4, 6] {
+ let query = LatestAtQuery::new(Timeline::new_sequence("frame"), frame_nr);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+#[test]
+fn static_unsorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint = TimePoint::default();
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id3, timepoint.clone(), [points3 as _])
+ .with_component_batches(row_id1, timepoint.clone(), [points1 as _])
+ .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _])
+ .build()?;
+
+ for log_time in [1000, 1050, 1100] {
+ let query = LatestAtQuery::new(Timeline::log_time(), log_time);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+// ---
+
+fn query_and_compare(
+ (component_name, query): (ComponentName, &LatestAtQuery),
+ chunk: &Chunk,
+ expected: &Chunk,
+) {
+ re_log::setup_logging();
+
+ let results = chunk.latest_at(query, component_name);
+
+ eprintln!("Query: {component_name} @ {query:?}");
+ eprintln!("Data:\n{chunk}");
+ eprintln!("Expected:\n{expected}");
+ eprintln!("Results:\n{results}");
+
+ assert_eq!(
+ *expected,
+ results,
+ "{}",
+ similar_asserts::SimpleDiff::from_str(
+ &format!("{results}"),
+ &format!("{expected}"),
+ // &format!("{results:#?}"),
+ // &format!("{expected:#?}"),
+ "got",
+ "expected",
+ ),
+ );
+}
diff --git a/crates/re_chunk/tests/range.rs b/crates/re_chunk/tests/range.rs
new file mode 100644
index 000000000000..8432b0b7c161
--- /dev/null
+++ b/crates/re_chunk/tests/range.rs
@@ -0,0 +1,475 @@
+use arrow2::datatypes::DataType as ArrowDatatype;
+use nohash_hasher::IntMap;
+
+use re_chunk::{Chunk, ComponentName, RangeQuery, RowId, TimePoint, Timeline};
+use re_log_types::{
+ example_components::{MyColor, MyLabel, MyPoint},
+ ResolvedTimeRange,
+};
+use re_types_core::Loggable as _;
+
+// ---
+
+const ENTITY_PATH: &str = "my/entity";
+
+fn datatypes() -> IntMap<ComponentName, ArrowDatatype> {
+ [
+ (MyPoint::name(), MyPoint::arrow_datatype()),
+ (MyColor::name(), MyColor::arrow_datatype()),
+ (MyLabel::name(), MyLabel::arrow_datatype()),
+ ]
+ .into_iter()
+ .collect()
+}
+
+#[test]
+fn temporal_sorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint1 = [
+ (Timeline::log_time(), 1000),
+ (Timeline::new_sequence("frame"), 1),
+ ];
+ let timepoint2 = [
+ (Timeline::log_time(), 1032),
+ (Timeline::new_sequence("frame"), 3),
+ ];
+ let timepoint3 = [
+ (Timeline::log_time(), 1064),
+ (Timeline::new_sequence("frame"), 5),
+ ];
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id1, timepoint1, [points1 as _])
+ .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _])
+ .with_component_batches(row_id3, timepoint3, [points3 as _])
+ .build()?;
+
+ {
+ let query = RangeQuery::new(
+ Timeline::new_sequence("frame"),
+ ResolvedTimeRange::EVERYTHING,
+ );
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id1,
+ timepoint1,
+ [
+ (MyPoint::name(), Some(points1 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint3,
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ {
+ let query = RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050));
+
+ let expected = chunk.emptied();
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+#[test]
+fn temporal_unsorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint1 = [
+ (Timeline::log_time(), 1000),
+ (Timeline::new_sequence("frame"), 1),
+ ];
+ let timepoint2 = [
+ (Timeline::log_time(), 1032),
+ (Timeline::new_sequence("frame"), 3),
+ ];
+ let timepoint3 = [
+ (Timeline::log_time(), 1064),
+ (Timeline::new_sequence("frame"), 5),
+ ];
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id2, timepoint2, [colors2 as _, labels2 as _])
+ .with_component_batches(row_id1, timepoint1, [points1 as _])
+ .with_component_batches(row_id3, timepoint3, [points3 as _])
+ .build()?;
+
+ {
+ let query = RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::EVERYTHING);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id1,
+ timepoint1,
+ [
+ (MyPoint::name(), Some(points1 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint3,
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ {
+ let query = RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050));
+
+ let expected = chunk.emptied();
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint2,
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+#[test]
+fn static_sorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint = TimePoint::default();
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id1, timepoint.clone(), [points1 as _])
+ .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _])
+ .with_component_batches(row_id3, timepoint.clone(), [points3 as _])
+ .build()?;
+
+ let queries = [
+ RangeQuery::new(
+ Timeline::new_sequence("frame"),
+ ResolvedTimeRange::EVERYTHING,
+ ),
+ RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050)),
+ ];
+
+ for query in queries {
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+#[test]
+fn static_unsorted() -> anyhow::Result<()> {
+ let row_id1 = RowId::new();
+ let row_id2 = RowId::new();
+ let row_id3 = RowId::new();
+
+ let timepoint = TimePoint::default();
+
+ let points1 = &[MyPoint::new(1.0, 1.0), MyPoint::new(2.0, 2.0)];
+ let points3 = &[
+ MyPoint::new(3.0, 3.0),
+ MyPoint::new(4.0, 4.0),
+ MyPoint::new(5.0, 5.0),
+ ];
+
+ let colors2 = &[MyColor::from_rgb(1, 1, 1)];
+
+ let labels2 = &[
+ MyLabel("a".into()),
+ MyLabel("b".into()),
+ MyLabel("c".into()),
+ ];
+
+ let chunk = Chunk::builder(ENTITY_PATH.into())
+ .with_component_batches(row_id3, timepoint.clone(), [points3 as _])
+ .with_component_batches(row_id1, timepoint.clone(), [points1 as _])
+ .with_component_batches(row_id2, timepoint.clone(), [colors2 as _, labels2 as _])
+ .build()?;
+
+ let queries = [
+ RangeQuery::new(
+ Timeline::new_sequence("frame"),
+ ResolvedTimeRange::EVERYTHING,
+ ),
+ RangeQuery::new(Timeline::log_time(), ResolvedTimeRange::new(1020, 1050)),
+ ];
+
+ for query in queries {
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id3,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), Some(points3 as _)),
+ (MyColor::name(), None),
+ (MyLabel::name(), None),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyPoint::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyColor::name(), &query), &chunk, &expected);
+
+ let expected = Chunk::builder_with_id(chunk.id(), ENTITY_PATH.into())
+ .with_sparse_component_batches(
+ row_id2,
+ timepoint.clone(),
+ [
+ (MyPoint::name(), None),
+ (MyColor::name(), Some(colors2 as _)),
+ (MyLabel::name(), Some(labels2 as _)),
+ ],
+ )
+ .build_with_datatypes(&datatypes())?;
+ query_and_compare((MyLabel::name(), &query), &chunk, &expected);
+ }
+
+ Ok(())
+}
+
+// ---
+
+fn query_and_compare(
+ (component_name, query): (ComponentName, &RangeQuery),
+ chunk: &Chunk,
+ expected: &Chunk,
+) {
+ re_log::setup_logging();
+
+ let results = chunk.range(query, component_name);
+
+ eprintln!("Query: {component_name} @ {query:?}");
+ eprintln!("Data:\n{chunk}");
+ eprintln!("Expected:\n{expected}");
+ eprintln!("Results:\n{results}");
+
+ assert_eq!(
+ *expected,
+ results,
+ "{}",
+ similar_asserts::SimpleDiff::from_str(
+ &format!("{results}"),
+ &format!("{expected}"),
+ // &format!("{results:#?}"),
+ // &format!("{expected:#?}"),
+ "got",
+ "expected",
+ ),
+ );
+}
diff --git a/crates/re_data_store/Cargo.toml b/crates/re_chunk_store/Cargo.toml
similarity index 81%
rename from crates/re_data_store/Cargo.toml
rename to crates/re_chunk_store/Cargo.toml
index ef5937d27e79..43fb1a162d75 100644
--- a/crates/re_data_store/Cargo.toml
+++ b/crates/re_chunk_store/Cargo.toml
@@ -1,7 +1,7 @@
[package]
-name = "re_data_store"
+name = "re_chunk_store"
authors.workspace = true
-description = "An in-memory time series database for Rerun log data, based on Apache Arrow"
+description = "A storage engine for Rerun's Chunks"
edition.workspace = true
homepage.workspace = true
include.workspace = true
@@ -28,6 +28,7 @@ deadlock_detection = ["parking_lot/deadlock_detection"]
[dependencies]
# Rerun dependencies:
+re_chunk.workspace = true
re_format.workspace = true
re_format_arrow.workspace = true
re_log = { workspace = true, features = ["setup"] }
@@ -60,23 +61,3 @@ mimalloc.workspace = true
rand = { workspace = true, features = ["std", "std_rng"] }
similar-asserts.workspace = true
tinyvec.workspace = true
-
-[lib]
-bench = false
-
-
-[[bench]]
-name = "arrow2"
-harness = false
-
-[[bench]]
-name = "data_store"
-harness = false
-
-[[bench]]
-name = "gc"
-harness = false
-
-[[bench]]
-name = "vectors"
-harness = false
diff --git a/crates/re_chunk_store/README.md b/crates/re_chunk_store/README.md
new file mode 100644
index 000000000000..35800660488a
--- /dev/null
+++ b/crates/re_chunk_store/README.md
@@ -0,0 +1,12 @@
+# Rerun chunk store
+
+Part of the [`rerun`](https://github.com/rerun-io/rerun) family of crates.
+
+[![Latest version](https://img.shields.io/crates/v/re_chunk_store.svg)](https://crates.io/crates/re_chunk_store?speculative-link)
+[![Documentation](https://docs.rs/re_chunk_store/badge.svg)](https://docs.rs/re_chunk_store?speculative-link)
+![MIT](https://img.shields.io/badge/license-MIT-blue.svg)
+![Apache](https://img.shields.io/badge/license-Apache-blue.svg)
+
+[Apache Arrow](https://arrow.apache.org/) is a language-independent columnar memory format for arbitrary data.
+
+The `re_chunk_store` crate is an in-memory time series database for Rerun log data. It is indexed by entity path, component, timeline, and time. It supports out-of-order insertions and fast `O(log(N))` queries.
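+
+A minimal usage sketch (illustrative only; it assumes the sibling `re_chunk` and `re_log_types` crates, plus `anyhow` for error plumbing):
+
+```rust
+use std::sync::Arc;
+
+use re_chunk::Chunk;
+use re_chunk_store::ChunkStore;
+
+fn example() -> anyhow::Result<()> {
+    let mut store = ChunkStore::new(
+        re_log_types::StoreId::random(re_log_types::StoreKind::Recording),
+        Default::default(),
+    );
+
+    // Chunks are normally filled with component batches via `Chunk::builder(..)`;
+    // inserting one returns the corresponding store event, if any.
+    let chunk = Chunk::builder("my/entity".into()).build()?;
+    let _event = store.insert_chunk(&Arc::new(chunk))?;
+
+    Ok(())
+}
+```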
diff --git a/crates/re_data_store/src/store_event.rs b/crates/re_chunk_store/src/events.rs
similarity index 59%
rename from crates/re_data_store/src/store_event.rs
rename to crates/re_chunk_store/src/events.rs
index 5373d4a9d375..c021444f1569 100644
--- a/crates/re_data_store/src/store_event.rs
+++ b/crates/re_chunk_store/src/events.rs
@@ -1,52 +1,52 @@
-use nohash_hasher::IntMap;
+use std::sync::Arc;
-use re_log_types::{DataCell, EntityPath, RowId, StoreId, TimeInt, TimePoint, Timeline};
-use re_types_core::ComponentName;
+use re_chunk::Chunk;
+use re_log_types::StoreId;
-use crate::StoreGeneration;
+use crate::ChunkStoreGeneration;
// Used all over in docstrings.
#[allow(unused_imports)]
-use crate::{DataStore, StoreSubscriber};
+use crate::{ChunkId, ChunkStore, ChunkStoreSubscriber, RowId};
// ---
-/// The atomic unit of change in the Rerun [`DataStore`].
+/// The atomic unit of change in the Rerun [`ChunkStore`].
///
-/// A [`StoreEvent`] describes the changes caused by the addition or deletion of a
-/// [`re_log_types::DataRow`] in the store.
+/// A [`ChunkStoreEvent`] describes the changes caused by the addition or deletion of a
+/// [`Chunk`] in the store.
///
-/// Methods that mutate the [`DataStore`], such as [`DataStore::insert_row`] and [`DataStore::gc`],
-/// return [`StoreEvent`]s that describe the changes.
-/// You can also register your own [`StoreSubscriber`] in order to be notified of changes as soon as they
+/// Methods that mutate the [`ChunkStore`], such as [`ChunkStore::insert_chunk`] and [`ChunkStore::gc`],
+/// return [`ChunkStoreEvent`]s that describe the changes.
+/// You can also register your own [`ChunkStoreSubscriber`] in order to be notified of changes as soon as they
/// happen.
///
-/// Refer to field-level documentation for more details and check out [`StoreDiff`] for a precise
+/// Refer to field-level documentation for more details and check out [`ChunkStoreDiff`] for a precise
/// definition of what an event involves.
-#[derive(Debug, Clone, PartialEq)]
-pub struct StoreEvent {
- /// Which [`DataStore`] sent this event?
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ChunkStoreEvent {
+ /// Which [`ChunkStore`] sent this event?
pub store_id: StoreId,
/// What was the store's generation when it sent that event?
- pub store_generation: StoreGeneration,
+ pub store_generation: ChunkStoreGeneration,
/// Monotonically increasing ID of the event.
///
/// This is on a per-store basis.
///
- /// When handling a [`StoreEvent`], if this is the first time you process this [`StoreId`] and
+ /// When handling a [`ChunkStoreEvent`], if this is the first time you process this [`StoreId`] and
/// the associated `event_id` is not `1`, it means you registered late and missed some updates.
pub event_id: u64,
/// What actually changed?
///
- /// Refer to [`StoreDiff`] for more information.
- pub diff: StoreDiff,
+ /// Refer to [`ChunkStoreDiff`] for more information.
+ pub diff: ChunkStoreDiff,
}
-impl std::ops::Deref for StoreEvent {
- type Target = StoreDiff;
+impl std::ops::Deref for ChunkStoreEvent {
+ type Target = ChunkStoreDiff;
#[inline]
fn deref(&self) -> &Self::Target {
@@ -58,19 +58,19 @@ impl std::ops::Deref for StoreEvent {
///
/// Reminder: ⚠ Do not confuse _a deletion_ and _a clear_ ⚠.
///
-/// A deletion is the result of a row being completely removed from the store as part of the
+/// A deletion is the result of a chunk being completely removed from the store as part of the
/// garbage collection process.
///
/// A clear, on the other hand, is the act of logging an empty [`re_types_core::ComponentBatch`],
/// either directly using the logging APIs, or indirectly through the use of a
/// [`re_types_core::archetypes::Clear`] archetype.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum StoreDiffKind {
+pub enum ChunkStoreDiffKind {
Addition,
Deletion,
}
-impl StoreDiffKind {
+impl ChunkStoreDiffKind {
#[inline]
pub fn delta(&self) -> i64 {
match self {
@@ -80,118 +80,69 @@ impl StoreDiffKind {
}
}
-/// Describes an atomic change in the Rerun [`DataStore`]: a row has been added or deleted.
+/// Describes an atomic change in the Rerun [`ChunkStore`]: a chunk has been added or deleted.
///
-/// From a query model standpoint, the [`DataStore`] _always_ operates one row at a time:
-/// - The contents of a row (i.e. its columns) are immutable past insertion, by virtue of
-/// [`RowId`]s being unique and non-reusable.
-/// - Similarly, garbage collection always removes _all the data_ associated with a row in one go:
-/// there cannot be orphaned columns. When a row is gone, all data associated with it is gone too.
+/// From a query model standpoint, the [`ChunkStore`] _always_ operates one chunk at a time:
+/// - The contents of a chunk (i.e. its columns) are immutable past insertion, by virtue of
+/// [`ChunkId`]s being unique and non-reusable.
+/// - Similarly, garbage collection always removes _all the data_ associated with a chunk in one go:
+/// there cannot be orphaned columns. When a chunk is gone, all data associated with it is gone too.
///
/// Refer to field-level documentation for more information.
-#[derive(Debug, Clone, PartialEq)]
-pub struct StoreDiff {
+#[derive(Debug, Clone)]
+pub struct ChunkStoreDiff {
/// Addition or deletion?
///
/// The store's internals are opaque and don't necessarily reflect the query model (e.g. there
/// might be data in the store that cannot be reached by any query).
///
- /// A [`StoreDiff`] answers a logical question: "does there exist a query path which can return
- /// data from that row?".
+ /// A [`ChunkStoreDiff`] answers a logical question: "does there exist a query path which can return
+ /// data from that chunk?".
///
- /// An event of kind deletion only tells you that, from this point on, no query can return data from that row.
+ /// An event of kind deletion only tells you that, from this point on, no query can return data from that chunk.
/// That doesn't necessarily mean that the data is actually gone, i.e. don't make assumptions of e.g. the size
/// in bytes of the store based on these events.
/// They are in "query-model space" and are not an accurate representation of what happens in storage space.
- pub kind: StoreDiffKind,
+ pub kind: ChunkStoreDiffKind,
- /// What's the row's [`RowId`]?
- ///
- /// [`RowId`]s are guaranteed to be unique within a single [`DataStore`].
- ///
- /// Put another way, the same [`RowId`] can only appear twice in a [`StoreDiff`] event:
- /// one addition and (optionally) one deletion (in that order!).
- pub row_id: RowId,
-
- /// The time data associated with that row.
- ///
- /// Since insertions and deletions both work on a row-level basis, this is guaranteed to be the
- /// same value for both the insertion and deletion events (if any).
- ///
- /// This is not a [`TimePoint`] for performance reasons.
+ /// The chunk that was added or removed.
//
- // NOTE: Empirical testing shows that a SmallVec isn't any better in the best case, and can be a
- // significant performant drop at worst.
- // pub times: SmallVec<[(Timeline, TimeInt); 5]>, // "5 timelines ought to be enough for anyone"
- pub times: Vec<(Timeline, TimeInt)>,
-
- /// The [`EntityPath`] associated with that row.
- ///
- /// Since insertions and deletions both work on a row-level basis, this is guaranteed to be the
- /// same value for both the insertion and deletion events (if any).
- pub entity_path: EntityPath,
-
- /// All the [`DataCell`]s associated with that row.
- ///
- /// Since insertions and deletions both work on a row-level basis, this is guaranteed to be the
- /// same set of values for both the insertion and deletion events (if any).
- pub cells: IntMap<ComponentName, DataCell>,
+ // NOTE: We purposefully use an `Arc` instead of a `ChunkId` here because we want to make sure that all
+ // downstream subscribers get a chance to inspect the data in the chunk before it gets permanently
+ // deallocated.
+ pub chunk: Arc<Chunk>,
}
-impl StoreDiff {
+impl PartialEq for ChunkStoreDiff {
#[inline]
- pub fn addition(row_id: impl Into<RowId>, entity_path: impl Into<EntityPath>) -> Self {
- Self {
- kind: StoreDiffKind::Addition,
- row_id: row_id.into(),
- times: Default::default(),
- entity_path: entity_path.into(),
- cells: Default::default(),
- }
+ fn eq(&self, rhs: &Self) -> bool {
+ let Self { kind, chunk } = self;
+ *kind == rhs.kind && chunk.id() == rhs.chunk.id()
}
+}
+impl Eq for ChunkStoreDiff {}
+
+impl ChunkStoreDiff {
#[inline]
- pub fn deletion(row_id: impl Into<RowId>, entity_path: impl Into<EntityPath>) -> Self {
+ pub fn addition(chunk: Arc<Chunk>) -> Self {
Self {
- kind: StoreDiffKind::Deletion,
- row_id: row_id.into(),
- times: Default::default(),
- entity_path: entity_path.into(),
- cells: Default::default(),
+ kind: ChunkStoreDiffKind::Addition,
+ chunk,
}
}
#[inline]
- pub fn at_timepoint(&mut self, timepoint: impl Into<TimePoint>) -> &mut Self {
- self.times.extend(timepoint.into());
- self
- }
-
- #[inline]
- pub fn at_timestamp(
- &mut self,
- timeline: impl Into<Timeline>,
- time: impl Into<TimeInt>,
- ) -> &mut Self {
- self.times.push((timeline.into(), time.into()));
- self
- }
-
- #[inline]
- pub fn with_cells(&mut self, cells: impl IntoIterator<Item = DataCell>) -> &mut Self {
- self.cells
- .extend(cells.into_iter().map(|cell| (cell.component_name(), cell)));
- self
- }
-
- #[inline]
- pub fn timepoint(&self) -> TimePoint {
- self.times.clone().into_iter().collect()
+ pub fn deletion(chunk: Arc<Chunk>) -> Self {
+ Self {
+ kind: ChunkStoreDiffKind::Deletion,
+ chunk,
+ }
}
#[inline]
pub fn is_static(&self) -> bool {
- self.times.is_empty()
+ self.chunk.is_static()
}
/// `-1` for deletions, `+1` for additions.
@@ -202,21 +153,24 @@ impl StoreDiff {
#[inline]
pub fn num_components(&self) -> usize {
- self.cells.len()
+ self.chunk.num_components()
}
}
+// ---
+
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
+ use re_chunk::RowId;
use re_log_types::{
example_components::{MyColor, MyIndex, MyPoint},
- DataRow, RowId, TimePoint, Timeline,
+ EntityPath, TimeInt, TimePoint, Timeline,
};
- use re_types_core::Loggable as _;
+ use re_types_core::{ComponentName, Loggable as _};
- use crate::{DataStore, GarbageCollectionOptions};
+ use crate::{ChunkStore, GarbageCollectionOptions};
use super::*;
@@ -255,26 +209,33 @@ mod tests {
}
impl GlobalCounts {
- fn on_events(&mut self, events: &[StoreEvent]) {
+ fn on_events(&mut self, events: &[ChunkStoreEvent]) {
for event in events {
- let delta = event.delta();
+ let delta_chunks = event.delta();
+ let delta_rows = delta_chunks * event.chunk.num_rows() as i64;
- *self.row_ids.entry(event.row_id).or_default() += delta;
+ for row_id in event.chunk.row_ids() {
+ *self.row_ids.entry(row_id).or_default() += delta_chunks;
+ }
*self
.entity_paths
- .entry(event.entity_path.clone())
- .or_default() += delta;
+ .entry(event.chunk.entity_path().clone())
+ .or_default() += delta_chunks;
- for component_name in event.cells.keys() {
+ for (component_name, list_array) in event.chunk.components() {
+ let delta =
+ event.delta() * list_array.iter().filter(Option::is_some).count() as i64;
*self.component_names.entry(*component_name).or_default() += delta;
}
if event.is_static() {
- self.num_static += delta;
+ self.num_static += delta_rows;
} else {
- for &(timeline, time) in &event.times {
- *self.timelines.entry(timeline).or_default() += delta;
- *self.times.entry(time).or_default() += delta;
+ for (&timeline, time_chunk) in event.chunk.timelines() {
+ *self.timelines.entry(timeline).or_default() += delta_rows;
+ for time in time_chunk.times() {
+ *self.times.entry(time).or_default() += delta_chunks;
+ }
}
}
}
@@ -283,7 +244,7 @@ mod tests {
#[test]
fn store_events() -> anyhow::Result<()> {
- let mut store = DataStore::new(
+ let mut store = ChunkStore::new(
re_log_types::StoreId::random(re_log_types::StoreKind::Recording),
Default::default(),
);
@@ -301,14 +262,11 @@ mod tests {
(timeline_yet_another, 1), //
]);
let entity_path1: EntityPath = "entity_a".into();
- let row1 = DataRow::from_component_batches(
- row_id1,
- timepoint1.clone(),
- entity_path1.clone(),
- [&MyIndex::from_iter(0..10) as _],
- )?;
+ let chunk1 = Chunk::builder(entity_path1.clone())
+ .with_component_batch(row_id1, timepoint1.clone(), &MyIndex::from_iter(0..10))
+ .build()?;
- view.on_events(&[store.insert_row(&row1)?]);
+ view.on_events(&[store.insert_chunk(&Arc::new(chunk1))?.unwrap()]);
similar_asserts::assert_eq!(
GlobalCounts::new(
@@ -342,21 +300,18 @@ mod tests {
(timeline_yet_another, 1), //
]);
let entity_path2: EntityPath = "entity_b".into();
- let row2 = {
+ let chunk2 = {
let num_instances = 3;
let points: Vec<_> = (0..num_instances)
.map(|i| MyPoint::new(0.0, i as f32))
.collect();
let colors = vec![MyColor::from(0xFF0000FF)];
- DataRow::from_component_batches(
- row_id2,
- timepoint2.clone(),
- entity_path2.clone(),
- [&points as _, &colors as _],
- )?
+ Chunk::builder(entity_path2.clone())
+ .with_component_batches(row_id2, timepoint2.clone(), [&points as _, &colors as _])
+ .build()?
};
- view.on_events(&[store.insert_row(&row2)?]);
+ view.on_events(&[store.insert_chunk(&Arc::new(chunk2))?.unwrap()]);
similar_asserts::assert_eq!(
GlobalCounts::new(
@@ -390,21 +345,22 @@ mod tests {
let row_id3 = RowId::new();
let timepoint3 = TimePoint::default();
- let row3 = {
+ let chunk3 = {
let num_instances = 6;
let colors = vec![MyColor::from(0x00DD00FF); num_instances];
- DataRow::from_component_batches(
- row_id3,
- timepoint3.clone(),
- entity_path2.clone(),
- [
- &MyIndex::from_iter(0..num_instances as _) as _,
- &colors as _,
- ],
- )?
+ Chunk::builder(entity_path2.clone())
+ .with_component_batches(
+ row_id3,
+ timepoint3.clone(),
+ [
+ &MyIndex::from_iter(0..num_instances as _) as _,
+ &colors as _,
+ ],
+ )
+ .build()?
};
- view.on_events(&[store.insert_row(&row3)?]);
+ view.on_events(&[store.insert_chunk(&Arc::new(chunk3))?.unwrap()]);
similar_asserts::assert_eq!(
GlobalCounts::new(
diff --git a/crates/re_chunk_store/src/gc.rs b/crates/re_chunk_store/src/gc.rs
new file mode 100644
index 000000000000..0a54b1448aa1
--- /dev/null
+++ b/crates/re_chunk_store/src/gc.rs
@@ -0,0 +1,534 @@
+use std::{
+ collections::{btree_map::Entry as BTreeMapEntry, BTreeSet},
+ time::Duration,
+};
+
+use ahash::{HashMap, HashSet};
+use nohash_hasher::{IntMap, IntSet};
+use re_chunk::{Chunk, ChunkId};
+use web_time::Instant;
+
+use re_log_types::{EntityPath, TimeInt, Timeline};
+use re_types_core::{ComponentName, SizeBytes};
+
+use crate::{
+ store::ChunkIdSetPerTime, ChunkStore, ChunkStoreChunkStats, ChunkStoreDiff, ChunkStoreDiffKind,
+ ChunkStoreEvent, ChunkStoreStats,
+};
+
+// Used all over in docstrings.
+#[allow(unused_imports)]
+use crate::RowId;
+
+// ---
+
+#[derive(Debug, Clone, Copy)]
+pub enum GarbageCollectionTarget {
+ /// Try to drop _at least_ the given fraction.
+ ///
+ /// The fraction must be a float in the range [0.0 : 1.0].
+ DropAtLeastFraction(f64),
+
+ /// GC Everything that isn't protected.
+ Everything,
+}
+
+#[derive(Debug, Clone)]
+pub struct GarbageCollectionOptions {
+ /// What target threshold should the GC try to meet.
+ pub target: GarbageCollectionTarget,
+
+ /// How long the garbage collection is allowed to run for.
+ ///
+ /// Trades off latency for throughput:
+ /// - A smaller `time_budget` will clear less data in a shorter amount of time, allowing for a
+ /// more responsive UI at the cost of more GC overhead and more frequent runs.
+ /// - A larger `time_budget` will clear more data in a longer amount of time, increasing the
+ /// chance of UI freeze frames but decreasing GC overhead and running less often.
+ ///
+ /// The default is an unbounded time budget (i.e. throughput only).
+ pub time_budget: Duration,
+
+ /// How many component revisions to preserve on each timeline.
+ pub protect_latest: usize,
+
+ /// Components which should not be protected from GC when using
+ /// [`GarbageCollectionOptions::protect_latest`].
+ //
+ // TODO(#6552): this should be removed in favor of a dedicated `remove_entity_path` API.
+ pub dont_protect_components: IntSet<ComponentName>,
+
+ /// Timelines which should not be protected from GC when using
+ /// [`GarbageCollectionOptions::protect_latest`].
+ //
+ // TODO(#6552): this should be removed in favor of a dedicated `remove_entity_path` API.
+ pub dont_protect_timelines: IntSet<Timeline>,
+}
+
+impl GarbageCollectionOptions {
+ pub fn gc_everything() -> Self {
+ Self {
+ target: GarbageCollectionTarget::Everything,
+ time_budget: std::time::Duration::MAX,
+ protect_latest: 0,
+ dont_protect_components: Default::default(),
+ dont_protect_timelines: Default::default(),
+ }
+ }
+}
+
+impl std::fmt::Display for GarbageCollectionTarget {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Self::DropAtLeastFraction(p) => {
+ write!(f, "DropAtLeast({:.3}%)", *p * 100.0)
+ }
+ Self::Everything => write!(f, "Everything"),
+ }
+ }
+}
+
+impl ChunkStore {
+ /// Triggers a garbage collection according to the desired `target`.
+ ///
+ /// Returns the list of `Chunk`s that were purged from the store in the form of [`ChunkStoreEvent`]s.
+ ///
+ /// ## Semantics
+ ///
+ /// Garbage collection works on a chunk-level basis and is driven by [`RowId`] order
+ /// (specifically, the smallest `RowId` of each respective Chunk), i.e. the order defined
+ /// by the clients' wall-clocks, allowing it to drop data across the different timelines in
+ /// a fair, deterministic manner.
+ /// Similarly, out-of-order data is supported out of the box.
+ ///
+ /// The garbage collector doesn't deallocate data in and of itself: all it does is drop the
+ /// store's internal references to that data (the `Chunk`s), which will be deallocated once
+ /// their reference count reaches 0.
+ ///
+ /// ## Limitations
+ ///
+ /// The garbage collector has limited support for latest-at semantics. The configuration option
+ /// [`GarbageCollectionOptions::protect_latest`] will protect the N latest values of each
+ /// component on each timeline. The only practical guarantee this gives is that a latest-at query
+ /// with a value of max-int will be unchanged. However, latest-at queries from other arbitrary
+ /// points in time may provide different results pre- and post- GC.
+ pub fn gc(
+ &mut self,
+ options: &GarbageCollectionOptions,
+ ) -> (Vec<ChunkStoreEvent>, ChunkStoreStats) {
+ re_tracing::profile_function!();
+
+ self.gc_id += 1;
+
+ let stats_before = self.stats();
+
+ let total_size_bytes_before = stats_before.total().total_size_bytes as f64;
+ let total_num_chunks_before = stats_before.total().num_chunks;
+ let total_num_rows_before = stats_before.total().total_num_rows;
+
+ let protected_chunk_ids = self.find_all_protected_chunk_ids(
+ options.protect_latest,
+ &options.dont_protect_components,
+ &options.dont_protect_timelines,
+ );
+
+ let diffs = match options.target {
+ GarbageCollectionTarget::DropAtLeastFraction(p) => {
+ assert!((0.0..=1.0).contains(&p));
+
+ let num_bytes_to_drop = total_size_bytes_before * p;
+ let target_size_bytes = total_size_bytes_before - num_bytes_to_drop;
+
+ re_log::trace!(
+ kind = "gc",
+ id = self.gc_id,
+ %options.target,
+ total_num_chunks_before = re_format::format_uint(total_num_chunks_before),
+ total_num_rows_before = re_format::format_uint(total_num_rows_before),
+ total_size_bytes_before = re_format::format_bytes(total_size_bytes_before),
+ target_size_bytes = re_format::format_bytes(target_size_bytes),
+ drop_at_least_num_bytes = re_format::format_bytes(num_bytes_to_drop),
+ "starting GC"
+ );
+
+ self.gc_drop_at_least_num_bytes(options, num_bytes_to_drop, &protected_chunk_ids)
+ }
+ GarbageCollectionTarget::Everything => {
+ re_log::trace!(
+ kind = "gc",
+ id = self.gc_id,
+ %options.target,
+ total_num_rows_before = re_format::format_uint(total_num_rows_before),
+ total_size_bytes_before = re_format::format_bytes(total_size_bytes_before),
+ "starting GC"
+ );
+
+ self.gc_drop_at_least_num_bytes(options, f64::INFINITY, &protected_chunk_ids)
+ }
+ };
+
+ let stats_after = self.stats();
+ let total_size_bytes_after = stats_after.total().total_size_bytes as f64;
+ let total_num_chunks_after = stats_after.total().num_chunks;
+ let total_num_rows_after = stats_after.total().total_num_rows;
+
+ re_log::trace!(
+ kind = "gc",
+ id = self.gc_id,
+ %options.target,
+ total_num_chunks_before = re_format::format_uint(total_num_chunks_before),
+ total_num_rows_before = re_format::format_uint(total_num_rows_before),
+ total_size_bytes_before = re_format::format_bytes(total_size_bytes_before),
+ total_num_chunks_after = re_format::format_uint(total_num_chunks_after),
+ total_num_rows_after = re_format::format_uint(total_num_rows_after),
+ total_size_bytes_after = re_format::format_bytes(total_size_bytes_after),
+ "GC done"
+ );
+
+ let events: Vec<_> = diffs
+ .into_iter()
+ .map(|diff| ChunkStoreEvent {
+ store_id: self.id.clone(),
+ store_generation: self.generation(),
+ event_id: self
+ .event_id
+ .fetch_add(1, std::sync::atomic::Ordering::Relaxed),
+ diff,
+ })
+ .collect();
+
+ {
+ if cfg!(debug_assertions) {
+ let any_event_other_than_deletion = events
+ .iter()
+ .any(|e| e.kind != ChunkStoreDiffKind::Deletion);
+ assert!(!any_event_other_than_deletion);
+ }
+
+ Self::on_events(&events);
+ }
+
+ (events, stats_before - stats_after)
+ }
+
+ /// For each `EntityPath`, `Timeline`, `Component` find the N latest [`ChunkId`]s.
+ //
+ // TODO(jleibs): More complex functionality might require expanding this to also
+ // *ignore* specific entities, components, timelines, etc. for this protection.
+ fn find_all_protected_chunk_ids(
+ &self,
+ target_count: usize,
+ dont_protect_components: &IntSet<ComponentName>,
+ dont_protect_timelines: &IntSet<Timeline>,
+ ) -> BTreeSet<ChunkId> {
+ re_tracing::profile_function!();
+
+ if target_count == 0 {
+ return Default::default();
+ }
+
+ self.temporal_chunk_ids_per_entity
+ .values()
+ .flat_map(|temporal_chunk_ids_per_timeline| {
+ temporal_chunk_ids_per_timeline
+ .iter()
+ .filter_map(|(timeline, temporal_chunk_ids_per_component)| {
+ (!dont_protect_timelines.contains(timeline))
+ .then_some(temporal_chunk_ids_per_component)
+ })
+ .flat_map(|temporal_chunk_ids_per_component| {
+ temporal_chunk_ids_per_component
+ .iter()
+ .filter(|(component_name, _)| {
+ !dont_protect_components.contains(component_name)
+ })
+ .flat_map(|(_, temporal_chunk_ids_per_time)| {
+ temporal_chunk_ids_per_time
+ .per_start_time
+ .last_key_value()
+ .map(|(_, chunk_ids)| chunk_ids.iter().copied())
+ .into_iter()
+ .flatten()
+ .chain(
+ temporal_chunk_ids_per_time
+ .per_end_time
+ .last_key_value()
+ .map(|(_, chunk_ids)| chunk_ids.iter().copied())
+ .into_iter()
+ .flatten(),
+ )
+ .collect::<BTreeSet<_>>()
+ .into_iter()
+ .rev()
+ .take(target_count)
+ })
+ })
+ })
+ .collect()
+ }
+
+ fn gc_drop_at_least_num_bytes(
+ &mut self,
+ options: &GarbageCollectionOptions,
+ mut num_bytes_to_drop: f64,
+ protected_chunk_ids: &BTreeSet<ChunkId>,
+ ) -> Vec<ChunkStoreDiff> {
+ re_tracing::profile_function!(re_format::format_bytes(num_bytes_to_drop));
+
+ type RemovableChunkIdPerTimePerComponentPerTimelinePerEntity = IntMap<
+ EntityPath,
+ IntMap<Timeline, IntMap<ComponentName, HashMap<TimeInt, Vec<ChunkId>>>>,
+ >;
+
+ let mut chunk_ids_to_be_removed =
+ RemovableChunkIdPerTimePerComponentPerTimelinePerEntity::default();
+ let mut chunk_ids_dangling = HashSet::default();
+
+ let start_time = Instant::now();
+
+ {
+ re_tracing::profile_scope!("mark");
+
+ for chunk_id in self
+ .chunk_ids_per_min_row_id
+ .values()
+ .flatten()
+ .filter(|chunk_id| !protected_chunk_ids.contains(chunk_id))
+ {
+ if let Some(chunk) = self.chunks_per_chunk_id.get(chunk_id) {
+ // NOTE: Do _NOT_ use `chunk.total_size_bytes` as it is sitting behind an Arc
+ // and would count as amortized (i.e. 0 bytes).
+ num_bytes_to_drop -= <Chunk as SizeBytes>::total_size_bytes(chunk) as f64;
+
+ // NOTE: We cannot blindly `retain` across all temporal tables, it's way too costly
+ // and slow. Rather we need to surgically remove the superfluous chunks.
+ let entity_path = chunk.entity_path();
+ let per_timeline = chunk_ids_to_be_removed
+ .entry(entity_path.clone())
+ .or_default();
+ for (&timeline, time_chunk) in chunk.timelines() {
+ let per_component = per_timeline.entry(timeline).or_default();
+ for component_name in chunk.component_names() {
+ let per_time = per_component.entry(component_name).or_default();
+
+ // NOTE: As usual, these are vectors of `ChunkId`s, as it is legal to
+ // have perfectly overlapping chunks.
+ let time_range = time_chunk.time_range();
+ per_time
+ .entry(time_range.min())
+ .or_default()
+ .push(chunk.id());
+ if time_range.min() != time_range.max() {
+ per_time
+ .entry(time_range.max())
+ .or_default()
+ .push(chunk.id());
+ }
+ }
+ }
+ } else {
+ chunk_ids_dangling.insert(*chunk_id);
+ }
+
+ // NOTE: There is no point in spending more than a fourth of the time budget on the
+ // mark phase, otherwise the sweep phase won't have any time left to do anything
+ // with the results anyhow.
+ if start_time.elapsed() >= options.time_budget / 4 || num_bytes_to_drop <= 0.0 {
+ break;
+ }
+ }
+ }
+
+ {
+ re_tracing::profile_scope!("sweep");
+
+ let Self {
+ id: _,
+ config: _,
+ type_registry: _,
+ chunks_per_chunk_id,
+ chunk_ids_per_min_row_id: chunk_id_per_min_row_id,
+ temporal_chunk_ids_per_entity,
+ temporal_chunks_stats,
+ static_chunk_ids_per_entity: _, // we don't GC static data
+ static_chunks_stats: _, // we don't GC static data
+ insert_id: _,
+ query_id: _,
+ gc_id: _,
+ event_id: _,
+ } = self;
+
+ let mut diffs = Vec::new();
+
+ // NOTE: Dangling chunks should never happen: it is the job of the GC to ensure that.
+ //
+ // In release builds, we still want to do the nice thing and clean them up as best as we
+ // can in order to prevent OOMs.
+ //
+ // We should really never be in there, so don't bother accounting that in the time
+ // budget.
+ debug_assert!(
+ chunk_ids_dangling.is_empty(),
+ "detected dangling chunks -- there's a GC bug"
+ );
+ if !chunk_ids_dangling.is_empty() {
+ re_tracing::profile_scope!("dangling");
+
+ chunk_id_per_min_row_id.retain(|_row_id, chunk_ids| {
+ chunk_ids.retain(|chunk_id| !chunk_ids_dangling.contains(chunk_id));
+ !chunk_ids.is_empty()
+ });
+
+ for temporal_chunk_ids_per_component in temporal_chunk_ids_per_entity.values_mut() {
+ for temporal_chunk_ids_per_timeline in
+ temporal_chunk_ids_per_component.values_mut()
+ {
+ for temporal_chunk_ids_per_time in
+ temporal_chunk_ids_per_timeline.values_mut()
+ {
+ let ChunkIdSetPerTime {
+ max_interval_length: _,
+ per_start_time,
+ per_end_time,
+ } = temporal_chunk_ids_per_time;
+
+ // TODO(cmc): Technically, the optimal thing to do would be to
+ // recompute `max_interval_length` per time here.
+ // In practice, this adds a lot of complexity for likely very little
+ // performance benefit, since we expect the chunks to have similar
+ // interval lengths on the happy path.
+
+ for chunk_ids in per_start_time.values_mut() {
+ chunk_ids.retain(|chunk_id| !chunk_ids_dangling.contains(chunk_id));
+ }
+ for chunk_ids in per_end_time.values_mut() {
+ chunk_ids.retain(|chunk_id| !chunk_ids_dangling.contains(chunk_id));
+ }
+ }
+ }
+ }
+
+ diffs.extend(
+ chunk_ids_dangling
+ .into_iter()
+ .filter_map(|chunk_id| chunks_per_chunk_id.remove(&chunk_id))
+ .map(ChunkStoreDiff::deletion),
+ );
+ }
+
+ if !chunk_ids_to_be_removed.is_empty() {
+ re_tracing::profile_scope!("standard");
+
+ // NOTE: We cannot blindly `retain` across all temporal tables, it's way too costly
+ // and slow. Rather we need to surgically remove the superfluous chunks.
+
+ let mut chunk_ids_removed = HashSet::default();
+
+ for (entity_path, chunk_ids_to_be_removed) in chunk_ids_to_be_removed {
+ let BTreeMapEntry::Occupied(mut temporal_chunk_ids_per_timeline) =
+ temporal_chunk_ids_per_entity.entry(entity_path)
+ else {
+ continue;
+ };
+
+ for (timeline, chunk_ids_to_be_removed) in chunk_ids_to_be_removed {
+ let BTreeMapEntry::Occupied(mut temporal_chunk_ids_per_component) =
+ temporal_chunk_ids_per_timeline.get_mut().entry(timeline)
+ else {
+ continue;
+ };
+
+ for (component_name, chunk_ids_to_be_removed) in chunk_ids_to_be_removed {
+ let BTreeMapEntry::Occupied(mut temporal_chunk_ids_per_time) =
+ temporal_chunk_ids_per_component
+ .get_mut()
+ .entry(component_name)
+ else {
+ continue;
+ };
+
+ let ChunkIdSetPerTime {
+ max_interval_length: _,
+ per_start_time,
+ per_end_time,
+ } = temporal_chunk_ids_per_time.get_mut();
+
+ // TODO(cmc): Technically, the optimal thing to do would be to
+ // recompute `max_interval_length` per time here.
+ // In practice, this adds a lot of complexity for likely very little
+ // performance benefit, since we expect the chunks to have similar
+ // interval lengths on the happy path.
+
+ for (time, chunk_ids) in chunk_ids_to_be_removed {
+ if let BTreeMapEntry::Occupied(mut chunk_id_set) =
+ per_start_time.entry(time)
+ {
+ for chunk_id in &chunk_ids {
+ chunk_id_set.get_mut().remove(chunk_id);
+ }
+ if chunk_id_set.get().is_empty() {
+ chunk_id_set.remove_entry();
+ }
+ }
+
+ if let BTreeMapEntry::Occupied(mut chunk_id_set) =
+ per_end_time.entry(time)
+ {
+ for chunk_id in &chunk_ids {
+ chunk_id_set.get_mut().remove(chunk_id);
+ }
+ if chunk_id_set.get().is_empty() {
+ chunk_id_set.remove_entry();
+ }
+ }
+
+ chunk_ids_removed.extend(chunk_ids);
+ }
+
+ if per_start_time.is_empty() && per_end_time.is_empty() {
+ temporal_chunk_ids_per_time.remove_entry();
+ }
+
+ if start_time.elapsed() >= options.time_budget {
+ break;
+ }
+ }
+
+ if temporal_chunk_ids_per_component.get().is_empty() {
+ temporal_chunk_ids_per_component.remove_entry();
+ }
+
+ if start_time.elapsed() >= options.time_budget {
+ break;
+ }
+ }
+
+ if temporal_chunk_ids_per_timeline.get().is_empty() {
+ temporal_chunk_ids_per_timeline.remove_entry();
+ }
+
+ if start_time.elapsed() >= options.time_budget {
+ break;
+ }
+ }
+
+ chunk_id_per_min_row_id.retain(|_row_id, chunk_ids| {
+ chunk_ids.retain(|chunk_id| !chunk_ids_removed.contains(chunk_id));
+ !chunk_ids.is_empty()
+ });
+
+ diffs.extend(
+ chunk_ids_removed
+ .into_iter()
+ .filter_map(|chunk_id| chunks_per_chunk_id.remove(&chunk_id))
+ .inspect(|chunk| {
+ *temporal_chunks_stats -= ChunkStoreChunkStats::from_chunk(chunk);
+ })
+ .map(ChunkStoreDiff::deletion),
+ );
+ }
+
+ diffs
+ }
+ }
+}
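
For reference, a minimal sketch of how the GC added above is meant to be driven. The option values are illustrative, not part of this patch, and `store` is assumed to be an existing, populated `ChunkStore`:

```rust
use std::time::Duration;

use re_chunk_store::{ChunkStore, GarbageCollectionOptions, GarbageCollectionTarget};

/// Ask the store to free roughly a third of its memory, with a soft latency cap.
fn run_gc(store: &mut ChunkStore) {
    let options = GarbageCollectionOptions {
        // Drop at least 30% of the store, by size.
        target: GarbageCollectionTarget::DropAtLeastFraction(0.3),
        // Keep each run short so the UI stays responsive.
        time_budget: Duration::from_millis(3),
        // Keep the latest value of every component on every timeline.
        protect_latest: 1,
        dont_protect_components: Default::default(),
        dont_protect_timelines: Default::default(),
    };

    let (_events, dropped) = store.gc(&options);
    // `_events` only ever contains deletions; `dropped` is `stats_before - stats_after`.
    println!("GC dropped {} chunks", dropped.total().num_chunks);
}
```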
diff --git a/crates/re_chunk_store/src/lib.rs b/crates/re_chunk_store/src/lib.rs
new file mode 100644
index 000000000000..d7d72afabd04
--- /dev/null
+++ b/crates/re_chunk_store/src/lib.rs
@@ -0,0 +1,49 @@
+//! The Rerun chunk store, implemented on top of [Apache Arrow](https://arrow.apache.org/)
+//! using the [`arrow2`] crate.
+//!
+//! This crate is an in-memory time series database for Rerun log data.
+//! It is indexed by Entity path, component, timeline, and time.
+//! It supports out-of-order insertions, and fast `O(log(N))` queries.
+//!
+//! * See [`ChunkStore`] for an overview of the core data structures.
+//! * See [`ChunkStore::latest_at_relevant_chunks`] and [`ChunkStore::range_relevant_chunks`]
+//! for the documentation of the public read APIs.
+//! * See [`ChunkStore::insert_chunk`] for the documentation of the public write APIs.
+//!
+//! ## Feature flags
+#![doc = document_features::document_features!()]
+//!
+
+mod events;
+mod gc;
+mod query;
+mod stats;
+mod store;
+mod subscribers;
+mod writes;
+
+pub use self::events::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent};
+pub use self::gc::{GarbageCollectionOptions, GarbageCollectionTarget};
+pub use self::stats::{ChunkStoreChunkStats, ChunkStoreStats};
+pub use self::store::{ChunkStore, ChunkStoreConfig, ChunkStoreGeneration};
+pub use self::subscribers::{ChunkStoreSubscriber, ChunkStoreSubscriberHandle};
+
+// Re-exports
+#[doc(no_inline)]
+pub use re_chunk::{Chunk, ChunkId, LatestAtQuery, RangeQuery, RowId};
+#[doc(no_inline)]
+pub use re_log_types::{ResolvedTimeRange, TimeInt, TimeType, Timeline};
+
+pub mod external {
+ pub use re_chunk;
+}
+
+// ---
+
+#[derive(thiserror::Error, Debug)]
+pub enum ChunkStoreError {
+ #[error("Chunks must be sorted before insertion in the chunk store")]
+ UnsortedChunk,
+}
+
+pub type ChunkStoreResult<T> = ::std::result::Result<T, ChunkStoreError>;
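
As a sanity check on the public surface exported here, a small end-to-end sketch. It leans on the `MyPoint` test component from `re_log_types::example_components` and on `anyhow` purely for brevity; the builder calls mirror the tests elsewhere in this patch:

```rust
use std::sync::Arc;

use re_chunk_store::{Chunk, ChunkStore, ChunkStoreConfig, RowId, Timeline};
use re_log_types::{example_components::MyPoint, StoreId, StoreKind, TimePoint};

fn main() -> anyhow::Result<()> {
    let mut store = ChunkStore::new(
        StoreId::random(StoreKind::Recording),
        ChunkStoreConfig::default(),
    );

    // One row of point data logged at frame 42.
    let timeline = Timeline::new_sequence("frame");
    let points = vec![MyPoint::new(1.0, 2.0), MyPoint::new(3.0, 4.0)];
    let chunk = Chunk::builder("points".into())
        .with_component_batches(
            RowId::new(),
            TimePoint::from_iter([(timeline, 42)]),
            [&points as _],
        )
        .build()?;

    // Returns a `ChunkStoreEvent` iff the store was actually modified.
    if let Some(event) = store.insert_chunk(&Arc::new(chunk))? {
        println!("inserted chunk for {}", event.chunk.entity_path());
    }

    println!("{}", store.stats().total());
    Ok(())
}
```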
diff --git a/crates/re_chunk_store/src/query.rs b/crates/re_chunk_store/src/query.rs
new file mode 100644
index 000000000000..3b6cde149f95
--- /dev/null
+++ b/crates/re_chunk_store/src/query.rs
@@ -0,0 +1,270 @@
+use std::{
+ collections::BTreeSet,
+ sync::{atomic::Ordering, Arc},
+};
+
+use re_chunk::{Chunk, LatestAtQuery, RangeQuery};
+use re_log_types::{EntityPath, TimeInt, Timeline};
+use re_types_core::{ComponentName, ComponentNameSet};
+
+use crate::ChunkStore;
+
+// Used all over in docstrings.
+#[allow(unused_imports)]
+use crate::RowId;
+
+// ---
+
+impl ChunkStore {
+ /// Retrieve all the [`ComponentName`]s that have been written to for a given [`EntityPath`] on
+ /// the specified [`Timeline`].
+ ///
+ /// Static components are always included in the results.
+ ///
+ /// Returns `None` if the entity doesn't exist at all on this `timeline`.
+ pub fn all_components(
+ &self,
+ timeline: &Timeline,
+ entity_path: &EntityPath,
+ ) -> Option<ComponentNameSet> {
+ re_tracing::profile_function!();
+
+ self.query_id.fetch_add(1, Ordering::Relaxed);
+
+ let static_components: Option<ComponentNameSet> = self
+ .static_chunk_ids_per_entity
+ .get(entity_path)
+ .map(|static_chunks_per_component| {
+ static_chunks_per_component.keys().copied().collect()
+ });
+
+ let temporal_components: Option<ComponentNameSet> = self
+ .temporal_chunk_ids_per_entity
+ .get(entity_path)
+ .map(|temporal_chunk_ids_per_timeline| {
+ temporal_chunk_ids_per_timeline
+ .iter()
+ .filter(|(cur_timeline, _)| *cur_timeline == timeline)
+ .flat_map(|(_, temporal_chunk_ids_per_component)| {
+ temporal_chunk_ids_per_component.keys().copied()
+ })
+ .collect()
+ });
+
+ match (static_components, temporal_components) {
+ (None, None) => None,
+ (None, comps @ Some(_)) | (comps @ Some(_), None) => comps,
+ (Some(static_comps), Some(temporal_comps)) => {
+ Some(static_comps.into_iter().chain(temporal_comps).collect())
+ }
+ }
+ }
+
+ /// Check whether a given entity has a specific [`ComponentName`] either on the specified
+ /// timeline, or in its static data.
+ #[inline]
+ pub fn entity_has_component(
+ &self,
+ timeline: &Timeline,
+ entity_path: &EntityPath,
+ component_name: &ComponentName,
+ ) -> bool {
+ re_tracing::profile_function!();
+ self.all_components(timeline, entity_path)
+ .map_or(false, |components| components.contains(component_name))
+ }
+
+ /// Find the earliest time at which something was logged for a given entity on the specified
+ /// timeline.
+ ///
+ /// Ignores static data.
+ #[inline]
+ pub fn entity_min_time(
+ &self,
+ timeline: &Timeline,
+ entity_path: &EntityPath,
+ ) -> Option<TimeInt> {
+ let temporal_chunk_ids_per_timeline =
+ self.temporal_chunk_ids_per_entity.get(entity_path)?;
+ let temporal_chunk_ids_per_component = temporal_chunk_ids_per_timeline.get(timeline)?;
+
+ let mut time_min = TimeInt::MAX;
+ for temporal_chunk_ids_per_time in temporal_chunk_ids_per_component.values() {
+ let Some(time) = temporal_chunk_ids_per_time
+ .per_start_time
+ .first_key_value()
+ .map(|(time, _)| *time)
+ else {
+ continue;
+ };
+ time_min = TimeInt::min(time_min, time);
+ }
+
+ (time_min != TimeInt::MAX).then_some(time_min)
+ }
+
+ /// Returns the most-relevant chunk(s) for the given [`LatestAtQuery`].
+ ///
+ /// The [`ChunkStore`] always works at the [`Chunk`] level (as opposed to the row level): it is
+ /// oblivious to the data therein.
+ /// For that reason, and because [`Chunk`]s are allowed to temporally overlap, it is possible
+ /// that a query has more than one relevant chunk.
+ ///
+ /// The caller should filter the returned chunks further (see [`Chunk::latest_at`]) in order to
+ /// determine what exact row contains the final result.
+ ///
+ /// If the entity has static component data associated with it, it will unconditionally
+ /// override any temporal component data.
+ pub fn latest_at_relevant_chunks(
+ &self,
+ query: &LatestAtQuery,
+ entity_path: &EntityPath,
+ component_name: ComponentName,
+ ) -> Vec<Arc<Chunk>> {
+ re_tracing::profile_function!(format!("{query:?}"));
+
+ self.query_id.fetch_add(1, Ordering::Relaxed);
+
+ // Reminder: if a chunk has been indexed for a given component, then it must contain at
+ // least one non-null value for that column.
+
+ if let Some(static_chunk) = self
+ .static_chunk_ids_per_entity
+ .get(entity_path)
+ .and_then(|static_chunks_per_component| {
+ static_chunks_per_component.get(&component_name)
+ })
+ .and_then(|chunk_id| self.chunks_per_chunk_id.get(chunk_id))
+ {
+ return vec![Arc::clone(static_chunk)];
+ }
+
+ if let Some(temporal_chunk_ids) = self
+ .temporal_chunk_ids_per_entity
+ .get(entity_path)
+ .and_then(|temporal_chunk_ids_per_timeline| {
+ temporal_chunk_ids_per_timeline.get(&query.timeline())
+ })
+ .and_then(|temporal_chunk_ids_per_component| {
+ temporal_chunk_ids_per_component.get(&component_name)
+ })
+ .and_then(|temporal_chunk_ids_per_time| {
+ let upper_bound = temporal_chunk_ids_per_time
+ .per_start_time
+ .range(..=query.at())
+ .next_back()
+ .map(|(time, _)| *time)?;
+
+ // Overlapped chunks
+ // =================
+ //
+ // To deal with potentially overlapping chunks, we keep track of the longest
+ // interval in the entire map, which gives us an upper bound on how much we
+ // would need to walk backwards in order to find all potential overlaps.
+ //
+ // This is a fairly simple solution that scales much better than interval-tree
+ // based alternatives, both in terms of complexity and performance, in the normal
+ // case where most chunks in a collection have similar lengths.
+ //
+ // The most degenerate case -- a single chunk overlaps everything else -- results
+ // in `O(n)` performance, which gets amortized by the query cache.
+ // If that turns out to be a problem in practice, we can experiment with more
+ // complex solutions then.
+ let lower_bound = upper_bound
+ .as_i64()
+ .saturating_sub(temporal_chunk_ids_per_time.max_interval_length as _);
+
+ Some(
+ temporal_chunk_ids_per_time
+ .per_start_time
+ .range(..=query.at())
+ .rev()
+ .take_while(|(time, _)| time.as_i64() >= lower_bound)
+ .flat_map(|(_time, chunk_ids)| chunk_ids.iter())
+ .copied()
+ .collect::<BTreeSet<_>>(),
+ )
+ })
+ {
+ return temporal_chunk_ids
+ .iter()
+ .filter_map(|chunk_id| self.chunks_per_chunk_id.get(chunk_id).cloned())
+ .collect();
+ }
+
+ Vec::new()
+ }
+
+ /// Returns the most-relevant chunk(s) for the given [`RangeQuery`].
+ ///
+ /// The criterion for returning a chunk is only that it may contain data that overlaps with
+ /// the queried range.
+ ///
+ /// The caller should filter the returned chunks further (see [`Chunk::range`]) in order to
+ /// determine how exactly each row of data fits with the rest.
+ ///
+ /// If the entity has static component data associated with it, it will unconditionally
+ /// override any temporal component data.
+ pub fn range_relevant_chunks(
+ &self,
+ query: &RangeQuery,
+ entity_path: &EntityPath,
+ component_name: ComponentName,
+ ) -> Vec<Arc<Chunk>> {
+ re_tracing::profile_function!(format!("{query:?}"));
+
+ self.query_id.fetch_add(1, Ordering::Relaxed);
+
+ if let Some(static_chunk) = self
+ .static_chunk_ids_per_entity
+ .get(entity_path)
+ .and_then(|static_chunks_per_component| {
+ static_chunks_per_component.get(&component_name)
+ })
+ .and_then(|chunk_id| self.chunks_per_chunk_id.get(chunk_id))
+ {
+ return vec![Arc::clone(static_chunk)];
+ }
+
+ self.temporal_chunk_ids_per_entity
+ .get(entity_path)
+ .and_then(|temporal_chunk_ids_per_timeline| {
+ temporal_chunk_ids_per_timeline.get(&query.timeline())
+ })
+ .and_then(|temporal_chunk_ids_per_component| {
+ temporal_chunk_ids_per_component.get(&component_name)
+ })
+ .into_iter()
+ .map(|temporal_chunk_ids_per_time| {
+ let start_time = temporal_chunk_ids_per_time
+ .per_start_time
+ .range(..=query.range.min())
+ .next_back()
+ .map_or(TimeInt::MIN, |(&time, _)| time);
+
+ let end_time = temporal_chunk_ids_per_time
+ .per_start_time
+ .range(..=query.range.max())
+ .next_back()
+ .map_or(start_time, |(&time, _)| time);
+
+ // NOTE: Just being extra cautious because, even though this shouldn't ever happen,
+ // indexing a std map with a backwards range is an instant crash.
+ let end_time = TimeInt::max(start_time, end_time);
+
+ (start_time, end_time, temporal_chunk_ids_per_time)
+ })
+ .flat_map(|(start_time, end_time, temporal_chunk_ids_per_time)| {
+ temporal_chunk_ids_per_time
+ .per_start_time
+ .range(start_time..=end_time)
+ .map(|(_time, chunk_ids)| chunk_ids)
+ })
+ .flat_map(|temporal_chunk_ids| {
+ temporal_chunk_ids
+ .iter()
+ .filter_map(|chunk_id| self.chunks_per_chunk_id.get(chunk_id).cloned())
+ })
+ .collect()
+ }
+}
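
To make the intended read path concrete, a sketch of the caller's side of `latest_at_relevant_chunks`. The `LatestAtQuery::new` constructor is assumed from `re_chunk`, and the timeline name is illustrative; the final row selection happens in `Chunk`, not in the store:

```rust
use std::sync::Arc;

use re_chunk::Chunk;
use re_chunk_store::{ChunkStore, LatestAtQuery, TimeInt, Timeline};
use re_log_types::EntityPath;
use re_types_core::ComponentName;

/// Step 1 of a latest-at query: ask the store for every chunk that *might*
/// contain the answer. Chunks may overlap in time, so more than one can come back.
fn latest_candidates(
    store: &ChunkStore,
    entity_path: &EntityPath,
    component_name: ComponentName,
) -> Vec<Arc<Chunk>> {
    let query = LatestAtQuery::new(Timeline::new_sequence("frame"), TimeInt::MAX);
    store.latest_at_relevant_chunks(&query, entity_path, component_name)

    // Step 2 (not shown): inspect each candidate chunk (e.g. `Chunk::latest_at`)
    // to pick the actual winning row.
}
```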
diff --git a/crates/re_chunk_store/src/stats.rs b/crates/re_chunk_store/src/stats.rs
new file mode 100644
index 000000000000..fd713afe2c0e
--- /dev/null
+++ b/crates/re_chunk_store/src/stats.rs
@@ -0,0 +1,166 @@
+use std::sync::Arc;
+
+use re_chunk::Chunk;
+use re_types_core::SizeBytes;
+
+use crate::ChunkStore;
+
+// ---
+
+#[derive(Default, Debug, Clone, Copy)]
+pub struct ChunkStoreStats {
+ pub static_chunks: ChunkStoreChunkStats,
+ pub temporal_chunks: ChunkStoreChunkStats,
+}
+
+impl ChunkStoreStats {
+ #[inline]
+ pub fn total(&self) -> ChunkStoreChunkStats {
+ let Self {
+ static_chunks,
+ temporal_chunks,
+ } = *self;
+ static_chunks + temporal_chunks
+ }
+}
+
+impl std::ops::Add for ChunkStoreStats {
+ type Output = Self;
+
+ #[inline]
+ fn add(self, rhs: Self) -> Self::Output {
+ let Self {
+ static_chunks,
+ temporal_chunks,
+ } = self;
+
+ let static_chunks = static_chunks + rhs.static_chunks;
+ let temporal_chunks = temporal_chunks + rhs.temporal_chunks;
+
+ Self {
+ static_chunks,
+ temporal_chunks,
+ }
+ }
+}
+
+impl std::ops::Sub for ChunkStoreStats {
+ type Output = Self;
+
+ #[inline]
+ fn sub(self, rhs: Self) -> Self::Output {
+ let Self {
+ static_chunks,
+ temporal_chunks,
+ } = self;
+
+ let static_chunks = static_chunks - rhs.static_chunks;
+ let temporal_chunks = temporal_chunks - rhs.temporal_chunks;
+
+ Self {
+ static_chunks,
+ temporal_chunks,
+ }
+ }
+}
+
+impl ChunkStore {
+ #[inline]
+ pub fn stats(&self) -> ChunkStoreStats {
+ ChunkStoreStats {
+ static_chunks: self.static_chunks_stats,
+ temporal_chunks: self.temporal_chunks_stats,
+ }
+ }
+}
+
+// ---
+
+#[derive(Default, Debug, Clone, Copy)]
+pub struct ChunkStoreChunkStats {
+ pub num_chunks: u64,
+ pub total_size_bytes: u64,
+ pub total_num_rows: u64,
+}
+
+impl std::fmt::Display for ChunkStoreChunkStats {
+ #[inline]
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ let Self {
+ num_chunks,
+ total_size_bytes,
+ total_num_rows,
+ } = *self;
+
+ f.write_fmt(format_args!(
+ "num_chunks: {}\n",
+ re_format::format_uint(num_chunks)
+ ))?;
+ f.write_fmt(format_args!(
+ "total_size_bytes: {}\n",
+ re_format::format_bytes(total_size_bytes as _)
+ ))?;
+ f.write_fmt(format_args!(
+ "total_num_rows: {}\n",
+ re_format::format_uint(total_num_rows)
+ ))?;
+
+ Ok(())
+ }
+}
+
+impl std::ops::Add for ChunkStoreChunkStats {
+ type Output = Self;
+
+ #[inline]
+ fn add(self, rhs: Self) -> Self::Output {
+ Self {
+ num_chunks: self.num_chunks + rhs.num_chunks,
+ total_size_bytes: self.total_size_bytes + rhs.total_size_bytes,
+ total_num_rows: self.total_num_rows + rhs.total_num_rows,
+ }
+ }
+}
+
+impl std::ops::AddAssign for ChunkStoreChunkStats {
+ #[inline]
+ fn add_assign(&mut self, rhs: Self) {
+ *self = *self + rhs;
+ }
+}
+
+impl std::ops::Sub for ChunkStoreChunkStats {
+ type Output = Self;
+
+ #[inline]
+ fn sub(self, rhs: Self) -> Self::Output {
+ Self {
+ num_chunks: self.num_chunks - rhs.num_chunks,
+ total_size_bytes: self.total_size_bytes - rhs.total_size_bytes,
+ total_num_rows: self.total_num_rows - rhs.total_num_rows,
+ }
+ }
+}
+
+impl std::ops::SubAssign for ChunkStoreChunkStats {
+ #[inline]
+ fn sub_assign(&mut self, rhs: Self) {
+ *self = *self - rhs;
+ }
+}
+
+impl ChunkStoreChunkStats {
+ #[inline]
+ pub fn from_chunk(chunk: &Arc<Chunk>) -> Self {
+ // NOTE: Do _NOT_ use `chunk.total_size_bytes` as it is sitting behind an Arc
+ // and would count as amortized (i.e. 0 bytes).
+ let size_bytes = <Chunk as SizeBytes>::total_size_bytes(&**chunk);
+ let num_rows = chunk.num_rows() as u64;
+
+ Self {
+ num_chunks: 1,
+ total_size_bytes: size_bytes,
+ total_num_rows: num_rows,
+ }
+ }
+}
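
The `Add`/`Sub` impls above are what make "how much did this operation free?" plain arithmetic; a minimal sketch:

```rust
use re_chunk_store::{ChunkStore, GarbageCollectionOptions};

fn report_gc(store: &mut ChunkStore) {
    let before = store.stats().total();
    let (_events, dropped) = store.gc(&GarbageCollectionOptions::gc_everything());
    let after = store.stats().total();

    // `dropped` was computed as `stats_before - stats_after` inside `gc()`,
    // so the two figures agree.
    assert_eq!(
        dropped.total().total_size_bytes,
        (before - after).total_size_bytes
    );
}
```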
diff --git a/crates/re_chunk_store/src/store.rs b/crates/re_chunk_store/src/store.rs
new file mode 100644
index 000000000000..d248bf29eb15
--- /dev/null
+++ b/crates/re_chunk_store/src/store.rs
@@ -0,0 +1,261 @@
+use std::collections::{BTreeMap, BTreeSet};
+use std::sync::atomic::AtomicU64;
+use std::sync::Arc;
+
+use arrow2::datatypes::DataType as ArrowDataType;
+use nohash_hasher::IntMap;
+
+use re_chunk::{Chunk, ChunkId, RowId};
+use re_log_types::{EntityPath, StoreId, TimeInt, Timeline};
+use re_types_core::ComponentName;
+
+use crate::ChunkStoreChunkStats;
+
+// ---
+
+// TODO(cmc): empty for now but soon will contain compaction settings, so preemptively
+// avoid breaking changes everywhere.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ChunkStoreConfig {}
+
+impl Default for ChunkStoreConfig {
+ #[inline]
+ fn default() -> Self {
+ Self::DEFAULT
+ }
+}
+
+impl ChunkStoreConfig {
+ pub const DEFAULT: Self = Self {};
+}
+
+// ---
+
+pub type ChunkIdSet = BTreeSet<ChunkId>;
+
+#[derive(Default, Debug, Clone)]
+pub struct ChunkIdSetPerTime {
+ /// Keeps track of the longest interval being currently stored in the two maps below.
+ ///
+ /// This is used to bound the backwards linear walk when looking for overlapping chunks in
+ /// latest-at queries.
+ ///
+ /// See [`ChunkStore::latest_at_relevant_chunks`] implementation comments for more details.
+ pub(crate) max_interval_length: u64,
+
+ pub(crate) per_start_time: BTreeMap<TimeInt, ChunkIdSet>,
+ pub(crate) per_end_time: BTreeMap<TimeInt, ChunkIdSet>,
+}
+
+pub type ChunkIdSetPerTimePerComponent = BTreeMap<ComponentName, ChunkIdSetPerTime>;
+
+pub type ChunkIdSetPerTimePerComponentPerTimeline =
+ BTreeMap<Timeline, ChunkIdSetPerTimePerComponent>;
+
+pub type ChunkIdSetPerTimePerComponentPerTimelinePerEntity =
+ BTreeMap<EntityPath, ChunkIdSetPerTimePerComponentPerTimeline>;
+
+pub type ChunkIdPerComponent = BTreeMap<ComponentName, ChunkId>;
+
+pub type ChunkIdPerComponentPerEntity = BTreeMap<EntityPath, ChunkIdPerComponent>;
+
+// ---
+
+/// Incremented on each edit.
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct ChunkStoreGeneration {
+ insert_id: u64,
+ gc_id: u64,
+}
+
+/// A complete chunk store: covers all timelines, all entities, everything.
+///
+/// The chunk store _always_ works at the chunk level, whether it is for write & read queries or
+/// garbage collection. It is completely oblivious to individual rows.
+///
+/// Use the `Display` implementation for a detailed view of the internals.
+#[derive(Debug)]
+pub struct ChunkStore {
+ pub(crate) id: StoreId,
+
+ /// The configuration of the chunk store (e.g. compaction settings).
+ pub(crate) config: ChunkStoreConfig,
+
+ /// Keeps track of the _latest_ datatype information for all component types that have been written
+ /// to the store so far.
+ ///
+ /// See also [`Self::lookup_datatype`].
+ //
+ // TODO(#1809): replace this with a centralized Arrow registry.
+ // TODO(cmc): this would become fairly problematic in a world where each chunk can use a
+ // different datatype for a given component.
+ pub(crate) type_registry: IntMap<ComponentName, ArrowDataType>,
+
+ pub(crate) chunks_per_chunk_id: BTreeMap<ChunkId, Arc<Chunk>>,
+
+ /// All [`ChunkId`]s currently in the store, indexed by the smallest [`RowId`] in each of them.
+ ///
+ /// This is effectively all chunks in global data order. Used for garbage collection.
+ ///
+ /// This is a map of vecs instead of individual [`ChunkId`] in order to better support
+ /// duplicated [`RowId`]s.
+ pub(crate) chunk_ids_per_min_row_id: BTreeMap<RowId, Vec<ChunkId>>,
+
+ /// All temporal [`ChunkId`]s for all entities on all timelines.
+ ///
+ /// See also [`Self::static_chunk_ids_per_entity`].
+ pub(crate) temporal_chunk_ids_per_entity: ChunkIdSetPerTimePerComponentPerTimelinePerEntity,
+
+ /// Accumulated size statistics for all temporal [`Chunk`]s currently present in the store.
+ ///
+ /// This is too costly to be computed from scratch every frame, and is required by e.g. the GC.
+ pub(crate) temporal_chunks_stats: ChunkStoreChunkStats,
+
+ /// Static data. Never garbage collected.
+ ///
+ /// Static data unconditionally shadows temporal data at query time.
+ ///
+ /// Existing temporal data will not be removed. Events won't be fired.
+ pub(crate) static_chunk_ids_per_entity: ChunkIdPerComponentPerEntity,
+
+ /// Accumulated size statistics for all static [`Chunk`]s currently present in the store.
+ ///
+ /// This is too costly to be computed from scratch every frame, and is required by e.g. the GC.
+ pub(crate) static_chunks_stats: ChunkStoreChunkStats,
+
+ // pub(crate) static_tables: BTreeMap,
+ /// Monotonically increasing ID for insertions.
+ pub(crate) insert_id: u64,
+
+ /// Monotonically increasing ID for queries.
+ pub(crate) query_id: AtomicU64,
+
+ /// Monotonically increasing ID for GCs.
+ pub(crate) gc_id: u64,
+
+ /// Monotonically increasing ID for store events.
+ pub(crate) event_id: AtomicU64,
+}
+
+impl Clone for ChunkStore {
+ #[inline]
+ fn clone(&self) -> Self {
+ Self {
+ id: self.id.clone(),
+ config: self.config.clone(),
+ type_registry: self.type_registry.clone(),
+ chunks_per_chunk_id: self.chunks_per_chunk_id.clone(),
+ chunk_ids_per_min_row_id: self.chunk_ids_per_min_row_id.clone(),
+ temporal_chunk_ids_per_entity: self.temporal_chunk_ids_per_entity.clone(),
+ temporal_chunks_stats: self.temporal_chunks_stats,
+ static_chunk_ids_per_entity: self.static_chunk_ids_per_entity.clone(),
+ static_chunks_stats: self.static_chunks_stats,
+ insert_id: Default::default(),
+ query_id: Default::default(),
+ gc_id: Default::default(),
+ event_id: Default::default(),
+ }
+ }
+}
+
+impl std::fmt::Display for ChunkStore {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ let Self {
+ id,
+ config,
+ type_registry: _,
+ chunks_per_chunk_id,
+ chunk_ids_per_min_row_id: chunk_id_per_min_row_id,
+ temporal_chunk_ids_per_entity: _,
+ temporal_chunks_stats,
+ static_chunk_ids_per_entity: _,
+ static_chunks_stats,
+ insert_id: _,
+ query_id: _,
+ gc_id: _,
+ event_id: _,
+ } = self;
+
+ f.write_str("ChunkStore {\n")?;
+
+ f.write_str(&indent::indent_all_by(4, format!("id: {id}\n")))?;
+ f.write_str(&indent::indent_all_by(4, format!("config: {config:?}\n")))?;
+
+ f.write_str(&indent::indent_all_by(4, "stats: {\n"))?;
+ f.write_str(&indent::indent_all_by(
+ 8,
+ format!("{}", *static_chunks_stats + *temporal_chunks_stats),
+ ))?;
+ f.write_str(&indent::indent_all_by(4, "}\n"))?;
+
+ f.write_str(&indent::indent_all_by(4, "chunks: [\n"))?;
+ for chunk_id in chunk_id_per_min_row_id.values().flatten() {
+ if let Some(chunk) = chunks_per_chunk_id.get(chunk_id) {
+ f.write_str(&indent::indent_all_by(8, format!("{chunk}\n")))?;
+ } else {
+ f.write_str(&indent::indent_all_by(8, "\n"))?;
+ }
+ }
+ f.write_str(&indent::indent_all_by(4, "]\n"))?;
+
+ f.write_str("}")?;
+
+ Ok(())
+ }
+}
+
+// ---
+
+impl ChunkStore {
+ #[inline]
+ pub fn new(id: StoreId, config: ChunkStoreConfig) -> Self {
+ Self {
+ id,
+ config,
+ type_registry: Default::default(),
+ chunk_ids_per_min_row_id: Default::default(),
+ chunks_per_chunk_id: Default::default(),
+ temporal_chunk_ids_per_entity: Default::default(),
+ temporal_chunks_stats: Default::default(),
+ static_chunk_ids_per_entity: Default::default(),
+ static_chunks_stats: Default::default(),
+ insert_id: 0,
+ query_id: AtomicU64::new(0),
+ gc_id: 0,
+ event_id: AtomicU64::new(0),
+ }
+ }
+
+ #[inline]
+ pub fn id(&self) -> &StoreId {
+ &self.id
+ }
+
+ /// Return the current [`ChunkStoreGeneration`]. This can be used to determine whether the
+ /// database has been modified since the last time it was queried.
+ #[inline]
+ pub fn generation(&self) -> ChunkStoreGeneration {
+ ChunkStoreGeneration {
+ insert_id: self.insert_id,
+ gc_id: self.gc_id,
+ }
+ }
+
+ /// See [`ChunkStoreConfig`] for more information about configuration.
+ #[inline]
+ pub fn config(&self) -> &ChunkStoreConfig {
+ &self.config
+ }
+
+ /// Iterate over all chunks in the store, in ascending [`ChunkId`] order.
+ #[inline]
+ pub fn iter_chunks(&self) -> impl Iterator<Item = &Arc<Chunk>> + '_ {
+ self.chunks_per_chunk_id.values()
+ }
+
+ /// Lookup the _latest_ arrow [`ArrowDataType`] used by a specific [`re_types_core::Component`].
+ #[inline]
+ pub fn lookup_datatype(&self, component_name: &ComponentName) -> Option<&ArrowDataType> {
+ self.type_registry.get(component_name)
+ }
+}
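
One intended use of `ChunkStoreGeneration` (it derives `PartialEq`, so equality is the whole protocol); a sketch of a caller-side cache check, with the function name purely illustrative:

```rust
use re_chunk_store::{ChunkStore, ChunkStoreGeneration};

/// Returns true if the store was written to (or garbage collected) since `last_seen`,
/// i.e. if any cached query results derived from it are now stale.
fn needs_requery(store: &ChunkStore, last_seen: &mut ChunkStoreGeneration) -> bool {
    let current = store.generation();
    let changed = current != *last_seen;
    *last_seen = current;
    changed
}
```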
diff --git a/crates/re_data_store/src/store_subscriber.rs b/crates/re_chunk_store/src/subscribers.rs
similarity index 56%
rename from crates/re_data_store/src/store_subscriber.rs
rename to crates/re_chunk_store/src/subscribers.rs
index f56264164bfe..1d375bd58912 100644
--- a/crates/re_data_store/src/store_subscriber.rs
+++ b/crates/re_chunk_store/src/subscribers.rs
@@ -1,18 +1,19 @@
use parking_lot::RwLock;
-use crate::{DataStore, StoreEvent};
+use crate::{ChunkStore, ChunkStoreEvent};
// ---
// TODO(cmc): Not sure why I need the extra Box here, RwLock should be `?Sized`.
-type SharedStoreSubscriber = RwLock<Box<dyn StoreSubscriber>>;
+type SharedStoreSubscriber = RwLock<Box<dyn ChunkStoreSubscriber>>;
-/// A [`StoreSubscriber`] subscribes to atomic changes from all [`DataStore`]s through [`StoreEvent`]s.
+/// A [`ChunkStoreSubscriber`] subscribes to atomic changes from all [`ChunkStore`]s
+/// through [`ChunkStoreEvent`]s.
///
-/// [`StoreSubscriber`]s can be used to build both secondary indices and trigger systems.
+/// [`ChunkStoreSubscriber`]s can be used to build both secondary indices and trigger systems.
//
// TODO(#4204): StoreSubscriber should require SizeBytes so they can be part of memstats.
-pub trait StoreSubscriber: std::any::Any + Send + Sync {
+pub trait ChunkStoreSubscriber: std::any::Any + Send + Sync {
/// Arbitrary name for the subscriber.
///
/// Does not need to be unique.
@@ -34,54 +35,54 @@ pub trait StoreSubscriber: std::any::Any + Send + Sync {
/// ```
fn as_any_mut(&mut self) -> &mut dyn std::any::Any;
- /// The core of this trait: get notified of changes happening in all [`DataStore`]s.
+ /// The core of this trait: get notified of changes happening in all [`ChunkStore`]s.
///
- /// This will be called automatically by the [`DataStore`] itself if the subscriber has been
- /// registered: [`DataStore::register_subscriber`].
- /// Or you might want to feed it [`StoreEvent`]s manually, depending on your use case.
+ /// This will be called automatically by the [`ChunkStore`] itself if the subscriber has been
+ /// registered: [`ChunkStore::register_subscriber`].
+ /// Or you might want to feed it [`ChunkStoreEvent`]s manually, depending on your use case.
///
/// ## Example
///
/// ```ignore
- /// fn on_events(&mut self, events: &[StoreEvent]) {
- /// use re_data_store::StoreDiffKind;
+ /// fn on_events(&mut self, events: &[ChunkStoreEvent]) {
+ /// use re_chunk_store::ChunkStoreDiffKind;
/// for event in events {
/// match event.kind {
- /// StoreDiffKind::Addition => println!("Row added: {}", event.row_id),
- /// StoreDiffKind::Deletion => println!("Row removed: {}", event.row_id),
+ /// ChunkStoreDiffKind::Addition => println!("Row added: {}", event.row_id),
+ /// ChunkStoreDiffKind::Deletion => println!("Row removed: {}", event.row_id),
/// }
/// }
/// }
/// ```
- fn on_events(&mut self, events: &[StoreEvent]);
+ fn on_events(&mut self, events: &[ChunkStoreEvent]);
}
-/// All registered [`StoreSubscriber`]s.
+/// All registered [`ChunkStoreSubscriber`]s.
static SUBSCRIBERS: once_cell::sync::Lazy<RwLock<Vec<SharedStoreSubscriber>>> =
once_cell::sync::Lazy::new(|| RwLock::new(Vec::new()));
#[derive(Debug, Clone, Copy)]
-pub struct StoreSubscriberHandle(u32);
+pub struct ChunkStoreSubscriberHandle(u32);
-impl DataStore {
- /// Registers a [`StoreSubscriber`] so it gets automatically notified when data gets added and/or
- /// removed to/from a [`DataStore`].
+impl ChunkStore {
+ /// Registers a [`ChunkStoreSubscriber`] so it gets automatically notified when data gets added and/or
+ /// removed to/from a [`ChunkStore`].
///
- /// Refer to [`StoreEvent`]'s documentation for more information about these events.
+ /// Refer to [`ChunkStoreEvent`]'s documentation for more information about these events.
///
/// ## Scope
///
- /// Registered [`StoreSubscriber`]s are global scope: they get notified of all events from all
- /// existing [`DataStore`]s, including [`DataStore`]s created after the subscriber was registered.
+ /// Registered [`ChunkStoreSubscriber`]s are global scope: they get notified of all events from all
+ /// existing [`ChunkStore`]s, including [`ChunkStore`]s created after the subscriber was registered.
///
- /// Use [`StoreEvent::store_id`] to identify the source of an event.
+ /// Use [`ChunkStoreEvent::store_id`] to identify the source of an event.
///
/// ## Late registration
///
/// Subscribers must be registered before a store gets created to guarantee that no events
/// were missed.
///
- /// [`StoreEvent::event_id`] can be used to identify missing events.
+ /// [`ChunkStoreEvent::event_id`] can be used to identify missing events.
///
/// ## Ordering
///
@@ -92,17 +93,19 @@ impl DataStore {
/// subscriber.
//
// TODO(cmc): send a compacted snapshot to late registerers for bootstrapping
- pub fn register_subscriber(subscriber: Box<dyn StoreSubscriber>) -> StoreSubscriberHandle {
+ pub fn register_subscriber(
+ subscriber: Box<dyn ChunkStoreSubscriber>,
+ ) -> ChunkStoreSubscriberHandle {
let mut subscribers = SUBSCRIBERS.write();
subscribers.push(RwLock::new(subscriber));
- StoreSubscriberHandle(subscribers.len() as u32 - 1)
+ ChunkStoreSubscriberHandle(subscribers.len() as u32 - 1)
}
/// Passes a reference to the downcasted subscriber to the given `FnMut` callback.
///
/// Returns `None` if the subscriber doesn't exist or downcasting failed.
- pub fn with_subscriber<V: StoreSubscriber, T, F: FnMut(&V) -> T>(
- StoreSubscriberHandle(handle): StoreSubscriberHandle,
+ pub fn with_subscriber<V: ChunkStoreSubscriber, T, F: FnMut(&V) -> T>(
+ ChunkStoreSubscriberHandle(handle): ChunkStoreSubscriberHandle,
mut f: F,
) -> Option<T> {
let subscribers = SUBSCRIBERS.read();
@@ -115,8 +118,8 @@ impl DataStore {
/// Passes a reference to the downcasted subscriber to the given `FnOnce` callback.
///
/// Returns `None` if the subscriber doesn't exist or downcasting failed.
- pub fn with_subscriber_once<V: StoreSubscriber, T, F: FnOnce(&V) -> T>(
- StoreSubscriberHandle(handle): StoreSubscriberHandle,
+ pub fn with_subscriber_once<V: ChunkStoreSubscriber, T, F: FnOnce(&V) -> T>(
+ ChunkStoreSubscriberHandle(handle): ChunkStoreSubscriberHandle,
f: F,
) -> Option<T> {
let subscribers = SUBSCRIBERS.read();
@@ -129,8 +132,8 @@ impl DataStore {
/// Passes a mutable reference to the downcasted subscriber to the given callback.
///
/// Returns `None` if the subscriber doesn't exist or downcasting failed.
- pub fn with_subscriber_mut<V: StoreSubscriber, T, F: FnMut(&mut V) -> T>(
- StoreSubscriberHandle(handle): StoreSubscriberHandle,
+ pub fn with_subscriber_mut<V: ChunkStoreSubscriber, T, F: FnMut(&mut V) -> T>(
+ ChunkStoreSubscriberHandle(handle): ChunkStoreSubscriberHandle,
mut f: F,
) -> Option<T> {
let subscribers = SUBSCRIBERS.read();
@@ -140,8 +143,8 @@ impl DataStore {
})
}
- /// Called by [`DataStore`]'s mutating methods to notify subscriber subscribers of upcoming events.
- pub(crate) fn on_events(events: &[StoreEvent]) {
+ /// Called by [`ChunkStore`]'s mutating methods to notify registered subscribers of upcoming events.
+ pub(crate) fn on_events(events: &[ChunkStoreEvent]) {
re_tracing::profile_function!();
let subscribers = SUBSCRIBERS.read();
// TODO(cmc): might want to parallelize at some point.
@@ -153,22 +156,25 @@ impl DataStore {
#[cfg(test)]
mod tests {
+ use std::sync::Arc;
+
use ahash::HashSet;
+ use re_chunk::{Chunk, RowId};
use re_log_types::{
example_components::{MyColor, MyIndex, MyPoint},
- DataRow, RowId, StoreId, TimePoint, Timeline,
+ StoreId, TimePoint, Timeline,
};
- use crate::{DataStore, GarbageCollectionOptions, StoreSubscriber};
+ use crate::{ChunkStore, ChunkStoreSubscriber, GarbageCollectionOptions};
use super::*;
- /// A simple [`StoreSubscriber`] for test purposes that just accumulates [`StoreEvent`]s.
+ /// A simple [`ChunkStoreSubscriber`] for test purposes that just accumulates [`ChunkStoreEvent`]s.
#[derive(Debug)]
struct AllEvents {
store_ids: HashSet,
- events: Vec<StoreEvent>,
+ events: Vec<ChunkStoreEvent>,
}
impl AllEvents {
@@ -180,7 +186,7 @@ mod tests {
}
}
- impl StoreSubscriber for AllEvents {
+ impl ChunkStoreSubscriber for AllEvents {
fn name(&self) -> String {
"rerun.testing.store_subscribers.AllEvents".into()
}
@@ -193,7 +199,7 @@ mod tests {
self
}
- fn on_events(&mut self, events: &[StoreEvent]) {
+ fn on_events(&mut self, events: &[ChunkStoreEvent]) {
self.events.extend(
events
.iter()
@@ -206,76 +212,79 @@ mod tests {
#[test]
fn store_subscriber() -> anyhow::Result<()> {
- let mut store1 = DataStore::new(
+ let mut store1 = ChunkStore::new(
re_log_types::StoreId::random(re_log_types::StoreKind::Recording),
Default::default(),
);
- let mut store2 = DataStore::new(
+ let mut store = ChunkStore::new(
re_log_types::StoreId::random(re_log_types::StoreKind::Recording),
Default::default(),
);
let mut expected_events = Vec::new();
- let view = AllEvents::new([store1.id().clone(), store2.id().clone()]);
- let view_handle = DataStore::register_subscriber(Box::new(view));
+ let view = AllEvents::new([store1.id().clone(), store.id().clone()]);
+ let view_handle = ChunkStore::register_subscriber(Box::new(view));
let timeline_frame = Timeline::new_sequence("frame");
let timeline_other = Timeline::new_temporal("other");
let timeline_yet_another = Timeline::new_sequence("yet_another");
- let row = DataRow::from_component_batches(
- RowId::new(),
- TimePoint::from_iter([
- (timeline_frame, 42), //
- (timeline_other, 666), //
- (timeline_yet_another, 1), //
- ]),
- "entity_a".into(),
- [&MyIndex::from_iter(0..10) as _],
- )?;
+ let chunk = Chunk::builder("entity_a".into())
+ .with_component_batch(
+ RowId::new(),
+ TimePoint::from_iter([
+ (timeline_frame, 42), //
+ (timeline_other, 666), //
+ (timeline_yet_another, 1), //
+ ]),
+ &MyIndex::from_iter(0..10),
+ )
+ .build()?;
- expected_events.extend(store1.insert_row(&row));
+ expected_events.extend(store1.insert_chunk(&Arc::new(chunk))?);
- let row = {
+ let chunk = {
let num_instances = 3;
let points: Vec<_> = (0..num_instances)
.map(|i| MyPoint::new(0.0, i as f32))
.collect();
let colors = vec![MyColor::from(0xFF0000FF)];
- DataRow::from_component_batches(
- RowId::new(),
- TimePoint::from_iter([
- (timeline_frame, 42), //
- (timeline_yet_another, 1), //
- ]),
- "entity_b".into(),
- [&points as _, &colors as _],
- )?
+ Chunk::builder("entity_b".into())
+ .with_component_batches(
+ RowId::new(),
+ TimePoint::from_iter([
+ (timeline_frame, 42), //
+ (timeline_yet_another, 1), //
+ ]),
+ [&points as _, &colors as _],
+ )
+ .build()?
};
- expected_events.extend(store2.insert_row(&row));
+ expected_events.extend(store.insert_chunk(&Arc::new(chunk))?);
- let row = {
+ let chunk = {
let num_instances = 6;
let colors = vec![MyColor::from(0x00DD00FF); num_instances];
- DataRow::from_component_batches(
- RowId::new(),
- TimePoint::default(),
- "entity_b".into(),
- [
- &MyIndex::from_iter(0..num_instances as _) as _,
- &colors as _,
- ],
- )?
+ Chunk::builder("entity_b".into())
+ .with_component_batches(
+ RowId::new(),
+ TimePoint::default(),
+ [
+ &MyIndex::from_iter(0..num_instances as _) as _,
+ &colors as _,
+ ],
+ )
+ .build()?
};
- expected_events.extend(store1.insert_row(&row));
+ expected_events.extend(store1.insert_chunk(&Arc::new(chunk))?);
expected_events.extend(store1.gc(&GarbageCollectionOptions::gc_everything()).0);
- expected_events.extend(store2.gc(&GarbageCollectionOptions::gc_everything()).0);
+ expected_events.extend(store.gc(&GarbageCollectionOptions::gc_everything()).0);
- DataStore::with_subscriber::<AllEvents, _, _>(view_handle, |got| {
+ ChunkStore::with_subscriber::<AllEvents, _, _>(view_handle, |got| {
similar_asserts::assert_eq!(expected_events.len(), got.events.len());
similar_asserts::assert_eq!(expected_events, got.events);
});
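
Beyond the test above, a sketch of what a downstream subscriber might look like. The type and function names are hypothetical, the `as_any`/`as_any_mut` accessors are assumed to match the trait shape shown in this file, and the row math mirrors the `GlobalCounts` test earlier in the patch:

```rust
use re_chunk_store::{ChunkStore, ChunkStoreEvent, ChunkStoreSubscriber};

/// Keeps a running count of live rows across *all* stores.
#[derive(Default)]
struct RowCounter {
    num_rows: i64,
}

impl ChunkStoreSubscriber for RowCounter {
    fn name(&self) -> String {
        "example.RowCounter".into() // illustrative name only
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }

    fn on_events(&mut self, events: &[ChunkStoreEvent]) {
        for event in events {
            // `delta()` is +1 for additions and -1 for deletions, per chunk.
            self.num_rows += event.delta() * event.chunk.num_rows() as i64;
        }
    }
}

fn install() {
    // Subscribers are global: register before creating any store so no events are missed.
    let handle = ChunkStore::register_subscriber(Box::new(RowCounter::default()));

    // ... create stores, log data ...

    ChunkStore::with_subscriber::<RowCounter, _, _>(handle, |counter| {
        println!("live rows: {}", counter.num_rows);
    });
}
```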
diff --git a/crates/re_chunk_store/src/writes.rs b/crates/re_chunk_store/src/writes.rs
new file mode 100644
index 000000000000..4cbc55c47395
--- /dev/null
+++ b/crates/re_chunk_store/src/writes.rs
@@ -0,0 +1,176 @@
+use std::sync::Arc;
+
+use arrow2::array::{Array as _, ListArray as ArrowListArray};
+
+use re_chunk::{Chunk, RowId};
+
+use crate::{
+ ChunkStore, ChunkStoreChunkStats, ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreError,
+ ChunkStoreEvent, ChunkStoreResult,
+};
+
+// Used all over in docstrings.
+#[allow(unused_imports)]
+use crate::ChunkId;
+
+// ---
+
+impl ChunkStore {
+ /// Inserts a [`Chunk`] in the store.
+ ///
+ /// Iff the store was modified, all registered subscribers will be notified and the
+ /// resulting [`ChunkStoreEvent`] will be returned, or `None` otherwise.
+ ///
+ /// * Trying to insert an unsorted chunk ([`Chunk::is_sorted`]) will fail with an error.
+ /// * Inserting a duplicated [`ChunkId`] will result in a no-op.
+ /// * Inserting an empty [`Chunk`] will result in a no-op.
+ pub fn insert_chunk(
+ &mut self,
+ chunk: &Arc,
+ ) -> ChunkStoreResult