From d2d83d7a0c28805fe022b6536aa22bf8661a3192 Mon Sep 17 00:00:00 2001
From: Clement Rey <cr.rey.clement@gmail.com>
Date: Wed, 12 Apr 2023 17:35:13 +0200
Subject: [PATCH] Datastore revamp 7: garbage collection (#1801)

---
 Cargo.lock                                    |   1 +
 crates/re_arrow_store/benches/data_store.rs   |  49 ++-
 crates/re_arrow_store/src/lib.rs              |   5 +-
 crates/re_arrow_store/src/store.rs            |  94 ++++-
 crates/re_arrow_store/src/store_arrow.rs      |  25 +-
 crates/re_arrow_store/src/store_dump.rs       |   4 +-
 crates/re_arrow_store/src/store_format.rs     |  15 +-
 crates/re_arrow_store/src/store_gc.rs         | 281 +++++++++++++-
 crates/re_arrow_store/src/store_polars.rs     |   2 +-
 crates/re_arrow_store/src/store_read.rs       |   4 +-
 crates/re_arrow_store/src/store_sanity.rs     |  55 ++-
 crates/re_arrow_store/src/store_stats.rs      | 345 ++++++++++--------
 crates/re_arrow_store/src/store_write.rs      |  80 ++--
 crates/re_arrow_store/src/test_util.rs        |  30 +-
 crates/re_arrow_store/tests/correctness.rs    | 105 ++----
 crates/re_arrow_store/tests/data_store.rs     | 106 +++---
 crates/re_arrow_store/tests/dump.rs           |  60 +--
 crates/re_arrow_store/tests/internals.rs      |   6 +-
 crates/re_data_store/Cargo.toml               |   3 +-
 crates/re_data_store/src/log_db.rs            |  15 +-
 crates/re_log_types/src/data_cell.rs          |  71 ++--
 crates/re_log_types/src/data_row.rs           |  11 +-
 crates/re_log_types/src/data_table.rs         |  49 ++-
 crates/re_log_types/src/lib.rs                |   2 +
 .../re_log_types/src/path/component_name.rs   |  10 +
 crates/re_log_types/src/path/entity_path.rs   |   8 +
 crates/re_log_types/src/size_bytes.rs         | 173 +++++++++
 crates/re_log_types/src/time_point/mod.rs     |  20 +-
 .../re_log_types/src/time_point/timeline.rs   |   9 +-
 crates/re_log_types/src/time_range.rs         |   9 +-
 crates/re_query/src/query.rs                  |   4 +-
 crates/re_viewer/src/ui/memory_panel.rs       |  62 ++--
 crates/re_viewer/src/ui/time_panel/mod.rs     |   9 +-
 scripts/lint.py                               |   3 +-
 34 files changed, 1226 insertions(+), 499 deletions(-)
 create mode 100644 crates/re_log_types/src/size_bytes.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8ca7d48e4b0f..587df51f948f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3849,6 +3849,7 @@ dependencies = [
  "puffin",
  "rand",
  "re_arrow_store",
+ "re_format",
  "re_int_histogram",
  "re_log",
  "re_log_encoding",
diff --git a/crates/re_arrow_store/benches/data_store.rs b/crates/re_arrow_store/benches/data_store.rs
index 385b4f83f904..6e9aeda7a922 100644
--- a/crates/re_arrow_store/benches/data_store.rs
+++ b/crates/re_arrow_store/benches/data_store.rs
@@ -4,7 +4,10 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 use arrow2::array::UnionArray;
 use criterion::{criterion_group, criterion_main, Criterion};
 
-use re_arrow_store::{DataStore, DataStoreConfig, LatestAtQuery, RangeQuery, TimeInt, TimeRange};
+use re_arrow_store::{
+    DataStore, DataStoreConfig, GarbageCollectionTarget, LatestAtQuery, RangeQuery, TimeInt,
+    TimeRange,
+};
 use re_log_types::{
     component_types::{InstanceKey, Rect2D},
     datagen::{build_frame_nr, build_some_instances, build_some_rects},
@@ -12,7 +15,7 @@ use re_log_types::{
     TimeType, Timeline,
 };
 
-criterion_group!(benches, insert, latest_at, latest_at_missing, range);
+criterion_group!(benches, insert, latest_at, latest_at_missing, range, gc);
 criterion_main!(benches);
 
 // ---
@@ -258,6 +261,48 @@ fn range(c: &mut Criterion) {
     }
 }
 
+fn gc(c: &mut Criterion) {
+    let mut group = c.benchmark_group(format!(
+        "datastore/num_rows={NUM_ROWS}/num_instances={NUM_INSTANCES}/gc"
+    ));
+    group.throughput(criterion::Throughput::Elements(
+        (NUM_INSTANCES * NUM_ROWS) as _,
+    ));
+
+    let mut table = build_table(NUM_INSTANCES as usize, false);
+    table.compute_all_size_bytes();
+
+    // Default config
+    group.bench_function("default", |b| {
+        let store = insert_table(Default::default(), InstanceKey::name(), &table);
+        b.iter(|| {
+            let mut store = store.clone();
+            let (_, stats_diff) = store.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0));
+            stats_diff
+        });
+    });
+
+    // Emulate more or less buckets
+    for &num_rows_per_bucket in num_rows_per_bucket() {
+        group.bench_function(format!("bucketsz={num_rows_per_bucket}"), |b| {
+            let store = insert_table(
+                DataStoreConfig {
+                    indexed_bucket_num_rows: num_rows_per_bucket,
+                    ..Default::default()
+                },
+                InstanceKey::name(),
+                &table,
+            );
+            b.iter(|| {
+                let mut store = store.clone();
+                let (_, stats_diff) =
+                    store.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0));
+                stats_diff
+            });
+        });
+    }
+}
+
 // --- Helpers ---
 
 fn build_table(n: usize, packed: bool) -> DataTable {
diff --git a/crates/re_arrow_store/src/lib.rs b/crates/re_arrow_store/src/lib.rs
index 4593868e9ec6..0ac2be79a28e 100644
--- a/crates/re_arrow_store/src/lib.rs
+++ b/crates/re_arrow_store/src/lib.rs
@@ -38,11 +38,12 @@ pub use self::arrow_util::ArrayExt;
 pub use self::store::{DataStore, DataStoreConfig};
 pub use self::store_gc::GarbageCollectionTarget;
 pub use self::store_read::{LatestAtQuery, RangeQuery};
-pub use self::store_stats::DataStoreStats;
+pub use self::store_stats::{DataStoreRowStats, DataStoreStats};
 pub use self::store_write::{WriteError, WriteResult};
 
 pub(crate) use self::store::{
-    IndexedBucket, IndexedBucketInner, IndexedTable, PersistentIndexedTable,
+    ClusterCellCache, DataTypeRegistry, IndexedBucket, IndexedBucketInner, IndexedTable,
+    MetadataRegistry, PersistentIndexedTable,
 };
 
 // Re-exports
diff --git a/crates/re_arrow_store/src/store.rs b/crates/re_arrow_store/src/store.rs
index a2dba209616f..15056b70f490 100644
--- a/crates/re_arrow_store/src/store.rs
+++ b/crates/re_arrow_store/src/store.rs
@@ -9,7 +9,7 @@ use nohash_hasher::{IntMap, IntSet};
 use parking_lot::RwLock;
 use re_log_types::{
     ComponentName, DataCell, DataCellColumn, EntityPath, EntityPathHash, ErasedTimeVec,
-    NumInstancesVec, RowId, RowIdVec, TimeInt, TimePoint, TimeRange, Timeline,
+    NumInstancesVec, RowId, RowIdVec, SizeBytes, TimeInt, TimePoint, TimeRange, Timeline,
 };
 
 // --- Data store ---
@@ -23,8 +23,13 @@ pub struct DataStoreConfig {
     /// to a specific timeline _and_ a specific entity.
     ///
     /// This effectively puts an upper bound on the number of rows that need to be sorted when an
-    /// indexed bucket gets out of order.
+    /// indexed bucket gets out of order (e.g. because of new insertions or a GC pass).
     /// This is a tradeoff: less rows means faster sorts at the cost of more metadata overhead.
+    /// In particular:
+    /// - Query performance scales inversely logarithmically to this number (i.e. it gets better
+    ///   the higher this number gets).
+    /// - GC performance scales quadratically with this number (i.e. it gets better the lower this
+    ///   number gets).
     ///
     /// See [`Self::DEFAULT`] for defaults.
     pub indexed_bucket_num_rows: u64,
@@ -53,7 +58,12 @@ impl Default for DataStoreConfig {
 
 impl DataStoreConfig {
     pub const DEFAULT: Self = Self {
-        indexed_bucket_num_rows: 1024,
+        // NOTE: Empirical testing has shown that 512 is a good balance between sorting
+        // and binary search costs with the current GC implementation.
+        //
+        // Garbage collection costs are entirely driven by the number of buckets around, the size
+        // of the data itself has no impact.
+        indexed_bucket_num_rows: 512,
         store_insert_ids: cfg!(debug_assertions),
         enable_typecheck: cfg!(debug_assertions),
     };
@@ -67,8 +77,8 @@ pub type InsertIdVec = SmallVec<[u64; 4]>;
 /// so far.
 ///
 /// See also [`DataStore::lookup_datatype`].
-#[derive(Default)]
-pub struct DataTypeRegistry(IntMap<ComponentName, DataType>);
+#[derive(Debug, Default, Clone)]
+pub struct DataTypeRegistry(pub IntMap<ComponentName, DataType>);
 
 impl std::ops::Deref for DataTypeRegistry {
     type Target = IntMap<ComponentName, DataType>;
@@ -87,11 +97,11 @@ impl std::ops::DerefMut for DataTypeRegistry {
 }
 
 /// Keeps track of arbitrary per-row metadata.
-#[derive(Default)]
-pub struct MetadataRegistry<T: Clone>(HashMap<RowId, T>);
+#[derive(Debug, Default, Clone)]
+pub struct MetadataRegistry<T: Clone>(pub BTreeMap<RowId, T>);
 
 impl<T: Clone> std::ops::Deref for MetadataRegistry<T> {
-    type Target = HashMap<RowId, T>;
+    type Target = BTreeMap<RowId, T>;
 
     #[inline]
     fn deref(&self) -> &Self::Target {
@@ -106,6 +116,29 @@ impl<T: Clone> std::ops::DerefMut for MetadataRegistry<T> {
     }
 }
 
+/// Used to cache auto-generated cluster cells (`[0]`, `[0, 1]`, `[0, 1, 2]`, ...) so that they
+/// can be properly deduplicated on insertion.
+#[derive(Debug, Default, Clone)]
+pub struct ClusterCellCache(pub IntMap<u32, DataCell>);
+
+impl std::ops::Deref for ClusterCellCache {
+    type Target = IntMap<u32, DataCell>;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl std::ops::DerefMut for ClusterCellCache {
+    #[inline]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+// ---
+
 /// A complete data store: covers all timelines, all entities, everything.
 ///
 /// ## Debugging
@@ -148,7 +181,7 @@ pub struct DataStore {
 
     /// Used to cache auto-generated cluster cells (`[0]`, `[0, 1]`, `[0, 1, 2]`, ...)
     /// so that they can be properly deduplicated on insertion.
-    pub(crate) cluster_cell_cache: IntMap<u32, DataCell>,
+    pub(crate) cluster_cell_cache: ClusterCellCache,
 
     /// All temporal [`IndexedTable`]s for all entities on all timelines.
     ///
@@ -167,10 +200,29 @@ pub struct DataStore {
     pub(crate) query_id: AtomicU64,
 
     /// Monotonically increasing ID for GCs.
-    #[allow(dead_code)]
     pub(crate) gc_id: u64,
 }
 
+impl Clone for DataStore {
+    fn clone(&self) -> Self {
+        Self {
+            cluster_key: self.cluster_key,
+            config: self.config.clone(),
+            type_registry: self.type_registry.clone(),
+            metadata_registry: self.metadata_registry.clone(),
+            cluster_cell_cache: self.cluster_cell_cache.clone(),
+            tables: self.tables.clone(),
+            timeless_tables: self.timeless_tables.clone(),
+            insert_id: self.insert_id,
+            query_id: self
+                .query_id
+                .load(std::sync::atomic::Ordering::Relaxed)
+                .into(),
+            gc_id: self.gc_id,
+        }
+    }
+}
+
 impl DataStore {
     /// See [`Self::cluster_key`] for more information about the cluster key.
     pub fn new(cluster_key: ComponentName, config: DataStoreConfig) -> Self {
@@ -293,7 +345,7 @@ fn datastore_internal_repr() {
 /// ```
 //
 // TODO(#1524): inline visualization once it's back to a manageable state
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct IndexedTable {
     /// The timeline this table operates in, for debugging purposes.
     pub timeline: Timeline,
@@ -336,7 +388,7 @@ pub struct IndexedTable {
 impl IndexedTable {
     pub fn new(cluster_key: ComponentName, timeline: Timeline, ent_path: EntityPath) -> Self {
         let bucket = IndexedBucket::new(cluster_key, timeline);
-        let buckets_size_bytes = bucket.size_bytes();
+        let buckets_size_bytes = bucket.total_size_bytes();
         Self {
             timeline,
             ent_path,
@@ -364,6 +416,16 @@ pub struct IndexedBucket {
     pub inner: RwLock<IndexedBucketInner>,
 }
 
+impl Clone for IndexedBucket {
+    fn clone(&self) -> Self {
+        Self {
+            timeline: self.timeline,
+            cluster_key: self.cluster_key,
+            inner: RwLock::new(self.inner.read().clone()),
+        }
+    }
+}
+
 impl IndexedBucket {
     fn new(cluster_key: ComponentName, timeline: Timeline) -> Self {
         Self {
@@ -375,7 +437,7 @@ impl IndexedBucket {
 }
 
 /// See [`IndexedBucket`]; this is a helper struct to simplify interior mutability.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct IndexedBucketInner {
     /// Are the rows in this table chunk sorted?
     ///
@@ -412,7 +474,8 @@ pub struct IndexedBucketInner {
     /// (i.e. the table is sparse).
     pub columns: IntMap<ComponentName, DataCellColumn>,
 
-    /// The size of both the control & component data stored in this bucket, in bytes.
+    /// The size of both the control & component data stored in this bucket, heap and stack
+    /// included, in bytes.
     ///
     /// This is a best-effort approximation, adequate for most purposes (stats,
     /// triggering GCs, ...).
@@ -449,7 +512,8 @@ impl Default for IndexedBucketInner {
 /// ```
 //
 // TODO(#1524): inline visualization once it's back to a manageable state
-#[derive(Debug)]
+// TODO(#1807): timeless should be row-id ordered too then
+#[derive(Debug, Clone)]
 pub struct PersistentIndexedTable {
     /// The entity this table is related to, for debugging purposes.
     pub ent_path: EntityPath,
diff --git a/crates/re_arrow_store/src/store_arrow.rs b/crates/re_arrow_store/src/store_arrow.rs
index b7db9157855e..d62d8c3b8f3b 100644
--- a/crates/re_arrow_store/src/store_arrow.rs
+++ b/crates/re_arrow_store/src/store_arrow.rs
@@ -100,6 +100,11 @@ fn serialize(
     let mut schema = Schema::default();
     let mut columns = Vec::new();
 
+    // NOTE: Empty table / bucket.
+    if col_row_id.is_empty() {
+        return Ok((schema, Chunk::new(columns)));
+    }
+
     {
         let (control_schema, control_columns) =
             serialize_control_columns(col_time, col_insert_id, col_row_id, col_num_instances)?;
@@ -135,10 +140,13 @@ fn serialize_control_columns(
     // - time
     // - num_instances
 
-    let (insert_id_field, insert_id_column) =
-        DataTable::serialize_primitive_column(COLUMN_INSERT_ID, col_insert_id, None)?;
-    schema.fields.push(insert_id_field);
-    columns.push(insert_id_column);
+    // NOTE: Optional column, so make sure it's actually there:
+    if !col_insert_id.is_empty() {
+        let (insert_id_field, insert_id_column) =
+            DataTable::serialize_primitive_column(COLUMN_INSERT_ID, col_insert_id, None)?;
+        schema.fields.push(insert_id_field);
+        columns.push(insert_id_column);
+    }
 
     let (row_id_field, row_id_column) =
         DataTable::serialize_control_column(COLUMN_ROW_ID, col_row_id)?;
@@ -187,9 +195,12 @@ fn serialize_data_columns(
     }
 
     for (component, column) in table {
-        let (field, column) = DataTable::serialize_data_column(component.as_str(), column)?;
-        schema.fields.push(field);
-        columns.push(column);
+        // NOTE: Don't serialize columns with only null values.
+        if column.iter().any(Option::is_some) {
+            let (field, column) = DataTable::serialize_data_column(component.as_str(), column)?;
+            schema.fields.push(field);
+            columns.push(column);
+        }
     }
 
     Ok((schema, columns))
diff --git a/crates/re_arrow_store/src/store_dump.rs b/crates/re_arrow_store/src/store_dump.rs
index d24f3a317454..31845f75467d 100644
--- a/crates/re_arrow_store/src/store_dump.rs
+++ b/crates/re_arrow_store/src/store_dump.rs
@@ -48,7 +48,7 @@ impl DataStore {
                 col_row_id: col_row_id.clone(),
                 col_timelines: Default::default(),
                 col_entity_path: std::iter::repeat_with(|| ent_path.clone())
-                    .take(table.total_rows() as _)
+                    .take(table.num_rows() as _)
                     .collect(),
                 col_num_instances: col_num_instances.clone(),
                 columns: columns.clone(), // shallow
@@ -89,7 +89,7 @@ impl DataStore {
                     col_timelines: [(*timeline, col_time.iter().copied().map(Some).collect())]
                         .into(),
                     col_entity_path: std::iter::repeat_with(|| table.ent_path.clone())
-                        .take(table.total_rows() as _)
+                        .take(table.num_rows() as _)
                         .collect(),
                     col_num_instances: col_num_instances.clone(),
                     columns: columns.clone(), // shallow
diff --git a/crates/re_arrow_store/src/store_format.rs b/crates/re_arrow_store/src/store_format.rs
index 975f2f81dd23..d21110f87274 100644
--- a/crates/re_arrow_store/src/store_format.rs
+++ b/crates/re_arrow_store/src/store_format.rs
@@ -1,4 +1,5 @@
 use re_format::{format_bytes, format_number};
+use re_log_types::SizeBytes as _;
 
 use crate::{DataStore, IndexedBucket, IndexedTable, PersistentIndexedTable};
 
@@ -34,8 +35,8 @@ impl std::fmt::Display for DataStore {
                 format!(
                     "{} timeless indexed tables, for a total of {} across {} total rows\n",
                     timeless_tables.len(),
-                    format_bytes(self.total_timeless_size_bytes() as _),
-                    format_number(self.total_timeless_rows() as _)
+                    format_bytes(self.timeless_size_bytes() as _),
+                    format_number(self.num_timeless_rows() as _)
                 ),
             ))?;
             f.write_str(&indent::indent_all_by(4, "timeless_tables: [\n"))?;
@@ -53,8 +54,8 @@ impl std::fmt::Display for DataStore {
                 format!(
                     "{} indexed tables, for a total of {} across {} total rows\n",
                     tables.len(),
-                    format_bytes(self.total_temporal_size_bytes() as _),
-                    format_number(self.total_temporal_rows() as _)
+                    format_bytes(self.temporal_size_bytes() as _),
+                    format_number(self.num_temporal_rows() as _)
                 ),
             ))?;
             f.write_str(&indent::indent_all_by(4, "tables: [\n"))?;
@@ -94,7 +95,7 @@ impl std::fmt::Display for IndexedTable {
             "size: {} buckets for a total of {} across {} total rows\n",
             self.buckets.len(),
             format_bytes(self.total_size_bytes() as _),
-            format_number(self.total_rows() as _),
+            format_number(self.num_rows() as _),
         ))?;
         f.write_str("buckets: [\n")?;
         for (time, bucket) in buckets.iter() {
@@ -116,7 +117,7 @@ impl std::fmt::Display for IndexedBucket {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.write_fmt(format_args!(
             "size: {} across {} rows\n",
-            format_bytes(self.size_bytes() as _),
+            format_bytes(self.total_size_bytes() as _),
             format_number(self.num_rows() as _),
         ))?;
 
@@ -163,7 +164,7 @@ impl std::fmt::Display for PersistentIndexedTable {
         f.write_fmt(format_args!(
             "size: {} across {} rows\n",
             format_bytes(self.total_size_bytes() as _),
-            format_number(self.total_rows() as _),
+            format_number(self.num_rows() as _),
         ))?;
 
         let (schema, columns) = self.serialize().map_err(|err| {
diff --git a/crates/re_arrow_store/src/store_gc.rs b/crates/re_arrow_store/src/store_gc.rs
index 777ac333160b..a1cd819830fd 100644
--- a/crates/re_arrow_store/src/store_gc.rs
+++ b/crates/re_arrow_store/src/store_gc.rs
@@ -1,20 +1,285 @@
+use re_log_types::{RowId, SizeBytes as _, TimeInt, TimeRange};
+
+use crate::{
+    store::{IndexedBucketInner, IndexedTable},
+    DataStore, DataStoreStats,
+};
+
+// ---
+
 #[derive(Debug, Clone, Copy)]
 pub enum GarbageCollectionTarget {
-    /// Try to drop _at least_ the given percentage.
+    /// Try to drop _at least_ the given fraction.
     ///
-    /// The percentage must be a float in the range [0.0 : 1.0].
-    DropAtLeastPercentage(f64),
+    /// The fraction must be a float in the range [0.0 : 1.0].
+    DropAtLeastFraction(f64),
 }
 
 impl std::fmt::Display for GarbageCollectionTarget {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            GarbageCollectionTarget::DropAtLeastPercentage(p) => f.write_fmt(format_args!(
-                "DropAtLeast({}%)",
-                re_format::format_f64(*p * 100.0)
-            )),
+            // NOTE: Format the raw `f64`: on a pre-formatted `String`, `{:.3}` would truncate the
+            // text to 3 characters (e.g. `33.`) rather than round to 3 decimals.
+            Self::DropAtLeastFraction(p) => write!(f, "DropAtLeast({:.3}%)", *p * 100.0),
         }
     }
 }
 
-// TODO(#1619): Implement garbage collection.
+impl DataStore {
+    /// Triggers a garbage collection according to the desired `target`.
+    ///
+    /// Garbage collection's performance is bounded by the number of buckets in each table (for
+    /// each `RowId`, we have to find the corresponding bucket, which is roughly `O(log(n))`) as
+    /// well as the number of rows in each of those buckets (for each `RowId`, we have to sort the
+    /// corresponding bucket (roughly `O(n*log(n))`) and then find the corresponding row (roughly
+    /// `O(log(n))`).
+    /// The size of the data itself has no impact on performance.
+    ///
+    /// Returns the list of `RowId`s that were purged from the store.
+    ///
+    /// ## Semantics
+    ///
+    /// Garbage collection works on a row-level basis and is driven by [`RowId`] order,
+    /// i.e. the order defined by the clients' wall-clocks, allowing it to drop data across
+    /// the different timelines
+    /// in a fair, deterministic manner.
+    /// Similarly, out-of-order data is supported out of the box.
+    ///
+    /// The garbage collector doesn't deallocate data in and of itself: all it does is drop the
+    /// store's internal references to that data (the `DataCell`s), which will be deallocated once
+    /// their reference count reaches 0.
+    ///
+    /// ## Limitations
+    ///
+    /// The garbage collector is currently unaware of our latest-at semantics, i.e. it will drop
+    /// old data even if doing so would impact the results of recent queries.
+    /// See <https://github.com/rerun-io/rerun/issues/1803>.
+    //
+    // TODO(#1804): There shouldn't be any need to return the purged `RowId`s, all secondary
+    // datastructures should be able to purge themselves based solely off of
+    // [`DataStore::oldest_time_per_timeline`].
+    //
+    // TODO(#1803): The GC should be aware of latest-at semantics and make sure they are upheld
+    // when purging data.
+    //
+    // TODO(#1823): Workload specific optimizations.
+    pub fn gc(&mut self, target: GarbageCollectionTarget) -> (Vec<RowId>, DataStoreStats) {
+        crate::profile_function!();
+
+        self.gc_id += 1;
+
+        // NOTE: only temporal data and row metadata get purged!
+        let stats_before = DataStoreStats::from_store(self);
+        let initial_num_rows =
+            stats_before.temporal.num_rows + stats_before.metadata_registry.num_rows;
+        let initial_num_bytes =
+            (stats_before.temporal.num_bytes + stats_before.metadata_registry.num_bytes) as f64;
+
+        let row_ids = match target {
+            GarbageCollectionTarget::DropAtLeastFraction(p) => {
+                assert!((0.0..=1.0).contains(&p));
+
+                let num_bytes_to_drop = initial_num_bytes * p;
+                let target_num_bytes = initial_num_bytes - num_bytes_to_drop;
+
+                re_log::debug!(
+                    kind = "gc",
+                    id = self.gc_id,
+                    %target,
+                    initial_num_rows = re_format::format_large_number(initial_num_rows as _),
+                    initial_num_bytes = re_format::format_bytes(initial_num_bytes),
+                    target_num_bytes = re_format::format_bytes(target_num_bytes),
+                    drop_at_least_num_bytes = re_format::format_bytes(num_bytes_to_drop),
+                    "starting GC"
+                );
+
+                self.gc_drop_at_least_num_bytes(num_bytes_to_drop)
+            }
+        };
+
+        #[cfg(debug_assertions)]
+        self.sanity_check().unwrap();
+
+        // NOTE: only temporal data and row metadata get purged!
+        let stats_after = DataStoreStats::from_store(self);
+        let new_num_rows = stats_after.temporal.num_rows + stats_after.metadata_registry.num_rows;
+        let new_num_bytes =
+            (stats_after.temporal.num_bytes + stats_after.metadata_registry.num_bytes) as f64;
+
+        re_log::debug!(
+            kind = "gc",
+            id = self.gc_id,
+            %target,
+            initial_num_rows = re_format::format_large_number(initial_num_rows as _),
+            initial_num_bytes = re_format::format_bytes(initial_num_bytes),
+            new_num_rows = re_format::format_large_number(new_num_rows as _),
+            new_num_bytes = re_format::format_bytes(new_num_bytes),
+            "GC done"
+        );
+
+        let stats_diff = stats_before - stats_after;
+
+        (row_ids, stats_diff)
+    }
+
+    /// Tries to drop _at least_ `num_bytes_to_drop` bytes of data from the store.
+    ///
+    /// Returns the list of `RowId`s that were purged from the store.
+    fn gc_drop_at_least_num_bytes(&mut self, mut num_bytes_to_drop: f64) -> Vec<RowId> {
+        crate::profile_function!();
+
+        let mut row_ids = Vec::new();
+
+        // The algorithm is straightforward:
+        // 1. Pop the oldest `RowId` available
+        // 2. Find all tables that potentially hold data associated with that `RowId`
+        // 3. Drop the associated row and account for the space we got back
+        while num_bytes_to_drop > 0.0 {
+            // pop next row id
+            let Some((row_id, timepoint)) = self.metadata_registry.pop_first() else {
+                break;
+            };
+            num_bytes_to_drop -= row_id.total_size_bytes() as f64;
+            num_bytes_to_drop -= timepoint.total_size_bytes() as f64;
+            row_ids.push(row_id);
+
+            // find all tables that could possibly contain this `RowId`
+            let tables = self.tables.iter_mut().filter_map(|((timeline, _), table)| {
+                timepoint.get(timeline).map(|time| (*time, table))
+            });
+
+            for (time, table) in tables {
+                num_bytes_to_drop -= table.try_drop_row(row_id, time.as_i64()) as f64;
+            }
+        }
+
+        row_ids
+    }
+}
+
+impl IndexedTable {
+    /// Tries to drop the given `row_id` from the table, which is expected to be found at the
+    /// specified `time`.
+    ///
+    /// Returns how many bytes were actually dropped, or zero if the row wasn't found.
+    fn try_drop_row(&mut self, row_id: RowId, time: i64) -> u64 {
+        crate::profile_function!();
+
+        let table_has_more_than_one_bucket = self.buckets.len() > 1;
+
+        let (bucket_key, bucket) = self.find_bucket_mut(time.into());
+        let bucket_num_bytes = bucket.total_size_bytes();
+
+        let mut dropped_num_bytes = {
+            let inner = &mut *bucket.inner.write();
+            inner.try_drop_row(row_id, time)
+        };
+
+        // NOTE: We always need to keep at least one bucket alive, otherwise we have
+        // nowhere to write to.
+        if table_has_more_than_one_bucket && bucket.num_rows() == 0 {
+            // NOTE: We're dropping the bucket itself in this case, rather than just its
+            // contents.
+            debug_assert!(
+                dropped_num_bytes <= bucket_num_bytes,
+                "Bucket contained more bytes than it thought"
+            );
+            dropped_num_bytes = bucket_num_bytes;
+            self.buckets.remove(&bucket_key);
+
+            // NOTE: If this is the first bucket of the table that we've just removed, we need the
+            // next one to become responsible for `-∞`.
+            if bucket_key == TimeInt::MIN {
+                if let Some((_, bucket)) = self.buckets.pop_first() {
+                    self.buckets.insert(TimeInt::MIN, bucket);
+                }
+            }
+        }
+
+        self.buckets_size_bytes -= dropped_num_bytes;
+        self.buckets_num_rows -= (dropped_num_bytes > 0) as u64;
+
+        dropped_num_bytes
+    }
+}
+
+impl IndexedBucketInner {
+    /// Tries to drop the given `row_id` from the table, which is expected to be found at the
+    /// specified `time`.
+    ///
+    /// Returns how many bytes were actually dropped, or zero if the row wasn't found.
+    fn try_drop_row(&mut self, row_id: RowId, time: i64) -> u64 {
+        crate::profile_function!();
+
+        self.sort();
+
+        let IndexedBucketInner {
+            is_sorted,
+            time_range,
+            col_time,
+            col_insert_id,
+            col_row_id,
+            col_num_instances,
+            columns,
+            size_bytes,
+        } = self;
+
+        let mut dropped_num_bytes = 0u64;
+
+        let mut row_index = col_time.partition_point(|&time2| time2 < time);
+        while col_time.get(row_index) == Some(&time) {
+            if col_row_id[row_index] != row_id {
+                row_index += 1;
+                continue;
+            }
+
+            // Update the time_range min/max:
+            if col_time.len() == 1 {
+                // We removed the last row
+                *time_range = TimeRange::EMPTY;
+            } else {
+                *is_sorted = false;
+
+                // We have at least two rows, so we can safely [index] here:
+                if row_index == 0 {
+                    // We removed the first row, so the second row holds the new min
+                    time_range.min = col_time[1].into();
+                }
+                if row_index + 1 == col_time.len() {
+                    // We removed the last row, so the penultimate row holds the new max
+                    time_range.max = col_time[row_index - 1].into();
+                }
+            }
+
+            // col_row_id
+            let removed_row_id = col_row_id.swap_remove(row_index);
+            debug_assert_eq!(row_id, removed_row_id);
+            dropped_num_bytes += removed_row_id.total_size_bytes();
+
+            // col_time
+            let row_time = col_time.swap_remove(row_index);
+            dropped_num_bytes += row_time.total_size_bytes();
+
+            // col_insert_id (if present)
+            if !col_insert_id.is_empty() {
+                dropped_num_bytes += col_insert_id.swap_remove(row_index).total_size_bytes();
+            }
+
+            // col_num_instances
+            dropped_num_bytes += col_num_instances.swap_remove(row_index).total_size_bytes();
+
+            // each data column
+            for column in columns.values_mut() {
+                dropped_num_bytes += column.0.swap_remove(row_index).total_size_bytes();
+            }
+
+            // NOTE: A single `RowId` cannot possibly have more than one datapoint for
+            // a single timeline.
+            break;
+        }
+
+        *size_bytes -= dropped_num_bytes;
+
+        dropped_num_bytes
+    }
+}
diff --git a/crates/re_arrow_store/src/store_polars.rs b/crates/re_arrow_store/src/store_polars.rs
index 6ef71c714377..00a233c70d6f 100644
--- a/crates/re_arrow_store/src/store_polars.rs
+++ b/crates/re_arrow_store/src/store_polars.rs
@@ -178,7 +178,7 @@ impl PersistentIndexedTable {
             columns,
         } = self;
 
-        let num_rows = self.total_rows() as usize;
+        let num_rows = self.num_rows() as usize;
 
         let insert_ids = config
             .store_insert_ids
diff --git a/crates/re_arrow_store/src/store_read.rs b/crates/re_arrow_store/src/store_read.rs
index 74f75c586363..02c753ce4be3 100644
--- a/crates/re_arrow_store/src/store_read.rs
+++ b/crates/re_arrow_store/src/store_read.rs
@@ -1001,7 +1001,7 @@ impl PersistentIndexedTable {
         );
 
         // find the primary row number's row.
-        let primary_row_nr = self.total_rows() - 1;
+        let primary_row_nr = self.num_rows() - 1;
 
         trace!(
             kind = "latest_at",
@@ -1085,7 +1085,7 @@ impl PersistentIndexedTable {
         // for building the returned iterator.
         crate::profile_function!();
 
-        let cells = (0..self.total_rows()).filter_map(move |row_nr| {
+        let cells = (0..self.num_rows()).filter_map(move |row_nr| {
             let mut cells = [(); N].map(|_| None);
             for (i, component) in components.iter().enumerate() {
                 if let Some(column) = self.columns.get(component) {
diff --git a/crates/re_arrow_store/src/store_sanity.rs b/crates/re_arrow_store/src/store_sanity.rs
index 0bf32c720cab..eba65eb70011 100644
--- a/crates/re_arrow_store/src/store_sanity.rs
+++ b/crates/re_arrow_store/src/store_sanity.rs
@@ -1,5 +1,6 @@
 use re_log_types::{
-    ComponentName, DataCellColumn, COLUMN_NUM_INSTANCES, COLUMN_ROW_ID, COLUMN_TIMEPOINT,
+    ComponentName, DataCellColumn, SizeBytes as _, TimeRange, COLUMN_NUM_INSTANCES, COLUMN_ROW_ID,
+    COLUMN_TIMEPOINT,
 };
 
 use crate::{DataStore, IndexedBucket, IndexedBucketInner, IndexedTable, PersistentIndexedTable};
@@ -11,6 +12,11 @@ use crate::{DataStore, IndexedBucket, IndexedBucketInner, IndexedTable, Persiste
 /// These violations can only stem from a bug in the store's implementation itself.
 #[derive(thiserror::Error, Debug)]
 pub enum SanityError {
+    #[error(
+        "Reported time range for indexed bucket is out of sync: got {got:?}, expected {expected:?}"
+    )]
+    TimeRangeOutOfSync { expected: TimeRange, got: TimeRange },
+
     #[error("Reported size for {origin} is out of sync: got {got}, expected {expected}")]
     SizeOutOfSync {
         origin: &'static str,
@@ -101,21 +107,26 @@ impl IndexedTable {
 
         // Make sure row numbers aren't out of sync
         {
-            let total_rows = self.total_rows();
-            let total_rows_uncached = self.total_rows_uncached();
-            if total_rows != total_rows_uncached {
+            let num_rows = self.num_rows();
+            let num_rows_uncached = self.num_rows_uncached();
+            if num_rows != num_rows_uncached {
                 return Err(SanityError::RowsOutOfSync {
                     origin: std::any::type_name::<Self>(),
-                    expected: re_format::format_number(total_rows_uncached as _),
-                    got: re_format::format_number(total_rows as _),
+                    expected: re_format::format_number(num_rows_uncached as _),
+                    got: re_format::format_number(num_rows as _),
                 });
             }
         }
 
+        // Run individual bucket sanity check suites too.
+        for bucket in self.buckets.values() {
+            bucket.sanity_check()?;
+        }
+
         // Make sure size values aren't out of sync
         {
             let total_size_bytes = self.total_size_bytes();
-            let total_size_bytes_uncached = self.total_size_bytes_uncached();
+            let total_size_bytes_uncached = self.size_bytes_uncached();
             if total_size_bytes != total_size_bytes_uncached {
                 return Err(SanityError::SizeOutOfSync {
                     origin: std::any::type_name::<Self>(),
@@ -125,11 +136,6 @@ impl IndexedTable {
             }
         }
 
-        // Run individual bucket sanity check suites too.
-        for bucket in self.buckets.values() {
-            bucket.sanity_check()?;
-        }
-
         Ok(())
     }
 }
@@ -150,7 +156,7 @@ impl IndexedBucket {
         {
             let IndexedBucketInner {
                 is_sorted: _,
-                time_range: _,
+                time_range,
                 col_time,
                 col_insert_id,
                 col_row_id,
@@ -159,6 +165,23 @@ impl IndexedBucket {
                 size_bytes: _,
             } = &*inner.read();
 
+            // Time ranges are eagerly maintained.
+            {
+                // NOTE: Only the extrema are needed, so scan the column in place rather than
+                // cloning and sorting it. An empty column yields the inverted `MAX..MIN` range.
+
+                let expected_min = col_time.iter().copied().min().unwrap_or(i64::MAX).into();
+                let expected_max = col_time.iter().copied().max().unwrap_or(i64::MIN).into();
+                let expected_time_range = TimeRange::new(expected_min, expected_max);
+
+                if expected_time_range != *time_range {
+                    return Err(SanityError::TimeRangeOutOfSync {
+                        expected: expected_time_range,
+                        got: *time_range,
+                    });
+                }
+            }
+
             // All columns should be `Self::num_rows` long.
             {
                 let num_rows = self.num_rows();
@@ -191,7 +214,7 @@ impl IndexedBucket {
             }
 
             // The cluster column must be fully dense.
-            {
+            if self.num_rows() > 0 {
                 let cluster_column =
                     columns
                         .get(cluster_key)
@@ -243,7 +266,7 @@ impl PersistentIndexedTable {
 
         // All columns should be `Self::num_rows` long.
         {
-            let num_rows = self.total_rows();
+            let num_rows = self.num_rows();
 
             let column_lengths = [
                 (!col_insert_id.is_empty())
@@ -272,7 +295,7 @@ impl PersistentIndexedTable {
         }
 
         // The cluster column must be fully dense.
-        {
+        if self.num_rows() > 0 {
             let cluster_column =
                 columns
                     .get(cluster_key)
diff --git a/crates/re_arrow_store/src/store_stats.rs b/crates/re_arrow_store/src/store_stats.rs
index 11c91bfc6223..612c03124855 100644
--- a/crates/re_arrow_store/src/store_stats.rs
+++ b/crates/re_arrow_store/src/store_stats.rs
@@ -1,71 +1,184 @@
 use nohash_hasher::IntMap;
-use re_log_types::{ComponentName, DataCellColumn};
+use re_log_types::{ComponentName, SizeBytes, TimePoint};
 
 use crate::{
-    store::IndexedBucketInner, DataStore, IndexedBucket, IndexedTable, PersistentIndexedTable,
+    store::IndexedBucketInner, ClusterCellCache, DataStore, DataTypeRegistry, IndexedBucket,
+    IndexedTable, MetadataRegistry, PersistentIndexedTable,
 };
 
 // ---
 
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd)]
+pub struct DataStoreRowStats {
+    pub num_rows: u64,
+    pub num_bytes: u64,
+}
+
+impl std::ops::Sub for DataStoreRowStats {
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self {
+            num_rows: self.num_rows - rhs.num_rows,
+            num_bytes: self.num_bytes - rhs.num_bytes,
+        }
+    }
+}
+
+impl std::ops::Add for DataStoreRowStats {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self {
+            num_rows: self.num_rows + rhs.num_rows,
+            num_bytes: self.num_bytes + rhs.num_bytes,
+        }
+    }
+}
+
 #[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd)]
 pub struct DataStoreStats {
-    pub total_timeless_rows: u64,
-    pub total_timeless_size_bytes: u64,
+    pub type_registry: DataStoreRowStats,
+    pub metadata_registry: DataStoreRowStats,
+    pub autogenerated: DataStoreRowStats,
+    pub timeless: DataStoreRowStats,
+    pub temporal: DataStoreRowStats,
+    pub temporal_buckets: u64,
+    pub total: DataStoreRowStats,
+}
+
+impl std::ops::Sub for DataStoreStats {
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self {
+            type_registry: self.type_registry - rhs.type_registry,
+            metadata_registry: self.metadata_registry - rhs.metadata_registry,
+            autogenerated: self.autogenerated - rhs.autogenerated,
+            timeless: self.timeless - rhs.timeless,
+            temporal: self.temporal - rhs.temporal,
+            temporal_buckets: self.temporal_buckets - rhs.temporal_buckets,
+            total: self.total - rhs.total,
+        }
+    }
+}
 
-    pub total_temporal_rows: u64,
-    pub total_temporal_size_bytes: u64,
-    pub total_temporal_buckets: u64,
+impl std::ops::Add for DataStoreStats {
+    type Output = Self;
 
-    pub total_rows: u64,
-    pub total_size_bytes: u64,
+    fn add(self, rhs: Self) -> Self::Output {
+        Self {
+            type_registry: self.type_registry + rhs.type_registry,
+            metadata_registry: self.metadata_registry + rhs.metadata_registry,
+            autogenerated: self.autogenerated + rhs.autogenerated,
+            timeless: self.timeless + rhs.timeless,
+            temporal: self.temporal + rhs.temporal,
+            temporal_buckets: self.temporal_buckets + rhs.temporal_buckets,
+            total: self.total + rhs.total,
+        }
+    }
 }
 
 impl DataStoreStats {
     pub fn from_store(store: &DataStore) -> Self {
         crate::profile_function!();
 
-        let total_timeless_rows = store.total_timeless_rows();
-        let total_timeless_size_bytes = store.total_timeless_size_bytes();
+        let type_registry = DataStoreRowStats {
+            num_rows: store.type_registry.len() as _,
+            num_bytes: store.type_registry.total_size_bytes(),
+        };
+
+        let metadata_registry = DataStoreRowStats {
+            num_rows: store.metadata_registry.len() as _,
+            num_bytes: store.metadata_registry.total_size_bytes(),
+        };
+
+        let autogenerated = DataStoreRowStats {
+            num_rows: store.cluster_cell_cache.len() as _,
+            num_bytes: store.cluster_cell_cache.total_size_bytes(),
+        };
+
+        let timeless = DataStoreRowStats {
+            num_rows: store.num_timeless_rows(),
+            num_bytes: store.timeless_size_bytes(),
+        };
+
+        let temporal = DataStoreRowStats {
+            num_rows: store.num_temporal_rows(),
+            num_bytes: store.temporal_size_bytes(),
+        };
+        let temporal_buckets = store.num_temporal_buckets();
+
+        let total = DataStoreRowStats {
+            num_rows: timeless.num_rows + temporal.num_rows,
+            num_bytes: type_registry.num_bytes
+                + metadata_registry.num_bytes
+                + autogenerated.num_bytes
+                + timeless.num_bytes
+                + temporal.num_bytes,
+        };
 
-        let total_temporal_rows = store.total_temporal_rows();
-        let total_temporal_size_bytes = store.total_temporal_size_bytes();
-        let total_temporal_buckets = store.total_temporal_buckets();
+        Self {
+            type_registry,
+            metadata_registry,
+            autogenerated,
+            timeless,
+            temporal,
+            temporal_buckets,
+            total,
+        }
+    }
+}
 
-        let total_rows = total_timeless_rows + total_temporal_rows;
-        let total_size_bytes = total_timeless_size_bytes + total_temporal_size_bytes;
+// --- Data store ---
 
-        Self {
-            total_timeless_rows,
-            total_timeless_size_bytes,
+impl SizeBytes for DataTypeRegistry {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        type K = ComponentName;
 
-            total_temporal_rows,
-            total_temporal_size_bytes,
-            total_temporal_buckets,
+        // NOTE: This is only here to make sure this method fails to compile if the inner type
+        // changes, as the following size computation assumes POD types.
+        let inner: &IntMap<K, _> = &self.0;
 
-            total_rows,
-            total_size_bytes,
-        }
+        let keys_size_bytes = std::mem::size_of::<K>() * inner.len();
+        // NOTE: It's all on the heap at this point.
+        let values_size_bytes = self.values().map(SizeBytes::total_size_bytes).sum::<u64>();
+
+        keys_size_bytes as u64 + values_size_bytes
     }
 }
 
-// --- Data store ---
+impl SizeBytes for MetadataRegistry<TimePoint> {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        self.0.heap_size_bytes()
+    }
+}
+
+impl SizeBytes for ClusterCellCache {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        self.0.heap_size_bytes()
+    }
+}
 
 impl DataStore {
     /// Returns the number of timeless index rows stored across this entire store, i.e. the sum of
     /// the number of rows across all of its timeless indexed tables.
     #[inline]
-    pub fn total_timeless_rows(&self) -> u64 {
+    pub fn num_timeless_rows(&self) -> u64 {
         crate::profile_function!();
         self.timeless_tables
             .values()
-            .map(|table| table.total_rows())
+            .map(|table| table.num_rows())
             .sum()
     }
 
     /// Returns the size of the timeless index data stored across this entire store, i.e. the sum
     /// of the size of the data stored across all of its timeless indexed tables, in bytes.
     #[inline]
-    pub fn total_timeless_size_bytes(&self) -> u64 {
+    pub fn timeless_size_bytes(&self) -> u64 {
         crate::profile_function!();
         self.timeless_tables
             .values()
@@ -76,15 +189,15 @@ impl DataStore {
     /// Returns the number of temporal index rows stored across this entire store, i.e. the sum of
     /// the number of rows across all of its temporal indexed tables.
     #[inline]
-    pub fn total_temporal_rows(&self) -> u64 {
+    pub fn num_temporal_rows(&self) -> u64 {
         crate::profile_function!();
-        self.tables.values().map(|table| table.total_rows()).sum()
+        self.tables.values().map(|table| table.num_rows()).sum()
     }
 
     /// Returns the size of the temporal index data stored across this entire store, i.e. the sum
     /// of the size of the data stored across all of its temporal indexed tables, in bytes.
     #[inline]
-    pub fn total_temporal_size_bytes(&self) -> u64 {
+    pub fn temporal_size_bytes(&self) -> u64 {
         crate::profile_function!();
         self.tables
             .values()
@@ -94,12 +207,9 @@ impl DataStore {
 
     /// Returns the number of temporal indexed buckets stored across this entire store.
     #[inline]
-    pub fn total_temporal_buckets(&self) -> u64 {
+    pub fn num_temporal_buckets(&self) -> u64 {
         crate::profile_function!();
-        self.tables
-            .values()
-            .map(|table| table.total_buckets())
-            .sum()
+        self.tables.values().map(|table| table.num_buckets()).sum()
     }
 }
 
@@ -109,7 +219,7 @@ impl IndexedTable {
     /// Returns the number of rows stored across this entire table, i.e. the sum of the number
     /// of rows stored across all of its buckets.
     #[inline]
-    pub fn total_rows(&self) -> u64 {
+    pub fn num_rows(&self) -> u64 {
         self.buckets_num_rows
     }
 
@@ -118,76 +228,33 @@ impl IndexedTable {
     ///
     /// Recomputed from scratch, for sanity checking.
     #[inline]
-    pub(crate) fn total_rows_uncached(&self) -> u64 {
+    pub(crate) fn num_rows_uncached(&self) -> u64 {
         crate::profile_function!();
         self.buckets.values().map(|bucket| bucket.num_rows()).sum()
     }
 
-    /// The size of both the control & component data stored in this table, across all of its
-    /// buckets, in bytes.
-    ///
-    /// This is a best-effort approximation, adequate for most purposes (stats,
-    /// triggering GCs, ...).
     #[inline]
-    pub fn total_size_bytes(&self) -> u64 {
+    pub(crate) fn size_bytes_uncached(&self) -> u64 {
         crate::profile_function!();
-
-        let Self {
-            timeline,
-            ent_path,
-            cluster_key,
-            buckets: _,
-            all_components,
-            buckets_num_rows: _,
-            buckets_size_bytes,
-        } = self;
-
-        let size_bytes = std::mem::size_of_val(timeline)
-            + std::mem::size_of_val(ent_path)
-            + std::mem::size_of_val(cluster_key)
-            + (all_components.len() * std::mem::size_of::<ComponentName>());
-
-        size_bytes as u64 + buckets_size_bytes
+        self.stack_size_bytes()
+            + self
+                .buckets
+                .values()
+                .map(|bucket| bucket.total_size_bytes())
+                .sum::<u64>()
     }
 
-    /// The size of both the control & component data stored in this table, across all of its
-    /// buckets, in bytes.
-    ///
-    /// This is a best-effort approximation, adequate for most purposes (stats,
-    /// triggering GCs, ...).
-    ///
-    /// Recomputed from scratch, for sanity checking.
+    /// Returns the number of buckets stored across this entire table.
     #[inline]
-    pub(crate) fn total_size_bytes_uncached(&self) -> u64 {
-        crate::profile_function!();
-
-        let Self {
-            timeline,
-            ent_path,
-            cluster_key,
-            buckets,
-            all_components,
-            buckets_num_rows: _,
-            buckets_size_bytes: _,
-        } = self;
-
-        let buckets_size_bytes = buckets
-            .values()
-            .map(|bucket| bucket.size_bytes())
-            .sum::<u64>();
-
-        let size_bytes = std::mem::size_of_val(timeline)
-            + std::mem::size_of_val(ent_path)
-            + std::mem::size_of_val(cluster_key)
-            + (all_components.len() * std::mem::size_of::<ComponentName>());
-
-        size_bytes as u64 + buckets_size_bytes
+    pub fn num_buckets(&self) -> u64 {
+        self.buckets.len() as _
     }
+}
 
-    /// Returns the number of buckets stored across this entire table.
+impl SizeBytes for IndexedTable {
     #[inline]
-    pub fn total_buckets(&self) -> u64 {
-        self.buckets.len() as _
+    fn heap_size_bytes(&self) -> u64 {
+        self.buckets_size_bytes
     }
 }
 
@@ -198,29 +265,18 @@ impl IndexedBucket {
         crate::profile_function!();
         self.inner.read().col_time.len() as u64
     }
+}
 
-    /// The size of both the control & component data stored in this bucket, in bytes.
-    ///
-    /// This is a best-effort approximation, adequate for most purposes (stats,
-    /// triggering GCs, ...).
+impl SizeBytes for IndexedBucket {
     #[inline]
-    pub fn size_bytes(&self) -> u64 {
-        crate::profile_function!();
-
-        let Self {
-            timeline,
-            cluster_key,
-            inner,
-        } = self;
-
-        (std::mem::size_of_val(timeline) + std::mem::size_of_val(cluster_key)) as u64
-            + inner.read().size_bytes
+    fn heap_size_bytes(&self) -> u64 {
+        self.inner.read().size_bytes
     }
 }
 
 impl IndexedBucketInner {
     /// Computes and caches the size of both the control & component data stored in this bucket,
-    /// in bytes.
+    /// stack and heap included, in bytes.
     ///
     /// This is a best-effort approximation, adequate for most purposes (stats,
     /// triggering GCs, ...).
@@ -239,17 +295,14 @@ impl IndexedBucketInner {
             size_bytes,
         } = self;
 
-        let control_size_bytes = std::mem::size_of_val(is_sorted)
-            + std::mem::size_of_val(time_range)
-            + std::mem::size_of_val(col_time.as_slice())
-            + std::mem::size_of_val(col_insert_id.as_slice())
-            + std::mem::size_of_val(col_row_id.as_slice())
-            + std::mem::size_of_val(col_num_instances.as_slice())
-            + std::mem::size_of_val(size_bytes);
-
-        let data_size_bytes = compute_columns_size_bytes(columns);
-
-        *size_bytes = control_size_bytes as u64 + data_size_bytes;
+        *size_bytes = is_sorted.total_size_bytes()
+            + time_range.total_size_bytes()
+            + col_time.total_size_bytes()
+            + col_insert_id.total_size_bytes()
+            + col_row_id.total_size_bytes()
+            + col_num_instances.total_size_bytes()
+            + columns.total_size_bytes()
+            + size_bytes.total_size_bytes();
 
         *size_bytes
     }
@@ -260,16 +313,14 @@ impl IndexedBucketInner {
 impl PersistentIndexedTable {
     /// Returns the number of rows stored across this table.
     #[inline]
-    pub fn total_rows(&self) -> u64 {
+    pub fn num_rows(&self) -> u64 {
         self.col_num_instances.len() as _
     }
+}
 
-    /// The size of both the control & component data stored in this table, in bytes.
-    ///
-    /// This is a best-effort approximation, adequate for most purposes (stats,
-    /// triggering GCs, ...).
+impl SizeBytes for PersistentIndexedTable {
     #[inline]
-    pub fn total_size_bytes(&self) -> u64 {
+    fn heap_size_bytes(&self) -> u64 {
         crate::profile_function!();
 
         let Self {
@@ -281,35 +332,11 @@ impl PersistentIndexedTable {
             columns,
         } = self;
 
-        let control_size_bytes = std::mem::size_of_val(ent_path)
-            + std::mem::size_of_val(cluster_key)
-            + std::mem::size_of_val(col_insert_id.as_slice())
-            + std::mem::size_of_val(col_row_id.as_slice())
-            + std::mem::size_of_val(col_num_instances.as_slice());
-
-        let data_size_bytes = compute_columns_size_bytes(columns);
-
-        control_size_bytes as u64 + data_size_bytes
+        ent_path.total_size_bytes()
+            + cluster_key.total_size_bytes()
+            + col_insert_id.total_size_bytes()
+            + col_row_id.total_size_bytes()
+            + col_num_instances.total_size_bytes()
+            + columns.total_size_bytes()
     }
 }
-
-// --- Common ---
-
-/// Computes the size in bytes of an entire table's worth of arrow data.
-fn compute_columns_size_bytes(columns: &IntMap<ComponentName, DataCellColumn>) -> u64 {
-    crate::profile_function!();
-    let keys = (columns.keys().len() * std::mem::size_of::<ComponentName>()) as u64;
-    let cells = columns
-        .values()
-        .flat_map(|column| column.iter())
-        .flatten() // option
-        .map(|cell| cell.size_bytes())
-        .sum::<u64>();
-    keys + cells
-}
-
-#[test]
-fn compute_table_size_bytes_ignore_headers() {
-    let columns = Default::default();
-    assert_eq!(0, compute_columns_size_bytes(&columns));
-}
diff --git a/crates/re_arrow_store/src/store_write.rs b/crates/re_arrow_store/src/store_write.rs
index 2f68bedf445c..853bdf4b222c 100644
--- a/crates/re_arrow_store/src/store_write.rs
+++ b/crates/re_arrow_store/src/store_write.rs
@@ -7,12 +7,12 @@ use smallvec::SmallVec;
 use re_log::{debug, trace};
 use re_log_types::{
     component_types::InstanceKey, ComponentName, DataCell, DataCellColumn, DataCellError, DataRow,
-    DataTable, TimeInt, TimeRange,
+    DataTable, RowId, SizeBytes as _, TimeInt, TimePoint, TimeRange,
 };
 
 use crate::{
-    DataStore, DataStoreConfig, IndexedBucket, IndexedBucketInner, IndexedTable,
-    PersistentIndexedTable,
+    store::MetadataRegistry, DataStore, DataStoreConfig, IndexedBucket, IndexedBucketInner,
+    IndexedTable, PersistentIndexedTable,
 };
 
 // TODO(#1619):
@@ -184,12 +184,18 @@ impl DataStore {
             }
         }
 
-        // This is valuable information even for a timeless timepoint!
-        self.metadata_registry.insert(*row_id, timepoint.clone());
+        self.metadata_registry.upsert(*row_id, timepoint.clone());
 
         Ok(())
     }
 
+    /// Wipes all timeless data.
+    ///
+    /// Mostly useful for testing/debugging purposes.
+    pub fn wipe_timeless_data(&mut self) {
+        self.timeless_tables = Default::default();
+    }
+
     /// Auto-generates an appropriate cluster cell for the specified number of instances and
     /// transparently handles caching.
     // TODO(#1777): shared slices for auto generated keys
@@ -225,6 +231,29 @@ impl DataStore {
     }
 }
 
+impl MetadataRegistry<TimePoint> {
+    fn upsert(&mut self, row_id: RowId, timepoint: TimePoint) {
+        // This is valuable information even for a timeless timepoint!
+        match self.entry(row_id) {
+            std::collections::btree_map::Entry::Vacant(entry) => {
+                entry.insert(timepoint);
+            }
+            // NOTE: When saving and loading data from disk, it's very possible that we try to
+            // insert data for a single `RowId` in multiple calls (buckets are per-timeline, so a
+            // single `RowId` can get spread across multiple buckets)!
+            std::collections::btree_map::Entry::Occupied(mut entry) => {
+                let entry = entry.get_mut();
+                for (timeline, time) in timepoint {
+                    if let Some(old_time) = entry.insert(timeline, time) {
+                        re_log::error!(%row_id, ?timeline, old_time = ?old_time, new_time = ?time, "detected re-used `RowId/Timeline` pair, this is illegal and will lead to undefined behavior in the datastore");
+                        debug_assert!(false, "detected re-used `RowId/Timeline`");
+                    }
+                }
+            }
+        }
+    }
+}
+
 // --- Temporal ---
 
 impl IndexedTable {
@@ -250,7 +279,7 @@ impl IndexedTable {
         let len_overflow = len > config.indexed_bucket_num_rows;
 
         if len_overflow {
-            let bucket_size_before = bucket.size_bytes();
+            let bucket_size_before = bucket.total_size_bytes();
             if let Some((min, second_half)) = bucket.split() {
                 trace!(
                     kind = "insert",
@@ -263,7 +292,8 @@ impl IndexedTable {
                     "splitting off indexed bucket following overflow"
                 );
 
-                self.buckets_size_bytes += bucket.size_bytes() + second_half.size_bytes();
+                self.buckets_size_bytes +=
+                    bucket.total_size_bytes() + second_half.total_size_bytes();
                 self.buckets_size_bytes -= bucket_size_before;
                 self.buckets.insert(min, second_half);
 
@@ -393,26 +423,28 @@ impl IndexedBucket {
         // append time to primary column and update time range appropriately
         col_time.push(time.as_i64());
         *time_range = TimeRange::new(time_range.min.min(time), time_range.max.max(time));
-        size_bytes_added += std::mem::size_of_val(&time.as_i64()) as u64;
+        size_bytes_added += time.as_i64().total_size_bytes();
 
         // update all control columns
         if let Some(insert_id) = insert_id {
             col_insert_id.push(insert_id);
-            size_bytes_added += std::mem::size_of_val(&insert_id) as u64;
+            size_bytes_added += insert_id.total_size_bytes();
         }
         col_row_id.push(row.row_id());
-        size_bytes_added += std::mem::size_of_val(&row.row_id()) as u64;
+        size_bytes_added += row.row_id().total_size_bytes();
         col_num_instances.push(row.num_instances());
-        size_bytes_added += std::mem::size_of_val(&row.num_instances()) as u64;
+        size_bytes_added += row.num_instances().total_size_bytes();
 
         // insert auto-generated cluster cell if present
         if let Some(cluster_cell) = generated_cluster_cell {
             let component = cluster_cell.component_name();
             let column = columns.entry(component).or_insert_with(|| {
-                size_bytes_added += std::mem::size_of_val(&component) as u64;
-                DataCellColumn::empty(num_rows)
+                let column = DataCellColumn::empty(num_rows);
+                size_bytes_added += component.total_size_bytes();
+                size_bytes_added += column.total_size_bytes();
+                column
             });
-            size_bytes_added += cluster_cell.size_bytes();
+            size_bytes_added += cluster_cell.total_size_bytes();
             column.0.push(Some(cluster_cell));
         }
 
@@ -422,10 +454,12 @@ impl IndexedBucket {
         for cell in row.cells().iter() {
             let component = cell.component_name();
             let column = columns.entry(component).or_insert_with(|| {
-                size_bytes_added += std::mem::size_of_val(&component) as u64;
-                DataCellColumn::empty(col_time.len().saturating_sub(1))
+                let column = DataCellColumn::empty(col_time.len().saturating_sub(1));
+                size_bytes_added += component.total_size_bytes();
+                size_bytes_added += column.total_size_bytes();
+                column
             });
-            size_bytes_added += cell.size_bytes();
+            size_bytes_added += cell.total_size_bytes();
             column.0.push(Some(cell.clone() /* shallow */));
         }
 
@@ -439,7 +473,9 @@ impl IndexedBucket {
             }
 
             if !components.contains(component) {
-                column.0.push(None);
+                let none_cell: Option<DataCell> = None;
+                size_bytes_added += none_cell.total_size_bytes();
+                column.0.push(none_cell);
             }
         }
 
@@ -605,11 +641,11 @@ impl IndexedBucket {
             self.sanity_check().unwrap();
             bucket2.sanity_check().unwrap();
 
-            let total_rows1 = self.num_rows() as i64;
-            let total_rows2 = bucket2.num_rows() as i64;
+            let num_rows1 = self.num_rows() as i64;
+            let num_rows2 = bucket2.num_rows() as i64;
             debug_assert_eq!(
                 _num_rows as i64,
-                total_rows1 + total_rows2,
+                num_rows1 + num_rows2,
                 "expected both buckets to sum up to the length of the original bucket"
             );
         }
@@ -747,7 +783,7 @@ impl PersistentIndexedTable {
     ) {
         crate::profile_function!();
 
-        let num_rows = self.total_rows() as usize;
+        let num_rows = self.num_rows() as usize;
 
         let Self {
             ent_path: _,
diff --git a/crates/re_arrow_store/src/test_util.rs b/crates/re_arrow_store/src/test_util.rs
index 0fe1b28bf8c2..35dc129bddd4 100644
--- a/crates/re_arrow_store/src/test_util.rs
+++ b/crates/re_arrow_store/src/test_util.rs
@@ -1,28 +1,32 @@
-use crate::DataStoreConfig;
+use crate::{DataStore, DataStoreConfig};
 
 // ---
 
 #[doc(hidden)]
 #[macro_export]
 macro_rules! test_row {
-    ($entity:ident @ $frames:tt => $n:expr; [$c0:expr $(,)*]) => {
-        ::re_log_types::DataRow::from_cells1(
+    ($entity:ident @ $frames:tt => $n:expr; [$c0:expr $(,)*]) => {{
+        let mut row = ::re_log_types::DataRow::from_cells1(
             ::re_log_types::RowId::random(),
             $entity.clone(),
             $frames,
             $n,
             $c0,
-        )
-    };
-    ($entity:ident @ $frames:tt => $n:expr; [$c0:expr, $c1:expr $(,)*]) => {
-        ::re_log_types::DataRow::from_cells2(
+        );
+        row.compute_all_size_bytes();
+        row
+    }};
+    ($entity:ident @ $frames:tt => $n:expr; [$c0:expr, $c1:expr $(,)*]) => {{
+        let mut row = ::re_log_types::DataRow::from_cells2(
             ::re_log_types::RowId::random(),
             $entity.clone(),
             $frames,
             $n,
             ($c0, $c1),
-        )
-    };
+        );
+        row.compute_all_size_bytes();
+        row
+    }};
 }
 
 pub fn all_configs() -> impl Iterator<Item = DataStoreConfig> {
@@ -51,3 +55,11 @@ pub fn all_configs() -> impl Iterator<Item = DataStoreConfig> {
         enable_typecheck: idx.enable_typecheck,
     })
 }
+
+pub fn sanity_unwrap(store: &mut DataStore) {
+    if let err @ Err(_) = store.sanity_check() {
+        store.sort_indices_if_needed();
+        eprintln!("{store}");
+        err.unwrap();
+    }
+}
diff --git a/crates/re_arrow_store/tests/correctness.rs b/crates/re_arrow_store/tests/correctness.rs
index d6d68fcfe683..fba86298332e 100644
--- a/crates/re_arrow_store/tests/correctness.rs
+++ b/crates/re_arrow_store/tests/correctness.rs
@@ -6,7 +6,10 @@ use std::sync::atomic::{AtomicBool, Ordering::SeqCst};
 
 use rand::Rng;
 
-use re_arrow_store::{test_row, DataStore, DataStoreConfig, LatestAtQuery, WriteError};
+use re_arrow_store::{
+    test_row, test_util::sanity_unwrap, DataStore, DataStoreConfig, DataStoreStats,
+    GarbageCollectionTarget, LatestAtQuery, WriteError,
+};
 use re_log_types::{
     component_types::InstanceKey,
     datagen::{
@@ -99,11 +102,7 @@ fn latest_at_emptiness_edge_cases_impl(store: &mut DataStore) {
             ] => num_instances; [build_some_instances(num_instances as _)]))
         .unwrap();
 
-    if let err @ Err(_) = store.sanity_check() {
-        store.sort_indices_if_needed();
-        eprintln!("{store}");
-        err.unwrap();
-    }
+    sanity_unwrap(store);
 
     let timeline_wrong_name = Timeline::new("lag_time", TimeType::Time);
     let timeline_wrong_kind = Timeline::new("log_time", TimeType::Sequence);
@@ -282,6 +281,8 @@ fn gc_correct() {
         },
     );
 
+    let stats_empty = DataStoreStats::from_store(&store);
+
     let mut rng = rand::thread_rng();
 
     let num_frames = rng.gen_range(0..=100);
@@ -300,71 +301,37 @@ fn gc_correct() {
         }
     }
 
-    if let err @ Err(_) = store.sanity_check() {
-        store.sort_indices_if_needed();
-        eprintln!("{store}");
-        err.unwrap();
-    }
+    sanity_unwrap(&mut store);
+    check_still_readable(&store);
+
+    let stats = DataStoreStats::from_store(&store);
+
+    let (row_ids, stats_diff) = store.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+    let stats_diff = stats_diff + stats_empty; // account for fixed overhead
+
+    assert_eq!(row_ids.len() as u64, stats.total.num_rows);
+    assert_eq!(
+        stats.metadata_registry.num_rows,
+        stats_diff.metadata_registry.num_rows
+    );
+    assert_eq!(
+        stats.metadata_registry.num_bytes,
+        stats_diff.metadata_registry.num_bytes
+    );
+    assert_eq!(stats.temporal.num_rows, stats_diff.temporal.num_rows);
+
+    sanity_unwrap(&mut store);
     check_still_readable(&store);
+    for row_id in &row_ids {
+        assert!(store.get_msg_metadata(row_id).is_none());
+    }
+
+    let (row_ids, stats_diff) = store.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+    assert!(row_ids.is_empty());
+    assert_eq!(DataStoreStats::default(), stats_diff);
 
-    // TODO(#1619): bring back garbage collection
-
-    // let row_id_chunks = store.gc(
-    //     GarbageCollectionTarget::DropAtLeastPercentage(1.0),
-    //     Timeline::new("frame_nr", TimeType::Sequence),
-    //     MsgId::name(),
-    // );
-
-    // let row_ids = row_id_chunks
-    //     .iter()
-    //     .flat_map(|chunk| arrow_array_deserialize_iterator::<Option<MsgId>>(&**chunk).unwrap())
-    //     .map(Option::unwrap) // MsgId is always present
-    //     .collect::<ahash::HashSet<_>>();
-    // assert!(!row_ids.is_empty());
-
-    // if let err @ Err(_) = store.sanity_check() {
-    //     store.sort_indices_if_needed();
-    //     eprintln!("{store}");
-    //     err.unwrap();
-    // }
-    // check_still_readable(&store);
-    // for row_id in &row_ids {
-    //     assert!(store.get_msg_metadata(row_id).is_some());
-    // }
-
-    // store.clear_msg_metadata(&row_ids);
-
-    // if let err @ Err(_) = store.sanity_check() {
-    //     store.sort_indices_if_needed();
-    //     eprintln!("{store}");
-    //     err.unwrap();
-    // }
-    // check_still_readable(&store);
-    // for row_id in &row_ids {
-    //     assert!(store.get_msg_metadata(row_id).is_none());
-    // }
-
-    // let row_id_chunks = store.gc(
-    //     GarbageCollectionTarget::DropAtLeastPercentage(1.0),
-    //     Timeline::new("frame_nr", TimeType::Sequence),
-    //     MsgId::name(),
-    // );
-
-    // let row_ids = row_id_chunks
-    //     .iter()
-    //     .flat_map(|chunk| arrow_array_deserialize_iterator::<Option<MsgId>>(&**chunk).unwrap())
-    //     .map(Option::unwrap) // MsgId is always present
-    //     .collect::<ahash::HashSet<_>>();
-    // assert!(row_ids.is_empty());
-
-    // if let err @ Err(_) = store.sanity_check() {
-    //     store.sort_indices_if_needed();
-    //     eprintln!("{store}");
-    //     err.unwrap();
-    // }
-    // check_still_readable(&store);
-
-    // assert_eq!(2, store.total_temporal_component_rows());
+    sanity_unwrap(&mut store);
+    check_still_readable(&store);
 }
 
 fn check_still_readable(_store: &DataStore) {
diff --git a/crates/re_arrow_store/tests/data_store.rs b/crates/re_arrow_store/tests/data_store.rs
index 29af874ae0e3..5ba2a722bf80 100644
--- a/crates/re_arrow_store/tests/data_store.rs
+++ b/crates/re_arrow_store/tests/data_store.rs
@@ -11,8 +11,8 @@ use polars_core::{prelude::*, series::Series};
 use polars_ops::prelude::DataFrameJoinOps;
 use rand::Rng;
 use re_arrow_store::{
-    polars_util, test_row, DataStore, DataStoreConfig, LatestAtQuery, RangeQuery, TimeInt,
-    TimeRange,
+    polars_util, test_row, test_util::sanity_unwrap, DataStore, DataStoreConfig, DataStoreStats,
+    GarbageCollectionTarget, LatestAtQuery, RangeQuery, TimeInt, TimeRange,
 };
 use re_log_types::{
     component_types::{ColorRGBA, InstanceKey, Point2D, Rect2D},
@@ -48,6 +48,12 @@ fn all_components() {
                 store2.insert_table(&table).unwrap();
             }
 
+            // Stress test GC
+            store2.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+            for table in store.to_data_tables(None) {
+                store2.insert_table(&table).unwrap();
+            }
+
             let mut store = store2;
             let timeline = Timeline::new("frame_nr", TimeType::Sequence);
 
@@ -110,11 +116,7 @@ fn all_components() {
 
         assert_latest_components_at(&mut store, &ent_path, Some(components_b));
 
-        if let err @ Err(_) = store.sanity_check() {
-            store.sort_indices_if_needed();
-            eprintln!("{store}");
-            err.unwrap();
-        }
+        sanity_unwrap(&mut store);
     }
 
     // Tiny buckets, demonstrating the harder-to-reason-about cases.
@@ -172,11 +174,7 @@ fn all_components() {
 
         assert_latest_components_at(&mut store, &ent_path, Some(components_b));
 
-        if let err @ Err(_) = store.sanity_check() {
-            store.sort_indices_if_needed();
-            eprintln!("{store}");
-            err.unwrap();
-        }
+        sanity_unwrap(&mut store);
     }
 
     // Tiny buckets and tricky splits, demonstrating a case that is not only extremely hard to
@@ -242,11 +240,7 @@ fn all_components() {
 
         assert_latest_components_at(&mut store, &ent_path, Some(components_b));
 
-        if let err @ Err(_) = store.sanity_check() {
-            store.sort_indices_if_needed();
-            eprintln!("{store}");
-            err.unwrap();
-        }
+        sanity_unwrap(&mut store);
     }
 }
 
@@ -259,14 +253,6 @@ fn latest_at() {
     for config in re_arrow_store::test_util::all_configs() {
         let mut store = DataStore::new(InstanceKey::name(), config.clone());
         latest_at_impl(&mut store);
-
-        // TODO(#1619): bring back garbage collection
-        // store.gc(
-        //     GarbageCollectionTarget::DropAtLeastPercentage(1.0),
-        //     Timeline::new("frame_nr", TimeType::Sequence),
-        //     MsgId::name(),
-        // );
-        // latest_at_impl(&mut store);
     }
 }
 
@@ -317,13 +303,14 @@ fn latest_at_impl(store: &mut DataStore) {
     for table in store.to_data_tables(None) {
         store2.insert_table(&table).unwrap();
     }
+    // Stress test GC
+    store2.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+    for table in store.to_data_tables(None) {
+        store2.insert_table(&table).unwrap();
+    }
     let mut store = store2;
 
-    if let err @ Err(_) = store.sanity_check() {
-        store.sort_indices_if_needed();
-        eprintln!("{store}");
-        err.unwrap();
-    }
+    sanity_unwrap(&mut store);
 
     let mut assert_latest_components = |frame_nr: TimeInt, rows: &[(ComponentName, &DataRow)]| {
         let timeline_frame_nr = Timeline::new("frame_nr", TimeType::Sequence);
@@ -442,11 +429,7 @@ fn range_impl(store: &mut DataStore) {
     let row4_4 = test_row!(ent_path @ [build_frame_nr(frame4)] => 5; [insts4_3, points4_4]);
     insert(store, &row4_4);
 
-    if let err @ Err(_) = store.sanity_check() {
-        store.sort_indices_if_needed();
-        eprintln!("{store}");
-        err.unwrap();
-    }
+    sanity_unwrap(store);
 
     // Each entry in `rows_at_times` corresponds to a dataframe that's expected to be returned
     // by the range query.
@@ -462,6 +445,11 @@ fn range_impl(store: &mut DataStore) {
             for table in store.to_data_tables(None) {
                 store2.insert_table(&table).unwrap();
             }
+            store2.wipe_timeless_data();
+            store2.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+            for table in store.to_data_tables(None) {
+                store2.insert_table(&table).unwrap();
+            }
             let mut store = store2;
 
             let mut expected_timeless = Vec::<DataFrame>::new();
@@ -882,36 +870,32 @@ fn gc_impl(store: &mut DataStore) {
             }
         }
 
-        if let err @ Err(_) = store.sanity_check() {
-            store.sort_indices_if_needed();
-            eprintln!("{store}");
-            err.unwrap();
-        }
+        sanity_unwrap(store);
         _ = store.to_dataframe(); // simple way of checking that everything is still readable
 
-        // TODO(#1619): bring back garbage collection
-
-        // let row_id_chunks = store.gc(
-        //     GarbageCollectionTarget::DropAtLeastPercentage(1.0 / 3.0),
-        //     Timeline::new("frame_nr", TimeType::Sequence),
-        //     MsgId::name(),
-        // );
-
-        // let row_ids = row_id_chunks
-        //     .iter()
-        //     .flat_map(|chunk| arrow_array_deserialize_iterator::<Option<MsgId>>(&**chunk).unwrap())
-        //     .map(Option::unwrap) // MsgId is always present
-        //     .collect::<ahash::HashSet<_>>();
+        let stats = DataStoreStats::from_store(store);
 
-        // for row_id in &row_ids {
-        //     assert!(store.get_msg_metadata(row_id).is_some());
-        // }
-
-        // store.clear_msg_metadata(&row_ids);
+        let (row_ids, stats_diff) =
+            store.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0 / 3.0));
+        for row_id in &row_ids {
+            assert!(store.get_msg_metadata(row_id).is_none());
+        }
 
-        // for row_id in &row_ids {
-        //     assert!(store.get_msg_metadata(row_id).is_none());
-        // }
+        // NOTE: only temporal data and row metadata get purged!
+        let num_bytes_dropped =
+            (stats_diff.temporal.num_bytes + stats_diff.metadata_registry.num_bytes) as f64;
+        let num_bytes_dropped_expected_min =
+            (stats.temporal.num_bytes + stats.metadata_registry.num_bytes) as f64 * 0.95 / 3.0;
+        let num_bytes_dropped_expected_max =
+            (stats.temporal.num_bytes + stats.metadata_registry.num_bytes) as f64 * 1.05 / 3.0;
+        assert!(
+            num_bytes_dropped_expected_min <= num_bytes_dropped
+                && num_bytes_dropped <= num_bytes_dropped_expected_max,
+            "{} <= {} <= {}",
+            re_format::format_bytes(num_bytes_dropped_expected_min),
+            re_format::format_bytes(num_bytes_dropped),
+            re_format::format_bytes(num_bytes_dropped_expected_max),
+        );
     }
 }
 
diff --git a/crates/re_arrow_store/tests/dump.rs b/crates/re_arrow_store/tests/dump.rs
index 5293870bf882..5a18c4d962e9 100644
--- a/crates/re_arrow_store/tests/dump.rs
+++ b/crates/re_arrow_store/tests/dump.rs
@@ -3,7 +3,10 @@
 use std::sync::atomic::{AtomicBool, Ordering};
 
 use itertools::Itertools;
-use re_arrow_store::{test_row, DataStore, DataStoreStats, TimeInt, TimeRange, Timeline};
+use re_arrow_store::{
+    test_row, test_util::sanity_unwrap, DataStore, DataStoreStats, GarbageCollectionTarget,
+    TimeInt, TimeRange, Timeline,
+};
 use re_log_types::{
     component_types::InstanceKey,
     datagen::{
@@ -27,6 +30,16 @@ fn data_store_dump() {
         let mut store3 = DataStore::new(InstanceKey::name(), config.clone());
 
         data_store_dump_impl(&mut store1, &mut store2, &mut store3);
+
+        // stress-test GC impl
+        store1.wipe_timeless_data();
+        store1.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+        store2.wipe_timeless_data();
+        store2.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+        store3.wipe_timeless_data();
+        store3.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+
+        data_store_dump_impl(&mut store1, &mut store2, &mut store3);
     }
 }
 
@@ -52,31 +65,19 @@ fn data_store_dump_impl(store1: &mut DataStore, store2: &mut DataStore, store3:
     for table in &tables {
         insert_table(store1, table);
     }
-    if let err @ Err(_) = store1.sanity_check() {
-        store1.sort_indices_if_needed();
-        eprintln!("{store1}");
-        err.unwrap();
-    }
+    sanity_unwrap(store1);
 
     // Dump the first store into the second one.
     for table in store1.to_data_tables(None) {
         store2.insert_table(&table).unwrap();
     }
-    if let err @ Err(_) = store2.sanity_check() {
-        store2.sort_indices_if_needed();
-        eprintln!("{store2}");
-        err.unwrap();
-    }
+    sanity_unwrap(store2);
 
     // Dump the second store into the third one.
     for table in store2.to_data_tables(None) {
         store3.insert_table(&table).unwrap();
     }
-    if let err @ Err(_) = store3.sanity_check() {
-        store3.sort_indices_if_needed();
-        eprintln!("{store3}");
-        err.unwrap();
-    }
+    sanity_unwrap(store3);
 
     let store1_df = store1.to_dataframe();
     let store2_df = store2.to_dataframe();
@@ -94,12 +95,14 @@ fn data_store_dump_impl(store1: &mut DataStore, store2: &mut DataStore, store3:
     let store2_stats = DataStoreStats::from_store(store2);
     let store3_stats = DataStoreStats::from_store(store3);
     assert!(
-        store1_stats <= store2_stats,
+        store1_stats.temporal.num_bytes <= store2_stats.temporal.num_bytes
+            && store1_stats.timeless.num_bytes <= store2_stats.timeless.num_bytes,
         "First store should have <= amount of data of second store:\n\
             {store1_stats:#?}\n{store2_stats:#?}"
     );
     assert!(
-        store2_stats <= store3_stats,
+        store2_stats.temporal.num_bytes <= store3_stats.temporal.num_bytes
+            && store2_stats.timeless.num_bytes <= store3_stats.timeless.num_bytes,
         "Second store should have <= amount of data of third store:\n\
             {store2_stats:#?}\n{store3_stats:#?}"
     );
@@ -119,6 +122,12 @@ fn data_store_dump_filtered() {
         let mut store2 = DataStore::new(InstanceKey::name(), config.clone());
 
         data_store_dump_filtered_impl(&mut store1, &mut store2);
+
+        // stress-test GC impl
+        store1.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+        store2.gc(GarbageCollectionTarget::DropAtLeastFraction(1.0));
+
+        data_store_dump_filtered_impl(&mut store1, &mut store2);
     }
 }
 
@@ -140,11 +149,7 @@ fn data_store_dump_filtered_impl(store1: &mut DataStore, store2: &mut DataStore)
     for table in &tables {
         store1.insert_table(table).unwrap();
     }
-    if let err @ Err(_) = store1.sanity_check() {
-        store1.sort_indices_if_needed();
-        eprintln!("{store1}");
-        err.unwrap();
-    }
+    sanity_unwrap(store1);
 
     // Dump frame1 from the first store into the second one.
     for table in store1.to_data_tables((timeline_frame_nr, TimeRange::new(frame1, frame1)).into()) {
@@ -166,11 +171,7 @@ fn data_store_dump_filtered_impl(store1: &mut DataStore, store2: &mut DataStore)
     for table in store1.to_data_tables((timeline_frame_nr, TimeRange::new(frame4, frame4)).into()) {
         store2.insert_table(&table).unwrap();
     }
-    if let err @ Err(_) = store2.sanity_check() {
-        store2.sort_indices_if_needed();
-        eprintln!("{store2}");
-        err.unwrap();
-    }
+    sanity_unwrap(store2);
 
     let store1_df = store1.to_dataframe();
     let store2_df = store2.to_dataframe();
@@ -182,7 +183,8 @@ fn data_store_dump_filtered_impl(store1: &mut DataStore, store2: &mut DataStore)
     let store1_stats = DataStoreStats::from_store(store1);
     let store2_stats = DataStoreStats::from_store(store2);
     assert!(
-        store1_stats <= store2_stats,
+        store1_stats.temporal.num_bytes <= store2_stats.temporal.num_bytes
+            && store1_stats.timeless.num_bytes <= store2_stats.timeless.num_bytes,
         "First store should have <= amount of data of second store:\n\
             {store1_stats:#?}\n{store2_stats:#?}"
     );
diff --git a/crates/re_arrow_store/tests/internals.rs b/crates/re_arrow_store/tests/internals.rs
index 9233438b72e0..63888bf65b21 100644
--- a/crates/re_arrow_store/tests/internals.rs
+++ b/crates/re_arrow_store/tests/internals.rs
@@ -53,7 +53,7 @@ fn pathological_bucket_topology() {
         let timepoint = TimePoint::from([build_frame_nr(frame_nr.into())]);
         for _ in 0..num {
             let row = DataRow::from_cells1(
-                RowId::ZERO,
+                RowId::random(),
                 ent_path.clone(),
                 timepoint.clone(),
                 num_instances,
@@ -62,7 +62,7 @@ fn pathological_bucket_topology() {
             store_forward.insert_row(&row).unwrap();
 
             let row = DataRow::from_cells1(
-                RowId::ZERO,
+                RowId::random(),
                 ent_path.clone(),
                 timepoint.clone(),
                 num_instances,
@@ -84,7 +84,7 @@ fn pathological_bucket_topology() {
             .map(|frame_nr| {
                 let timepoint = TimePoint::from([build_frame_nr(frame_nr.into())]);
                 DataRow::from_cells1(
-                    RowId::ZERO,
+                    RowId::random(),
                     ent_path.clone(),
                     timepoint,
                     num_instances,
diff --git a/crates/re_data_store/Cargo.toml b/crates/re_data_store/Cargo.toml
index 4b5991b37c0a..607efc62ef3b 100644
--- a/crates/re_data_store/Cargo.toml
+++ b/crates/re_data_store/Cargo.toml
@@ -25,10 +25,11 @@ serde = ["dep:serde", "re_log_types/serde"]
 
 [dependencies]
 re_arrow_store.workspace = true
+re_format.workspace = true
 re_int_histogram.workspace = true
+re_log.workspace = true
 re_log_encoding = { workspace = true, optional = true }
 re_log_types.workspace = true
-re_log.workspace = true
 re_smart_channel.workspace = true
 
 ahash.workspace = true
diff --git a/crates/re_data_store/src/log_db.rs b/crates/re_data_store/src/log_db.rs
index 5821b97cac2a..b520262aa4f9 100644
--- a/crates/re_data_store/src/log_db.rs
+++ b/crates/re_data_store/src/log_db.rs
@@ -204,8 +204,8 @@ impl LogDb {
     }
 
     pub fn num_rows(&self) -> usize {
-        self.entity_db.data_store.total_timeless_rows() as usize
-            + self.entity_db.data_store.total_temporal_rows() as usize
+        self.entity_db.data_store.num_timeless_rows() as usize
+            + self.entity_db.data_store.num_temporal_rows() as usize
     }
 
     pub fn is_empty(&self) -> bool {
@@ -251,9 +251,16 @@ impl LogDb {
         crate::profile_function!();
         assert!((0.0..=1.0).contains(&fraction_to_purge));
 
-        // TODO(#1619): bring back garbage collection
-        let drop_row_ids: ahash::HashSet<_> = Default::default();
+        let (drop_row_ids, stats_diff) = self.entity_db.data_store.gc(
+            re_arrow_store::GarbageCollectionTarget::DropAtLeastFraction(fraction_to_purge as _),
+        );
+        re_log::debug!(
+            num_row_ids_dropped = drop_row_ids.len(),
+            size_bytes_dropped = re_format::format_bytes(stats_diff.total.num_bytes as _),
+            "purged datastore"
+        );
 
+        let drop_row_ids: ahash::HashSet<_> = drop_row_ids.into_iter().collect();
         let cutoff_times = self.entity_db.data_store.oldest_time_per_timeline();
 
         let Self {
diff --git a/crates/re_log_types/src/data_cell.rs b/crates/re_log_types/src/data_cell.rs
index 316d25f7978d..568df8fe8928 100644
--- a/crates/re_log_types/src/data_cell.rs
+++ b/crates/re_log_types/src/data_cell.rs
@@ -1,8 +1,9 @@
 use std::sync::Arc;
 
+use arrow2::datatypes::DataType;
 use itertools::Itertools as _;
 
-use crate::{Component, ComponentName, DeserializableComponent, SerializableComponent};
+use crate::{Component, ComponentName, DeserializableComponent, SerializableComponent, SizeBytes};
 
 // ---
 
@@ -114,7 +115,8 @@ pub struct DataCellInner {
     // TODO(#1696): Store this within the datatype itself.
     pub(crate) name: ComponentName,
 
-    /// The size in bytes of both the underlying arrow data _and_ the inner cell itself.
+    /// The pre-computed size of the cell (stack + heap) as well as its underlying arrow data,
+    /// in bytes.
     ///
     /// This is always zero unless [`Self::compute_size_bytes`] has been called, which is a very
     /// costly operation.
@@ -412,7 +414,6 @@ impl DataCell {
     pub fn is_sorted_and_unique(&self) -> DataCellResult<bool> {
         use arrow2::{
             array::{Array, PrimitiveArray},
-            datatypes::DataType,
             types::NativeType,
         };
 
@@ -441,24 +442,6 @@ impl DataCell {
             _ => Err(DataCellError::UnsupportedDatatype(arr.data_type().clone())),
         }
     }
-
-    /// Returns the total (heap) allocated size of the cell in bytes, provided that
-    /// [`Self::compute_size_bytes`] has been called first (zero otherwise).
-    ///
-    /// This is an approximation, accurate enough for most purposes (stats, GC trigger, ...).
-    ///
-    /// This is `O(1)`, the value is computed and cached by calling [`Self::compute_size_bytes`].
-    #[inline]
-    pub fn size_bytes(&self) -> u64 {
-        let Self { inner } = self;
-
-        (inner.size_bytes > 0)
-            .then_some(std::mem::size_of_val(inner) as u64 + inner.size_bytes)
-            .unwrap_or_else(|| {
-                re_log::warn_once!("called `DataCell::size_bytes() without computing it first");
-                0
-            })
-    }
 }
 
 // ---
@@ -492,7 +475,7 @@ impl std::fmt::Display for DataCell {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.write_fmt(format_args!(
             "DataCell({})",
-            re_format::format_bytes(self.size_bytes() as _)
+            re_format::format_bytes(self.total_size_bytes() as _)
         ))?;
         re_format::arrow::format_table(
             // NOTE: wrap in a ListArray so that it looks more cell-like (i.e. single row)
@@ -506,7 +489,8 @@ impl std::fmt::Display for DataCell {
 // ---
 
 impl DataCell {
-    /// Compute and cache the total (heap) allocated size of the underlying arrow array in bytes.
+    /// Compute and cache the total size (stack + heap) of the inner cell and its underlying arrow
+    /// array, in bytes.
     /// This does nothing if the size has already been computed and cached before.
     ///
     /// The caller must the sole owner of this cell, as this requires mutating an `Arc` under the
@@ -523,8 +507,23 @@ impl DataCell {
     }
 }
 
+impl SizeBytes for DataCell {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        (self.inner.size_bytes > 0)
+            .then_some(self.inner.size_bytes)
+            .unwrap_or_else(|| {
+                re_log::warn_once!(
+                    "called `DataCell::heap_size_bytes() without computing it first"
+                );
+                0
+            })
+    }
+}
+
 impl DataCellInner {
-    /// Compute and cache the total (heap) allocated size of the underlying arrow array in bytes.
+    /// Compute and cache the total size (stack + heap) of the cell and its underlying arrow array,
+    /// in bytes.
     /// This does nothing if the size has already been computed and cached before.
     ///
     /// Beware: this is _very_ costly!
@@ -541,10 +540,11 @@ impl DataCellInner {
             return;
         }
 
-        *size_bytes = (std::mem::size_of_val(name)
-            + std::mem::size_of_val(size_bytes)
-            + std::mem::size_of_val(values)) as u64
-            + arrow2::compute::aggregate::estimated_bytes_size(&*self.values) as u64;
+        *size_bytes = name.total_size_bytes()
+            + size_bytes.total_size_bytes()
+            + values.data_type().total_size_bytes()
+            + std::mem::size_of_val(values) as u64
+            + arrow2::compute::aggregate::estimated_bytes_size(&**values) as u64;
     }
 }
 
@@ -556,8 +556,8 @@ fn data_cell_sizes() {
     // not computed
     {
         let cell = DataCell::from_arrow(InstanceKey::name(), UInt64Array::from_vec(vec![]).boxed());
-        assert_eq!(0, cell.size_bytes());
-        assert_eq!(0, cell.size_bytes());
+        assert_eq!(0, cell.heap_size_bytes());
+        assert_eq!(0, cell.heap_size_bytes());
     }
 
     // zero-sized
@@ -566,9 +566,8 @@ fn data_cell_sizes() {
             DataCell::from_arrow(InstanceKey::name(), UInt64Array::from_vec(vec![]).boxed());
         cell.compute_size_bytes();
 
-        // only the size of the outer & inner cells themselves
-        assert_eq!(56, cell.size_bytes());
-        assert_eq!(56, cell.size_bytes());
+        assert_eq!(112, cell.heap_size_bytes());
+        assert_eq!(112, cell.heap_size_bytes());
     }
 
     // anything else
@@ -579,9 +578,9 @@ fn data_cell_sizes() {
         );
         cell.compute_size_bytes();
 
-        // 56 bytes for the inner & outer cells + 3x u64s
-        assert_eq!(80, cell.size_bytes());
-        assert_eq!(80, cell.size_bytes());
+        // zero-sized + 3x u64s
+        assert_eq!(136, cell.heap_size_bytes());
+        assert_eq!(136, cell.heap_size_bytes());
     }
 }
 
diff --git a/crates/re_log_types/src/data_row.rs b/crates/re_log_types/src/data_row.rs
index 460a645bb20a..6cee609fa8fb 100644
--- a/crates/re_log_types/src/data_row.rs
+++ b/crates/re_log_types/src/data_row.rs
@@ -2,7 +2,9 @@ use ahash::HashSetExt;
 use nohash_hasher::IntSet;
 use smallvec::SmallVec;
 
-use crate::{ComponentName, DataCell, DataCellError, DataTable, EntityPath, TableId, TimePoint};
+use crate::{
+    ComponentName, DataCell, DataCellError, DataTable, EntityPath, SizeBytes, TableId, TimePoint,
+};
 
 // ---
 
@@ -126,6 +128,13 @@ impl RowId {
     }
 }
 
+impl SizeBytes for RowId {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        0
+    }
+}
+
 impl std::ops::Deref for RowId {
     type Target = re_tuid::Tuid;
 
diff --git a/crates/re_log_types/src/data_table.rs b/crates/re_log_types/src/data_table.rs
index 4c54140be722..adc9df0e0344 100644
--- a/crates/re_log_types/src/data_table.rs
+++ b/crates/re_log_types/src/data_table.rs
@@ -7,7 +7,7 @@ use smallvec::SmallVec;
 
 use crate::{
     ArrowMsg, ComponentName, DataCell, DataCellError, DataRow, DataRowError, EntityPath, RowId,
-    TimePoint, Timeline,
+    SizeBytes, TimePoint, Timeline,
 };
 
 // ---
@@ -41,9 +41,6 @@ pub enum DataTableError {
 
 pub type DataTableResult<T> = ::std::result::Result<T, DataTableError>;
 
-// TODO(#1757): The timepoint should be serialized as one column per timeline... that would be both
-// more efficient and yield much better debugging views of our tables.
-
 // ---
 
 pub type RowIdVec = SmallVec<[RowId; 4]>;
@@ -107,8 +104,7 @@ impl DataCellColumn {
         Self(smallvec::smallvec![None; num_rows])
     }
 
-    /// Compute and cache the total (heap) allocated size of each individual underlying
-    /// [`DataCell`].
+    /// Compute and cache the size of each individual underlying [`DataCell`].
     /// This does nothing for cells whose size has already been computed and cached before.
     ///
     /// Beware: this is _very_ costly!
@@ -120,6 +116,13 @@ impl DataCellColumn {
     }
 }
 
+impl SizeBytes for DataCellColumn {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        self.0.heap_size_bytes()
+    }
+}
+
 // ---
 
 /// A unique ID for a [`DataTable`].
@@ -161,6 +164,13 @@ impl TableId {
     }
 }
 
+impl SizeBytes for TableId {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        0
+    }
+}
+
 impl std::ops::Deref for TableId {
     type Target = re_tuid::Tuid;
 
@@ -515,6 +525,27 @@ impl DataTable {
     }
 }
 
+impl SizeBytes for DataTable {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        let Self {
+            table_id,
+            col_row_id,
+            col_timelines,
+            col_entity_path,
+            col_num_instances,
+            columns,
+        } = self;
+
+        table_id.heap_size_bytes()
+            + col_row_id.heap_size_bytes()
+            + col_timelines.heap_size_bytes()
+            + col_entity_path.heap_size_bytes()
+            + col_num_instances.heap_size_bytes()
+            + columns.heap_size_bytes()
+    }
+}
+
 // --- Serialization ---
 
 use arrow2::{
@@ -697,8 +728,6 @@ impl DataTable {
         let mut field = Field::new(name, data.data_type().clone(), false)
             .with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_CONTROL.to_owned())].into());
 
-        // TODO(cmc): why do we have to do this manually on the way out, but it's done
-        // automatically on our behalf on the way in...?
         if let DataType::Extension(name, _, _) = data.data_type() {
             field
                 .metadata
@@ -724,8 +753,6 @@ impl DataTable {
         let mut field = Field::new(name, datatype.clone(), false)
             .with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_CONTROL.to_owned())].into());
 
-        // TODO(cmc): why do we have to do this manually on the way out, but it's done
-        // automatically on our behalf on the way in...?
         if let DataType::Extension(name, _, _) = datatype {
             field
                 .metadata
@@ -979,6 +1006,8 @@ impl DataTable {
                 .downcast_ref::<ListArray<i32>>()
                 .ok_or(DataTableError::NotAColumn(component.to_string()))?
                 .iter()
+                // TODO(#1805): Schema metadata gets cloned in every single array.
+                // This'll become a problem as soon as we enable batching.
                 .map(|array| array.map(|values| DataCell::from_arrow(component, values)))
                 .collect(),
         ))
diff --git a/crates/re_log_types/src/lib.rs b/crates/re_log_types/src/lib.rs
index 0a20ae6ef4da..8f964b8195a7 100644
--- a/crates/re_log_types/src/lib.rs
+++ b/crates/re_log_types/src/lib.rs
@@ -17,6 +17,7 @@ mod data_table;
 pub mod hash;
 mod index;
 pub mod path;
+mod size_bytes;
 mod time;
 pub mod time_point;
 mod time_range;
@@ -53,6 +54,7 @@ pub use self::data_table::{
 };
 pub use self::index::*;
 pub use self::path::*;
+pub use self::size_bytes::SizeBytes;
 pub use self::time::{Duration, Time};
 pub use self::time_point::{TimeInt, TimePoint, TimeType, Timeline, TimelineName};
 pub use self::time_range::{TimeRange, TimeRangeF};
diff --git a/crates/re_log_types/src/path/component_name.rs b/crates/re_log_types/src/path/component_name.rs
index 4e0020903746..bb96e862f84c 100644
--- a/crates/re_log_types/src/path/component_name.rs
+++ b/crates/re_log_types/src/path/component_name.rs
@@ -1,3 +1,5 @@
+use crate::SizeBytes;
+
 re_string_interner::declare_new_type!(
     /// The name of an entity component, e.g. `pos` or `color`.
     pub struct ComponentName;
@@ -15,6 +17,7 @@ impl ComponentName {
     /// Excludes the rerun namespace, so you'll get `color` but `ext.confidence`.
     ///
     /// Used for most UI elements.
+    #[inline]
     pub fn short_name(&self) -> &'static str {
         let full_name = self.0.as_str();
         if let Some(short_name) = full_name.strip_prefix("rerun.") {
@@ -24,3 +27,10 @@ impl ComponentName {
         }
     }
 }
+
+impl SizeBytes for ComponentName {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        0
+    }
+}
diff --git a/crates/re_log_types/src/path/entity_path.rs b/crates/re_log_types/src/path/entity_path.rs
index 1a68576555c0..d23b257aa64b 100644
--- a/crates/re_log_types/src/path/entity_path.rs
+++ b/crates/re_log_types/src/path/entity_path.rs
@@ -2,6 +2,7 @@ use std::sync::Arc;
 
 use crate::{
     hash::Hash64, parse_entity_path, path::entity_path_impl::EntityPathImpl, EntityPathPart,
+    SizeBytes,
 };
 
 // ----------------------------------------------------------------------------
@@ -156,6 +157,13 @@ impl EntityPath {
     }
 }
 
+impl SizeBytes for EntityPath {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        0 // NOTE: we assume it's amortized due to the `Arc`
+    }
+}
+
 impl FromIterator<EntityPathPart> for EntityPath {
     fn from_iter<T: IntoIterator<Item = EntityPathPart>>(parts: T) -> Self {
         Self::new(parts.into_iter().collect())
diff --git a/crates/re_log_types/src/size_bytes.rs b/crates/re_log_types/src/size_bytes.rs
new file mode 100644
index 000000000000..a670eee44d74
--- /dev/null
+++ b/crates/re_log_types/src/size_bytes.rs
@@ -0,0 +1,173 @@
+use std::collections::{BTreeMap, HashMap};
+
+use arrow2::datatypes::{DataType, Field};
+use smallvec::SmallVec;
+
+// ---
+
+/// Approximations of stack and heap size for both internal and external types.
+///
+/// Mostly used for statistics and triggering events such as garbage collection.
+pub trait SizeBytes: Sized {
+    /// Returns the total size of `self` in bytes, accounting for both stack and heap space.
+    #[inline]
+    fn total_size_bytes(&self) -> u64 {
+        self.stack_size_bytes() + self.heap_size_bytes()
+    }
+
+    /// Returns the total size of `self` on the stack, in bytes.
+    ///
+    /// Defaults to `std::mem::size_of_val(self)`.
+    #[inline]
+    fn stack_size_bytes(&self) -> u64 {
+        std::mem::size_of_val(self) as _
+    }
+
+    /// Returns the total size of `self` on the heap, in bytes.
+    fn heap_size_bytes(&self) -> u64;
+}
+
+// --- Std ---
+
+impl SizeBytes for String {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        self.capacity() as u64
+    }
+}
+
+impl<K: SizeBytes, V: SizeBytes> SizeBytes for BTreeMap<K, V> {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        // TODO(cmc): This is sub-optimal if these types are PODs.
+
+        // NOTE: It's all on the heap at this point.
+        self.keys().map(SizeBytes::total_size_bytes).sum::<u64>()
+            + self.values().map(SizeBytes::total_size_bytes).sum::<u64>()
+    }
+}
+
+impl<K: SizeBytes, V: SizeBytes, S> SizeBytes for HashMap<K, V, S> {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        // TODO(cmc): This is sub-optimal if these types are PODs.
+
+        // NOTE: It's all on the heap at this point.
+        self.keys().map(SizeBytes::total_size_bytes).sum::<u64>()
+            + self.values().map(SizeBytes::total_size_bytes).sum::<u64>()
+    }
+}
+
+impl<T: SizeBytes> SizeBytes for Vec<T> {
+    /// Does not take capacity into account.
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        // TODO(cmc): This is sub-optimal if these types are PODs.
+
+        // NOTE: It's all on the heap at this point.
+        self.iter().map(SizeBytes::total_size_bytes).sum::<u64>()
+    }
+}
+
+impl<T: SizeBytes, const N: usize> SizeBytes for SmallVec<[T; N]> {
+    /// Does not take capacity into account.
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        // TODO(cmc): This is sub-optimal if these types are PODs.
+
+        // NOTE: Counts every element as heap data, which over-counts while still inline (unspilled).
+        self.iter().map(SizeBytes::total_size_bytes).sum::<u64>()
+    }
+}
+
+impl<T: SizeBytes> SizeBytes for Option<T> {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        self.as_ref().map_or(0, SizeBytes::heap_size_bytes)
+    }
+}
+
+// NOTE: `impl<T: bytemuck::Pod> SizeBytes for T {}` would be nice but violates orphan rules.
+macro_rules! impl_size_bytes_pod {
+    ($ty:ty) => {
+        impl SizeBytes for $ty {
+            #[inline]
+            fn heap_size_bytes(&self) -> u64 {
+                0
+            }
+        }
+    };
+    ($ty:ty, $($rest:ty),+) => {
+        impl_size_bytes_pod!($ty); impl_size_bytes_pod!($($rest),+);
+    };
+}
+
+impl_size_bytes_pod!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128, bool, f32, f64);
+
+// --- Arrow ---
+
+impl SizeBytes for DataType {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        match self {
+            DataType::Null
+            | DataType::Binary
+            | DataType::Boolean
+            | DataType::Date32
+            | DataType::Date64
+            | DataType::Float16
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::Int8
+            | DataType::LargeBinary
+            | DataType::LargeUtf8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::UInt8
+            | DataType::Time32(_)
+            | DataType::Time64(_)
+            | DataType::Duration(_)
+            | DataType::Interval(_)
+            | DataType::FixedSizeBinary(_)
+            | DataType::Decimal(_, _)
+            | DataType::Decimal256(_, _)
+            | DataType::Utf8 => 0,
+            DataType::Timestamp(_, str) => str.heap_size_bytes(),
+            DataType::List(field)
+            | DataType::FixedSizeList(field, _)
+            | DataType::LargeList(field)
+            | DataType::Map(field, _) => field.total_size_bytes(), // NOTE: Boxed, it's all on the heap
+            DataType::Struct(fields) => fields.heap_size_bytes(),
+            DataType::Union(fields, indices, _) => {
+                fields.heap_size_bytes() + indices.heap_size_bytes()
+            }
+            DataType::Dictionary(_, datatype, _) => datatype.total_size_bytes(), // NOTE: Boxed, it's all on the heap
+            DataType::Extension(name, datatype, extra) => {
+                name.heap_size_bytes()
+                + datatype.total_size_bytes() // NOTE: Boxed, it's all on the heap
+                + extra.heap_size_bytes()
+            }
+        }
+    }
+}
+
+impl SizeBytes for Field {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        let Field {
+            name,
+            data_type,
+            is_nullable,
+            metadata,
+        } = self;
+
+        name.heap_size_bytes()
+            + data_type.heap_size_bytes()
+            + is_nullable.heap_size_bytes()
+            + metadata.heap_size_bytes()
+    }
+}
diff --git a/crates/re_log_types/src/time_point/mod.rs b/crates/re_log_types/src/time_point/mod.rs
index 79a1eeae526f..4074810b8222 100644
--- a/crates/re_log_types/src/time_point/mod.rs
+++ b/crates/re_log_types/src/time_point/mod.rs
@@ -3,7 +3,7 @@ use std::collections::{btree_map, BTreeMap};
 mod time_int;
 mod timeline;
 
-use crate::{time::Time, TimeRange};
+use crate::{time::Time, SizeBytes, TimeRange};
 
 // Re-exports
 pub use time_int::TimeInt;
@@ -73,6 +73,7 @@ impl TimePoint {
 
     /// Computes the union of two `TimePoint`s, keeping the maximum time value in case of
     /// conflicts.
+    #[inline]
     pub fn union_max(mut self, rhs: &Self) -> Self {
         for (&timeline, &time) in rhs {
             match self.0.entry(timeline) {
@@ -89,6 +90,23 @@ impl TimePoint {
     }
 }
 
+impl SizeBytes for TimePoint {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        type K = Timeline;
+        type V = TimeInt;
+
+        // NOTE: This is only here to make sure this method fails to compile if the inner type
+        // changes, as the following size computation assumes POD types.
+        let inner: &BTreeMap<K, V> = &self.0;
+
+        let keys_size_bytes = std::mem::size_of::<K>() * inner.len();
+        let values_size_bytes = std::mem::size_of::<V>() * inner.len();
+
+        (keys_size_bytes + values_size_bytes) as u64
+    }
+}
+
 // ----------------------------------------------------------------------------
 
 /// The type of a [`TimeInt`] or [`Timeline`].
diff --git a/crates/re_log_types/src/time_point/timeline.rs b/crates/re_log_types/src/time_point/timeline.rs
index 20c207365cbc..d42f92d0869c 100644
--- a/crates/re_log_types/src/time_point/timeline.rs
+++ b/crates/re_log_types/src/time_point/timeline.rs
@@ -1,6 +1,6 @@
 use arrow2::datatypes::{DataType, TimeUnit};
 
-use crate::{TimeRange, TimeType};
+use crate::{SizeBytes, TimeRange, TimeType};
 
 re_string_interner::declare_new_type!(
     /// The name of a timeline. Often something like `"log_time"` or `"frame_nr"`.
@@ -100,6 +100,13 @@ impl Timeline {
 
 impl nohash_hasher::IsEnabled for Timeline {}
 
+impl SizeBytes for Timeline {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        0
+    }
+}
+
 // required for [`nohash_hasher`].
 #[allow(clippy::derive_hash_xor_eq)]
 impl std::hash::Hash for Timeline {
diff --git a/crates/re_log_types/src/time_range.rs b/crates/re_log_types/src/time_range.rs
index 68b7e4a7f159..e8350932e9cc 100644
--- a/crates/re_log_types/src/time_range.rs
+++ b/crates/re_log_types/src/time_range.rs
@@ -1,6 +1,6 @@
 use std::ops::RangeInclusive;
 
-use crate::{TimeInt, TimeReal};
+use crate::{SizeBytes, TimeInt, TimeReal};
 
 // ----------------------------------------------------------------------------
 
@@ -62,6 +62,13 @@ impl TimeRange {
     }
 }
 
+impl SizeBytes for TimeRange {
+    #[inline]
+    fn heap_size_bytes(&self) -> u64 {
+        0
+    }
+}
+
 impl From<TimeRange> for RangeInclusive<TimeInt> {
     fn from(range: TimeRange) -> RangeInclusive<TimeInt> {
         range.min..=range.max
diff --git a/crates/re_query/src/query.rs b/crates/re_query/src/query.rs
index 5c4bd34194b0..e52a1deea003 100644
--- a/crates/re_query/src/query.rs
+++ b/crates/re_query/src/query.rs
@@ -168,7 +168,7 @@ pub fn __populate_example_store() -> DataStore {
     let points = vec![Point2D { x: 1.0, y: 2.0 }, Point2D { x: 3.0, y: 4.0 }];
 
     let row = DataRow::from_cells2(
-        RowId::ZERO,
+        RowId::random(),
         ent_path,
         timepoint,
         instances.len() as _,
@@ -180,7 +180,7 @@ pub fn __populate_example_store() -> DataStore {
     let colors = vec![ColorRGBA(0xff000000)];
 
     let row = DataRow::from_cells2(
-        RowId::ZERO,
+        RowId::random(),
         ent_path,
         timepoint,
         instances.len() as _,
diff --git a/crates/re_viewer/src/ui/memory_panel.rs b/crates/re_viewer/src/ui/memory_panel.rs
index c2e9f9450059..796c98977a9d 100644
--- a/crates/re_viewer/src/ui/memory_panel.rs
+++ b/crates/re_viewer/src/ui/memory_panel.rs
@@ -1,4 +1,4 @@
-use re_arrow_store::{DataStoreConfig, DataStoreStats};
+use re_arrow_store::{DataStoreConfig, DataStoreRowStats, DataStoreStats};
 use re_format::{format_bytes, format_number};
 use re_memory::{util::sec_since_start, MemoryHistory, MemoryLimit, MemoryUse};
 use re_renderer::WgpuResourcePoolStatistics;
@@ -26,7 +26,7 @@ impl MemoryPanel {
                 (gpu_resource_stats.total_buffer_size_in_bytes
                     + gpu_resource_stats.total_texture_size_in_bytes) as _,
             ),
-            Some(store_stats.total_size_bytes as _),
+            Some(store_stats.total.num_bytes as _),
         );
     }
 
@@ -215,13 +215,13 @@ impl MemoryPanel {
             .num_columns(3)
             .show(ui, |ui| {
                 let DataStoreStats {
-                    total_timeless_rows,
-                    total_timeless_size_bytes,
-                    total_temporal_rows,
-                    total_temporal_size_bytes,
-                    total_temporal_buckets,
-                    total_rows,
-                    total_size_bytes,
+                    type_registry,
+                    metadata_registry,
+                    autogenerated,
+                    timeless,
+                    temporal,
+                    temporal_buckets,
+                    total,
                 } = *store_stats;
 
                 ui.label(egui::RichText::new("Stats").italics());
@@ -230,30 +230,44 @@ impl MemoryPanel {
                 ui.label("Size");
                 ui.end_row();
 
-                let label_buckets = |ui: &mut egui::Ui, num_buckets| {
-                    ui.label(re_format::format_number(num_buckets as _))
-                };
-                let label_rows =
-                    |ui: &mut egui::Ui, num_rows| ui.label(re_format::format_number(num_rows as _));
-                let label_size =
-                    |ui: &mut egui::Ui, size| ui.label(re_format::format_bytes(size as _));
+                fn label_row_stats(ui: &mut egui::Ui, row_stats: DataStoreRowStats) {
+                    let DataStoreRowStats {
+                        num_rows,
+                        num_bytes,
+                    } = row_stats;
+
+                    ui.label(re_format::format_number(num_rows as _));
+                    ui.label(re_format::format_bytes(num_bytes as _));
+                }
+
+                ui.label("Type registry:");
+                ui.label("");
+                label_row_stats(ui, type_registry);
+                ui.end_row();
+
+                ui.label("Metadata registry:");
+                ui.label("");
+                label_row_stats(ui, metadata_registry);
+                ui.end_row();
+
+                ui.label("Cluster cache:");
+                ui.label("");
+                label_row_stats(ui, autogenerated);
+                ui.end_row();
 
                 ui.label("Timeless:");
                 ui.label("");
-                label_rows(ui, total_timeless_rows);
-                label_size(ui, total_timeless_size_bytes);
+                label_row_stats(ui, timeless);
                 ui.end_row();
 
                 ui.label("Temporal:");
-                label_buckets(ui, total_temporal_buckets);
-                label_rows(ui, total_temporal_rows);
-                label_size(ui, total_temporal_size_bytes);
+                ui.label(re_format::format_number(temporal_buckets as _));
+                label_row_stats(ui, temporal);
                 ui.end_row();
 
                 ui.label("Total");
-                label_buckets(ui, total_temporal_buckets);
-                label_rows(ui, total_rows);
-                label_size(ui, total_size_bytes);
+                ui.label(re_format::format_number(temporal_buckets as _));
+                label_row_stats(ui, total);
                 ui.end_row();
             });
     }
diff --git a/crates/re_viewer/src/ui/time_panel/mod.rs b/crates/re_viewer/src/ui/time_panel/mod.rs
index c937cb5374c3..e106cb4ed74a 100644
--- a/crates/re_viewer/src/ui/time_panel/mod.rs
+++ b/crates/re_viewer/src/ui/time_panel/mod.rs
@@ -747,9 +747,12 @@ fn initialize_time_ranges_ui(
         .prefix_times
         .get(ctx.rec_cfg.time_ctrl.timeline())
     {
-        let timeline_axis = TimelineAxis::new(ctx.rec_cfg.time_ctrl.time_type(), times);
-        time_view = time_view.or_else(|| Some(view_everything(&time_x_range, &timeline_axis)));
-        time_range.extend(timeline_axis.ranges);
+        // NOTE: `times` can be empty if a GC wiped everything.
+        if !times.is_empty() {
+            let timeline_axis = TimelineAxis::new(ctx.rec_cfg.time_ctrl.time_type(), times);
+            time_view = time_view.or_else(|| Some(view_everything(&time_x_range, &timeline_axis)));
+            time_range.extend(timeline_axis.ranges);
+        }
     }
 
     TimeRangesUi::new(
diff --git a/scripts/lint.py b/scripts/lint.py
index c74719628d13..594748c94056 100755
--- a/scripts/lint.py
+++ b/scripts/lint.py
@@ -18,7 +18,7 @@
 debug_format_of_err = re.compile(r"\{\:#?\?\}.*, err")
 error_match_name = re.compile(r"Err\((\w+)\)")
 wasm_caps = re.compile(r"\bWASM\b")
-nb_prefix = re.compile(r"\bnb_")
+nb_prefix = re.compile(r"nb_")
 
 
 def lint_line(line: str) -> Optional[str]:
@@ -102,6 +102,7 @@ def test_lint_line() -> None:
         "if let Err(error) = foo",
         "We use WASM in Rerun",
         "nb_instances",
+        "inner_nb_instances",
     ]
 
     for line in should_pass: