diff --git a/crates/re_arrow_store/Cargo.toml b/crates/re_arrow_store/Cargo.toml
index 8487f656b17f..93e91b269a2f 100644
--- a/crates/re_arrow_store/Cargo.toml
+++ b/crates/re_arrow_store/Cargo.toml
@@ -111,3 +111,7 @@ required-features = ["polars"]
 [[bench]]
 name = "data_store"
 harness = false
+
+[[bench]]
+name = "arrow2_convert"
+harness = false
diff --git a/crates/re_arrow_store/benches/arrow2_convert.rs b/crates/re_arrow_store/benches/arrow2_convert.rs
new file mode 100644
index 000000000000..c53dd6b5bd80
--- /dev/null
+++ b/crates/re_arrow_store/benches/arrow2_convert.rs
@@ -0,0 +1,153 @@
+//! Keeping track of performance issues/regressions in `arrow2_convert` that directly affect us.
+
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+use arrow2::{array::PrimitiveArray, datatypes::PhysicalType, types::PrimitiveType};
+use criterion::{criterion_group, criterion_main, Criterion};
+use re_log_types::{
+    component_types::InstanceKey, external::arrow2_convert::deserialize::TryIntoCollection,
+    Component as _, DataCell,
+};
+
+// ---
+
+criterion_group!(benches, serialize, deserialize);
+criterion_main!(benches);
+
+// ---
+
+#[cfg(not(debug_assertions))]
+const NUM_INSTANCES: usize = 100_000;
+
+// `cargo test` also runs the benchmark setup code, so make sure they run quickly:
+#[cfg(debug_assertions)]
+const NUM_INSTANCES: usize = 1;
+
+// ---
+
+/// Benchmarks building a `DataCell` holding `NUM_INSTANCES` instance keys (`0..NUM_INSTANCES`).
+///
+/// Three ways of producing the cell are registered in the same criterion group:
+/// - `arrow2_convert`: `DataCell::from_component`,
+/// - `arrow2/from_values`: `PrimitiveArray::from_values` + `DataCell::from_arrow`,
+/// - `arrow2/from_vec`: `PrimitiveArray::from_vec` + `DataCell::from_arrow`.
+fn serialize(c: &mut Criterion) {
+    let mut group = c.benchmark_group(format!(
+        "arrow2_convert/serialize/primitive/instances={NUM_INSTANCES}"
+    ));
+    group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _));
+
+    {
+        group.bench_function("arrow2_convert", |b| {
+            b.iter(|| {
+                let cell = DataCell::from_component::<InstanceKey>(0..NUM_INSTANCES as u64);
+                assert_eq!(NUM_INSTANCES as u32, cell.num_instances());
+                assert_eq!(
+                    cell.datatype().to_physical_type(),
+                    PhysicalType::Primitive(PrimitiveType::UInt64)
+                );
+                cell
+            });
+        });
+    }
+
+    {
+        group.bench_function("arrow2/from_values", |b| {
+            b.iter(|| {
+                let values = PrimitiveArray::from_values(0..NUM_INSTANCES as u64).boxed();
+                let cell = crate::DataCell::from_arrow(InstanceKey::name(), values);
+                assert_eq!(NUM_INSTANCES as u32, cell.num_instances());
+                assert_eq!(
+                    cell.datatype().to_physical_type(),
+                    PhysicalType::Primitive(PrimitiveType::UInt64)
+                );
+                cell
+            });
+        });
+    }
+
+    {
+        group.bench_function("arrow2/from_vec", |b| {
+            b.iter(|| {
+                // NOTE: We do the `collect()` here on purpose!
+                //
+                // All of these APIs have to allocate an array under the hood, except `from_vec`
+                // which is O(1) (it just unsafely reuses the vec's data pointer).
+                // We need to measure the collection in order to have a leveled playing field.
+                let values = PrimitiveArray::from_vec((0..NUM_INSTANCES as u64).collect()).boxed();
+                let cell = crate::DataCell::from_arrow(InstanceKey::name(), values);
+                assert_eq!(NUM_INSTANCES as u32, cell.num_instances());
+                assert_eq!(
+                    cell.datatype().to_physical_type(),
+                    PhysicalType::Primitive(PrimitiveType::UInt64)
+                );
+                cell
+            });
+        });
+    }
+}
+
+/// Benchmarks collecting `NUM_INSTANCES` instance keys from arrow data into a `Vec`.
+///
+/// The input cell is built once, outside of the measured closures. Three strategies are compared:
+/// - `arrow2_convert`: `try_into_collection`,
+/// - `arrow2/validity_checks`: downcast to `PrimitiveArray<u64>` and iterate the optional values,
+/// - `arrow2/validity_bypass`: downcast, assert there is no validity bitmap, iterate raw values.
+fn deserialize(c: &mut Criterion) {
+    let mut group = c.benchmark_group(format!(
+        "arrow2_convert/deserialize/primitive/instances={NUM_INSTANCES}"
+    ));
+    group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _));
+
+    let cell = DataCell::from_component::<InstanceKey>(0..NUM_INSTANCES as u64);
+    let data = cell.as_arrow();
+
+    {
+        group.bench_function("arrow2_convert", |b| {
+            b.iter(|| {
+                let keys: Vec<InstanceKey> = data.as_ref().try_into_collection().unwrap();
+                assert_eq!(NUM_INSTANCES, keys.len());
+                assert_eq!(
+                    InstanceKey(NUM_INSTANCES as u64 / 2),
+                    keys[NUM_INSTANCES / 2]
+                );
+                keys
+            });
+        });
+    }
+
+    {
+        group.bench_function("arrow2/validity_checks", |b| {
+            b.iter(|| {
+                let data = data.as_any().downcast_ref::<PrimitiveArray<u64>>().unwrap();
+                let keys: Vec<InstanceKey> = data
+                    .into_iter()
+                    .filter_map(|v| v.copied().map(InstanceKey))
+                    .collect();
+                assert_eq!(NUM_INSTANCES, keys.len());
+                assert_eq!(
+                    InstanceKey(NUM_INSTANCES as u64 / 2),
+                    keys[NUM_INSTANCES / 2]
+                );
+                keys
+            });
+        });
+    }
+
+    {
+        group.bench_function("arrow2/validity_bypass", |b| {
+            b.iter(|| {
+                let data = data.as_any().downcast_ref::<PrimitiveArray<u64>>().unwrap();
+                assert!(data.validity().is_none());
+                let keys: Vec<InstanceKey> = data.values_iter().copied().map(InstanceKey).collect();
+                assert_eq!(NUM_INSTANCES, keys.len());
+                assert_eq!(
+                    InstanceKey(NUM_INSTANCES as u64 / 2),
+                    keys[NUM_INSTANCES / 2]
+                );
+                keys
+            });
+        });
+    }
+}