
Commit

merge cmc/datastore/get_rid_of_copies (#584)
teh-cmc committed Dec 18, 2022
1 parent b7f3549 commit a41180e
Showing 4 changed files with 186 additions and 46 deletions.
42 changes: 36 additions & 6 deletions crates/re_arrow_store/src/store.rs
@@ -848,14 +848,36 @@ pub struct ComponentBucket {
/// The offset of this bucket in the global table.
pub(crate) row_offset: RowIndex,

/// Has this bucket been retired yet?
///
/// At any given moment, all buckets except the currently active one have to be retired.
pub(crate) retired: bool,

/// The time ranges (plural!) covered by this bucket.
/// Buckets are never sorted over time, so these time ranges can grow arbitrarily large.
///
/// These are only used for garbage collection.
pub(crate) time_ranges: HashMap<Timeline, TimeRange>,

/// All the data for this bucket. This is a single column!
pub(crate) data: Box<dyn Array>,
/// All the data for this bucket: many rows of a single column.
///
/// During the active lifespan of the bucket, this can contain an arbitrary number of chunks,
/// depending on how the data was inserted (e.g. single insertions vs. batches).
/// All of these chunks get compacted into one contiguous array when the bucket is retired,
/// i.e. when the bucket is full and a new one is created.
///
/// Note that, as of today (#589), we do not support batch insertion nor do we support chunks
/// of non-unit length: chunks always contain one and only one row's worth of data until the
/// bucket is retired.
pub(crate) chunks: Vec<Box<dyn Array>>,

/// The total number of rows present in this bucket, across all chunks.
pub(crate) total_rows: u64,
/// The size of this bucket in bytes, across all chunks.
///
/// Accurately computing the size of arrow arrays is surprisingly costly, which is why we cache
/// this.
pub(crate) total_size_bytes: u64,
}
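
For context, a minimal standalone sketch of the compaction described in the doc comment above: while the bucket is active its data accumulates as many small chunks, and retiring it concatenates them into one contiguous array. This is an illustration under assumptions, not the store's actual retirement code; `Int32Array` and `compact` stand in for the real component payload and helper.

use arrow2::array::{Array, Int32Array};
use arrow2::compute::concatenate::concatenate;

/// Sketch: compact a retired bucket's per-row chunks into a single contiguous array.
fn compact(chunks: &[Box<dyn Array>]) -> Box<dyn Array> {
    // Borrow every chunk as `&dyn Array`, then concatenate them in order.
    let refs: Vec<&dyn Array> = chunks.iter().map(|c| c.as_ref()).collect();
    concatenate(&refs).expect("all chunks share the same datatype")
}

fn main() {
    // Three unit-length chunks, one per single-row insertion.
    let chunks: Vec<Box<dyn Array>> = vec![
        Box::new(Int32Array::from_slice([1])),
        Box::new(Int32Array::from_slice([2])),
        Box::new(Int32Array::from_slice([3])),
    ];
    assert_eq!(compact(&chunks).len(), 3);
}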

impl std::fmt::Display for ComponentBucket {
@@ -876,12 +898,13 @@ impl std::fmt::Display for ComponentBucket {
// TODO(#439): is that still true with deletion?
self.row_offset.as_u64()
+ self
.data
.chunks
.len()
.checked_sub(1)
.expect("buckets are never empty") as u64,
))?;

f.write_fmt(format_args!("retired: {}\n", self.retired))?;
f.write_str("time ranges:\n")?;
for (timeline, time_range) in &self.time_ranges {
f.write_fmt(format_args!(
@@ -890,7 +913,14 @@
))?;
}

let chunk = Chunk::new(vec![self.data()]);
let rows = if self.retired {
self.data()
} else {
use arrow2::compute::concatenate::concatenate;
let chunks = self.chunks.iter().map(|chunk| &**chunk).collect::<Vec<_>>();
vec![concatenate(&chunks).unwrap()]
};
let chunk = Chunk::new(rows);
f.write_str(&arrow2::io::print::write(&[chunk], &[self.name.as_str()]))?;

Ok(())
@@ -900,12 +930,12 @@
impl ComponentBucket {
/// Returns the number of rows stored across this bucket.
pub fn total_rows(&self) -> u64 {
self.data.len() as u64
self.total_rows
}

/// Returns the size of the data stored across this bucket, in bytes.
pub fn total_size_bytes(&self) -> u64 {
arrow2::compute::aggregate::estimated_bytes_size(&*self.data) as u64
self.total_size_bytes
}
}
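
A hedged sketch of why the cached counters returned above pay off: the arrow size estimate is computed once per inserted chunk rather than on every read. `SizeTracker` and `push_chunk` are illustrative names, not part of the store.

use arrow2::array::Array;
use arrow2::compute::aggregate::estimated_bytes_size;

/// Illustrative stand-in for a bucket: only the cached counters from above.
struct SizeTracker {
    total_rows: u64,
    total_size_bytes: u64,
}

impl SizeTracker {
    /// Update the cached totals once per inserted chunk, so that the
    /// `total_rows()` / `total_size_bytes()` accessors stay O(1) reads.
    fn push_chunk(&mut self, chunk: &dyn Array) {
        self.total_rows += chunk.len() as u64;
        // `estimated_bytes_size` walks the array's buffers, which is the
        // costly part we only want to pay once per insertion.
        self.total_size_bytes += estimated_bytes_size(chunk) as u64;
    }
}

fn main() {
    use arrow2::array::Int32Array;
    let mut tracker = SizeTracker { total_rows: 0, total_size_bytes: 0 };
    tracker.push_chunk(&Int32Array::from_slice([1, 2, 3]));
    assert_eq!(tracker.total_rows, 3);
}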

23 changes: 14 additions & 9 deletions crates/re_arrow_store/src/store_read.rs
@@ -175,8 +175,8 @@ impl DataStore {

/// Retrieves the data associated with a list of `components` at the specified `indices`.
///
/// If the associated data is found, it will be written to returned array at the appropriate
/// index, or `None` otherwise.
/// If the associated data is found, it will be written into the returned array at the
/// appropriate index, or `None` otherwise.
///
/// `row_indices` takes a list of options so that one can easily re-use the results obtained
/// from [`Self::query`].
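
A hypothetical sketch of the lookup pattern this doc comment describes, in plain Rust rather than the store's actual `get` signature: the output stays index-aligned with the input, with `None` wherever no row index was available.

/// Hypothetical helper: fetch a row for each optional index, preserving order.
fn lookup_rows<T: Clone>(table: &[T], row_indices: &[Option<usize>]) -> Vec<Option<T>> {
    row_indices
        .iter()
        .map(|idx| idx.and_then(|i| table.get(i).cloned()))
        .collect()
}

fn main() {
    let table = vec!["pos", "color", "radius"];
    // `None` marks components for which the preceding query found no data.
    let fetched = lookup_rows(&table, &[Some(2), None, Some(0)]);
    assert_eq!(fetched, vec![Some("radius"), None, Some("pos")]);
}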
@@ -574,7 +574,7 @@ impl ComponentTable {
}

impl ComponentBucket {
/// Get this `ComponentBucket`s debug name
/// Returns the name of the component stored in this bucket.
#[allow(dead_code)]
pub fn name(&self) -> &str {
&self.name
@@ -583,21 +583,26 @@
/// Returns a shallow clone of the row data for the given `row_idx`.
pub fn get(&self, row_idx: RowIndex) -> Box<dyn Array> {
let row_idx = row_idx.as_u64() - self.row_offset.as_u64();
let rows = self.data.slice(row_idx as usize, 1);

let rows = if self.retired {
self.chunks[0].slice(row_idx as _, 1)
} else {
self.chunks[row_idx as usize].slice(0, 1)
};

// This has to be safe to unwrap, otherwise it would never have made it past insertion.
rows.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.value(0)
}

/// Returns the entire data Array in this component
pub fn data(&self) -> Box<dyn Array> {
// shallow copy
self.data.clone()
/// Returns a shallow clone of the data chunks in this component.
pub fn data(&self) -> Vec<Box<dyn Array>> {
self.chunks.clone() // shallow
}

/// Return an iterator over the time ranges in this bucket
/// Return an iterator over the time ranges in this bucket.
#[allow(dead_code)]
pub fn iter_time_ranges(&self) -> impl Iterator<Item = (&Timeline, &TimeRange)> {
self.time_ranges.iter()
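To make the slice-and-downcast in `ComponentBucket::get` above concrete, a small self-contained arrow2 sketch; the toy list-of-int32 payload and the `builder` variable are assumptions for illustration, not the store's real component data.

use arrow2::array::{Array, ListArray, MutableListArray, MutablePrimitiveArray, TryExtend};

fn main() {
    // Build a two-row list-of-int32 column, standing in for one bucket chunk.
    let mut builder = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
    builder
        .try_extend(vec![
            Some(vec![Some(1), Some(2)]), // row 0
            Some(vec![Some(3)]),          // row 1
        ])
        .unwrap();
    let list: ListArray<i32> = builder.into();
    let chunk: Box<dyn Array> = Box::new(list);

    // Mirror `ComponentBucket::get`: downcast the type-erased chunk back to a
    // `ListArray<i32>` and take a shallow clone of one row's payload.
    let row = chunk
        .as_any()
        .downcast_ref::<ListArray<i32>>()
        .unwrap()
        .value(1);

    assert_eq!(row.len(), 1); // row 1 holds a single value: 3
}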

