Skip to content

Commit

Permalink
Use Typed Buffers in Arrays (#1811) (#1176) (#3743)
Browse files Browse the repository at this point in the history
* Remove RawPtrBox (#1811) (#1176)

* Clippy

* Extract get_offsets function
  • Loading branch information
tustvold authored Feb 23, 2023
1 parent ebe6f53 commit 47e4b61
Show file tree
Hide file tree
Showing 17 changed files with 242 additions and 261 deletions.
19 changes: 6 additions & 13 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::print_long_array;
use crate::builder::BooleanBuilder;
use crate::iterator::BooleanIter;
use crate::raw_pointer::RawPtrBox;
use crate::{print_long_array, Array, ArrayAccessor};
use crate::{Array, ArrayAccessor};
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
use arrow_data::bit_mask::combine_option_bitmap;
use arrow_data::ArrayData;
Expand Down Expand Up @@ -67,9 +67,7 @@ use std::any::Any;
#[derive(Clone)]
pub struct BooleanArray {
data: ArrayData,
/// Pointer to the value array. The lifetime of this must be <= to the value buffer
/// stored in `data`, so it's safe to store.
raw_values: RawPtrBox<u8>,
raw_values: Buffer,
}

impl std::fmt::Debug for BooleanArray {
Expand Down Expand Up @@ -102,7 +100,7 @@ impl BooleanArray {
///
/// Note this doesn't take the offset of this array into account.
pub fn values(&self) -> &Buffer {
&self.data.buffers()[0]
&self.raw_values
}

/// Returns the number of non null, true values within this array
Expand Down Expand Up @@ -328,13 +326,8 @@ impl From<ArrayData> for BooleanArray {
1,
"BooleanArray data should contain a single buffer only (values buffer)"
);
let ptr = data.buffers()[0].as_ptr();
Self {
data,
// SAFETY:
// ArrayData must be valid, and validated data type above
raw_values: unsafe { RawPtrBox::new(ptr) },
}
let raw_values = data.buffers()[0].clone();
Self { data, raw_values }
}
}

Expand Down
42 changes: 17 additions & 25 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::{empty_offsets, print_long_array};
use crate::array::{get_offsets, print_long_array};
use crate::builder::GenericByteBuilder;
use crate::iterator::ArrayIter;
use crate::raw_pointer::RawPtrBox;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, OffsetSizeTrait};
use arrow_buffer::ArrowNativeType;
use arrow_buffer::buffer::OffsetBuffer;
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use std::any::Any;
Expand All @@ -39,16 +39,16 @@ use std::any::Any;
/// [`LargeBinaryArray`]: crate::LargeBinaryArray
pub struct GenericByteArray<T: ByteArrayType> {
data: ArrayData,
value_offsets: RawPtrBox<T::Offset>,
value_data: RawPtrBox<u8>,
value_offsets: OffsetBuffer<T::Offset>,
value_data: Buffer,
}

impl<T: ByteArrayType> Clone for GenericByteArray<T> {
fn clone(&self) -> Self {
Self {
data: self.data.clone(),
value_offsets: self.value_offsets,
value_data: self.value_data,
value_offsets: self.value_offsets.clone(),
value_data: self.value_data.clone(),
}
}
}
Expand All @@ -68,7 +68,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {

/// Returns the raw value data
pub fn value_data(&self) -> &[u8] {
self.data.buffers()[1].as_slice()
self.value_data.as_slice()
}

/// Returns true if all data within this array is ASCII
Expand All @@ -82,15 +82,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[T::Offset] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the element at index `i`
Expand Down Expand Up @@ -161,6 +153,8 @@ impl<T: ByteArrayType> GenericByteArray<T> {
.slice_with_length(self.data.offset() * element_len, value_len * element_len);

drop(self.data);
drop(self.value_data);
drop(self.value_offsets);

let try_mutable_null_buffer = match null_bit_buffer {
None => Ok(None),
Expand Down Expand Up @@ -280,18 +274,16 @@ impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
T::Offset::PREFIX,
T::PREFIX,
);
// Handle case of empty offsets
let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
true => empty_offsets::<T::Offset>().as_ptr() as *const _,
false => data.buffers()[0].as_ptr(),
};
let values = data.buffers()[1].as_ptr();
// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };
let value_data = data.buffers()[1].clone();
Self {
data,
// SAFETY:
// ArrayData must be valid, and validated data type above
value_offsets: unsafe { RawPtrBox::new(offsets) },
value_data: unsafe { RawPtrBox::new(values) },
value_offsets,
value_data,
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions arrow-array/src/array/fixed_size_binary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::print_long_array;
use crate::iterator::FixedSizeBinaryIter;
use crate::raw_pointer::RawPtrBox;
use crate::{print_long_array, Array, ArrayAccessor, FixedSizeListArray};
use crate::{Array, ArrayAccessor, FixedSizeListArray};
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
Expand Down Expand Up @@ -50,7 +50,7 @@ use std::any::Any;
#[derive(Clone)]
pub struct FixedSizeBinaryArray {
data: ArrayData,
value_data: RawPtrBox<u8>,
value_data: Buffer,
length: i32,
}

Expand Down Expand Up @@ -357,14 +357,14 @@ impl From<ArrayData> for FixedSizeBinaryArray {
1,
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
);
let value_data = data.buffers()[0].as_ptr();
let value_data = data.buffers()[0].clone();
let length = match data.data_type() {
DataType::FixedSizeBinary(len) => *len,
_ => panic!("Expected data type to be FixedSizeBinary"),
};
Self {
data,
value_data: unsafe { RawPtrBox::new(value_data) },
value_data,
length,
}
}
Expand Down
5 changes: 2 additions & 3 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::print_long_array;
use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
use crate::{
make_array, print_long_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType,
};
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use std::any::Any;
Expand Down
46 changes: 11 additions & 35 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::make_array;
use crate::array::{get_offsets, make_array, print_long_array};
use crate::builder::{GenericListBuilder, PrimitiveBuilder};
use crate::{
iterator::GenericListArrayIter, print_long_array, raw_pointer::RawPtrBox, Array,
ArrayAccessor, ArrayRef, ArrowPrimitiveType,
iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType,
};
use arrow_buffer::buffer::OffsetBuffer;
use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, Field};
Expand All @@ -45,35 +45,24 @@ impl OffsetSizeTrait for i64 {
const PREFIX: &'static str = "Large";
}

/// Returns a slice of `OffsetSize` consisting of a single zero value
#[inline]
pub(crate) fn empty_offsets<OffsetSize: OffsetSizeTrait>() -> &'static [OffsetSize] {
static OFFSET: &[i64] = &[0];
// SAFETY:
// OffsetSize is ArrowNativeType and is therefore trivially transmutable
let (prefix, val, suffix) = unsafe { OFFSET.align_to::<OffsetSize>() };
assert!(prefix.is_empty() && suffix.is_empty());
val
}

/// Generic struct for a variable-size list array.
///
/// Columnar format in Apache Arrow:
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout>
///
/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]`
pub struct GenericListArray<OffsetSize> {
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data: ArrayData,
values: ArrayRef,
value_offsets: RawPtrBox<OffsetSize>,
value_offsets: OffsetBuffer<OffsetSize>,
}

impl<OffsetSize> Clone for GenericListArray<OffsetSize> {
impl<OffsetSize: OffsetSizeTrait> Clone for GenericListArray<OffsetSize> {
fn clone(&self) -> Self {
Self {
data: self.data.clone(),
values: self.values.clone(),
value_offsets: self.value_offsets,
value_offsets: self.value_offsets.clone(),
}
}
}
Expand Down Expand Up @@ -118,15 +107,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the length for value at index `i`.
Expand Down Expand Up @@ -242,15 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}

let values = make_array(values);
// Handle case of empty offsets
let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
false => data.buffers()[0].as_ptr(),
};

// SAFETY:
// Verified list type in call to `Self::get_type`
let value_offsets = unsafe { RawPtrBox::new(offsets) };
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };

Ok(Self {
data,
values,
Expand Down
27 changes: 6 additions & 21 deletions arrow-array/src/array/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::raw_pointer::RawPtrBox;
use crate::{make_array, print_long_array, Array, ArrayRef, StringArray, StructArray};
use crate::array::{get_offsets, print_long_array};
use crate::{make_array, Array, ArrayRef, StringArray, StructArray};
use arrow_buffer::buffer::OffsetBuffer;
use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, Field};
Expand All @@ -38,7 +39,7 @@ pub struct MapArray {
/// The second child of `entries`, the "values" of this MapArray
values: ArrayRef,
/// The start and end offsets of each entry
value_offsets: RawPtrBox<i32>,
value_offsets: OffsetBuffer<i32>,
}

impl MapArray {
Expand Down Expand Up @@ -86,15 +87,7 @@ impl MapArray {
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[i32] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the length for value at index `i`.
Expand Down Expand Up @@ -159,18 +152,10 @@ impl MapArray {
let keys = make_array(entries.child_data()[0].clone());
let values = make_array(entries.child_data()[1].clone());
let entries = make_array(entries);
let value_offsets = data.buffers()[0].as_ptr();

// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { RawPtrBox::<i32>::new(value_offsets) };
unsafe {
if (*value_offsets.as_ptr().offset(0)) != 0 {
return Err(ArrowError::InvalidArgumentError(String::from(
"offsets do not start at zero",
)));
}
}
let value_offsets = unsafe { get_offsets(&data) };

Ok(Self {
data,
Expand Down
27 changes: 25 additions & 2 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
mod binary_array;

use crate::types::*;
use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer};
use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayData;
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use std::any::Any;
Expand Down Expand Up @@ -636,8 +638,29 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
make_array(ArrayData::new_null(data_type, length))
}

// Helper function for printing potentially long arrays.
pub(crate) fn print_long_array<A, F>(
/// Helper function that gets offset from an [`ArrayData`]
///
/// # Safety
///
/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer
unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
match data.is_empty() && data.buffers()[0].is_empty() {
true => OffsetBuffer::new_empty(),
false => {
let buffer = ScalarBuffer::new(
data.buffers()[0].clone(),
data.offset(),
data.len() + 1,
);
// Safety:
// ArrayData is valid
unsafe { OffsetBuffer::new_unchecked(buffer) }
}
}
}

/// Helper function for printing potentially long arrays.
fn print_long_array<A, F>(
array: &A,
f: &mut std::fmt::Formatter,
print_item: F,
Expand Down
Loading

0 comments on commit 47e4b61

Please sign in to comment.