From b9ded5bbf159fe238942da60a4e148c9bdf491c1 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 24 Mar 2024 20:36:41 +0800 Subject: [PATCH 1/5] feat: support live_view_array layout and basic construction --- arrow-array/src/array/list_view_array.rs | 1175 +++++++++++++++++ arrow-array/src/array/mod.rs | 31 +- .../src/builder/generic_list_builder.rs | 2 +- .../src/builder/generic_list_view_builder.rs | 705 ++++++++++ arrow-array/src/builder/mod.rs | 8 + arrow-array/src/builder/struct_builder.rs | 8 + arrow-array/src/cast.rs | 16 + arrow-array/src/iterator.rs | 4 +- arrow-array/src/record_batch.rs | 4 +- arrow-buffer/src/buffer/mod.rs | 3 + arrow-buffer/src/buffer/size.rs | 91 ++ arrow-data/src/data.rs | 69 +- arrow-data/src/equal/list_view.rs | 52 + arrow-data/src/equal/mod.rs | 7 +- 14 files changed, 2160 insertions(+), 15 deletions(-) create mode 100644 arrow-array/src/array/list_view_array.rs create mode 100644 arrow-array/src/builder/generic_list_view_builder.rs create mode 100644 arrow-buffer/src/buffer/size.rs create mode 100644 arrow-data/src/equal/list_view.rs diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs new file mode 100644 index 000000000000..b0e3a0f26898 --- /dev/null +++ b/arrow-array/src/array/list_view_array.rs @@ -0,0 +1,1175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{get_offsets, get_sizes, make_array, print_long_array}; +use crate::builder::{GenericListViewBuilder, PrimitiveBuilder}; +use crate::{new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, FixedSizeListArray}; +use arrow_buffer::{NullBuffer, OffsetBuffer, SizeBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use std::any::Any; +use std::ops::Add; +use std::sync::Arc; +use crate::iterator::GenericListViewArrayIter; + +// See [`ListViewBuilder`](crate::builder::ListViewBuilder) for how to construct a [`ListViewArray`] +pub type ListViewArray = GenericListViewArray; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. +/// +// See [`LargeListViewArray`](crate::builder::LargeListViewBuilder) for how to construct a [`LargeListViewArray`] +pub type LargeListViewArray = GenericListViewArray; + +/// +/// Different than [`crate::GenericListArray`] as it stores both an offset and length +/// meaning that take / filter operations can be implemented without copying the underlying data. +/// +/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +pub struct GenericListViewArray { + data_type: DataType, + nulls: Option, + values: ArrayRef, + value_offsets: OffsetBuffer, + value_sizes: SizeBuffer, + len: usize, +} + + +impl Clone for GenericListViewArray { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets.clone(), + value_sizes: self.value_sizes.clone(), + len: self.len, + } + } +} + +impl GenericListViewArray { + /// The data type constructor of listview array. + /// The input is the schema of the child array and + /// the output is the [`DataType`], ListView or LargeListView. + pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { + DataType::LargeListView + } else { + DataType::ListView + }; + + pub fn try_new( + field: FieldRef, + offsets: OffsetBuffer, + sizes: SizeBuffer, + values: ArrayRef, + nulls: Option, + ) -> Result { + + //todo: check illegal offset and size values + let mut len = 0usize; + for i in 0..offsets.len() { + let offset = offsets[i].as_usize(); + let size = sizes[i].as_usize(); + if offset + size > values.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", + OffsetSize::PREFIX, offset, size, values.len() + ))); + } + len = len.add(size); + } + + let len = offsets.len(); + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", + OffsetSize::PREFIX, len, sizes.len() + ))); + } + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if !field.is_nullable() && values.is_nullable() { + return Err(ArrowError::InvalidArgumentError(format!( + "Non-nullable field of {}ListViewArray {:?} cannot contain nulls", + OffsetSize::PREFIX, + field.name() + ))); + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray expected data type {} got {} for {:?}", + OffsetSize::PREFIX, + field.data_type(), + values.data_type(), + field.name() + ))); + } + + + Ok(Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls, + values, + value_offsets: offsets, + value_sizes: sizes, + len, + }) + } + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: OffsetBuffer, + sizes: SizeBuffer, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, offsets, sizes, values, nulls).unwrap() + } + + /// Create a new [`GenericListViewArray`] of length `len` where all values are null + pub fn new_null(field: FieldRef, len: usize) -> Self { + let values = new_empty_array(field.data_type()); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls: Some(NullBuffer::new_null(len)), + value_offsets: OffsetBuffer::new_zeroed(len), + value_sizes: SizeBuffer::new_zeroed(len), + values, + len: 0usize + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + OffsetBuffer, + SizeBuffer, + ArrayRef, + Option, + ) { + let f = match self.data_type { + DataType::ListView(f) | DataType::LargeListView(f) => f, + _ => unreachable!(), + }; + (f, self.value_offsets, self.value_sizes, self.values, self.nulls) + } + + /// Returns a reference to the offsets of this list + /// + /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &OffsetBuffer { + &self.value_offsets + } + + /// Returns a reference to the values of this list + #[inline] + pub fn values(&self) -> &ArrayRef { + &self.values + } + + /// Returns a reference to the sizes of this list + /// + /// Unlike [`Self::value_sizes`] this returns the [`SizeBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn sizes(&self) -> &SizeBuffer { + &self.value_sizes + } + + + /// Returns a clone of the value type of this list. + pub fn value_type(&self) -> DataType { + self.values.data_type().clone() + } + + /// Returns ith value of this list view array. + /// # Safety + /// Caller must ensure that the index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + let end = self.value_offsets().get_unchecked(i + 1).as_usize(); + let start = self.value_offsets().get_unchecked(i).as_usize(); + self.values.slice(start, end - start) + } + + /// Returns ith value of this list view array. + pub fn value(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets()[i].as_usize(); + let length = self.value_sizes()[i].as_usize(); + self.values.slice(offset, length) + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[OffsetSize] { + &self.value_offsets + } + + /// Returns the sizes values in the offsets buffer + #[inline] + pub fn value_sizes(&self) -> &[OffsetSize] { + &self.value_sizes + } + + /// Returns the length for value at index `i`. + #[inline] + pub fn value_length(&self, i: usize) -> OffsetSize { + self.value_sizes[i] + } + + /// constructs a new iterator + pub fn iter<'a>(&'a self) -> GenericListViewArrayIter<'a, OffsetSize> { + GenericListViewArrayIter::<'a, OffsetSize>::new(self) + } + + #[inline] + fn get_type(data_type: &DataType) -> Option<&DataType> { + match (OffsetSize::IS_LARGE, data_type) { + (true, DataType::LargeListView(child)) | (false, DataType::ListView(child)) => { + Some(child.data_type()) + } + _ => None, + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_sizes: self.value_sizes.slice(offset, length), + len: length, + } + } + + pub fn from_iter_primitive(iter: I) -> Self + where + T: ArrowPrimitiveType, + P: IntoIterator::Native>>, + I: IntoIterator>, + { + let iter = iter.into_iter(); + let size_hint = iter.size_hint().0; + let mut builder = + GenericListViewBuilder::with_capacity(PrimitiveBuilder::::new(), size_hint); + + for i in iter { + match i { + Some(p) => { + let mut size = 0usize; + for t in p { + builder.values().append_option(t); + size = size.add(1); + } + builder.append(true, size); + } + None => builder.append(false, 0), + } + } + builder.finish() + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } +} + + +impl Array for GenericListViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } +} + +impl std::fmt::Debug for GenericListViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let prefix = OffsetSize::PREFIX; + write!(f, "{prefix}ListViewArray\n[\n")?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl From> for ArrayData { + fn from(array: GenericListViewArray) -> Self { + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![array.value_offsets.into_inner().into_inner(), array.value_sizes.into_inner().into_inner()]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } + } +} + +impl From for GenericListViewArray { + fn from(data: ArrayData) -> Self { + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListViewArray from ArrayDataRef failed") + } +} + +impl From for GenericListViewArray { + fn from(value: FixedSizeListArray) -> Self { + let (field, size) = match value.data_type() { + DataType::FixedSizeList(f, size) => (f, *size as usize), + _ => unreachable!(), + }; + + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + let sizes = SizeBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), + nulls: value.nulls().cloned(), + values: value.values().clone(), + value_offsets: offsets, + value_sizes: sizes, + len: value.len(), + } + } +} + +impl GenericListViewArray { + fn try_new_from_array_data(data: ArrayData) -> Result { + if data.buffers().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray data should contain two buffer (value offsets & value size), had {}", + data.buffers().len() + ))); + } + + if data.child_data().len() != 1 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray should contain a single child array (values array), had {}", + data.child_data().len() + ))); + } + + let values = data.child_data()[0].clone(); + + if let Some(child_data_type) = Self::get_type(data.data_type()) { + if values.data_type() != child_data_type { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's child datatype {:?} does not \ + correspond to the List's datatype {:?}", + values.data_type(), + child_data_type + ))); + } + } else { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is {:?}", + data.data_type() + ))); + } + + let values = make_array(values); + // SAFETY: + // ArrayData is valid, and verified type above + let value_offsets = unsafe { get_offsets(&data) }; + let value_sizes = unsafe { get_sizes(&data) }; + + Ok(Self { + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + values, + value_offsets, + value_sizes, + len: data.len(), + }) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::Int32Type; + use crate::{Int32Array, Int64Array}; + use arrow_buffer::{bit_util, Buffer, ScalarBuffer}; + use arrow_schema::DataType::LargeListView; + use arrow_schema::Field; + use crate::builder::{FixedSizeListBuilder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; + + fn create_from_buffers() -> ListViewArray { + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6])); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = SizeBuffer::new(ScalarBuffer::from(vec![3, 3, 2])); + + ListViewArray::new(field, offsets, sizes,Arc::new(values), None) + } + + + #[test] + fn test_from_iter_primitive() { + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + Some(vec![Some(6), Some(7)]), + ]; + let list_array = ListViewArray::from_iter_primitive::(data); + let another = create_from_buffers(); + assert_eq!(list_array, another) + } + + #[test] + fn test_empty_list_view_array() { + // Construct an empty value array + let value_data = ArrayData::builder(DataType::Int32) + .len(0) + .add_buffer(Buffer::from([])) + .build() + .unwrap(); + + // Construct an empty offset buffer + let value_offsets = Buffer::from([]); + // Construct an empty size buffer + let value_sizes = Buffer::from([]); + + // Construct a list view array from the above two + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(0) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .build() + .unwrap(); + + let list_array = ListViewArray::from(list_data); + assert_eq!(list_array.len(), 0) + } + + + #[test] + fn test_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + // Construct a buffer for value sizes, for the nested array: + let value_sizes = Buffer::from_slice_ref([3, 3, 2, 0]); + + // Construct a list view array from the above two + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_buffer(value_sizes.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset (skip first element) + // [[3, 4, 5], [6, 7]] + let list_data = ArrayData::builder(list_data_type) + .len(2) + .offset(1) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(2, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[1]); + assert_eq!(2, list_array.value_sizes()[1]); + assert_eq!(2, list_array.value_length(1)); + assert_eq!( + 3, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + } + + #[test] + fn test_large_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); + + // Construct a buffer for value sizes, for the nested array: + let value_sizes = Buffer::from_slice_ref([3i64, 3, 2, 0]); + + // Construct a list view array from the above two + let list_data_type = DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_buffer(value_sizes.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset + // [[3, 4, 5], [6, 7]] + let list_data = ArrayData::builder(list_data_type) + .len(2) + .offset(1) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(2, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[1]); + assert_eq!(2, list_array.value_sizes()[1]); + assert_eq!(2, list_array.value_length(1)); + assert_eq!( + 3, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + } + + + #[test] + fn test_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list view array from the above two + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array.as_any().downcast_ref::().unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_sizes()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + } + + #[test] + fn test_large_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list view array from the above two + let list_data_type = DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + } + + #[test] + #[should_panic(expected = "index out of bounds: the len is 10 but the index is 10")] + fn test_list_view_array_index_out_of_bound() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list array from the above two + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + assert_eq!(9, list_array.len()); + + list_array.value(10); + } + #[test] + #[should_panic(expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_buffer_len() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic(expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_child_array_len() { + let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic(expected = "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is ListView")] + fn test_from_array_data_validation() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.values().append_value(1); + builder.append(true, 0); + let array = builder.finish(); + let _ = LargeListViewArray::from(array.into_data()); + } + + #[test] + fn test_list_view_array_offsets_need_not_start_at_zero() { + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); + let value_sizes = Buffer::from_slice_ref([0, 0, 3, 2]); + + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .build() + .unwrap(); + + let list_array = ListViewArray::from(list_data); + assert_eq!(list_array.value_length(0), 0); + assert_eq!(list_array.value_length(1), 0); + assert_eq!(list_array.value_length(2), 3); + } + + #[test] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_alignment() { + let offset_buf = Buffer::from_slice_ref([0_u64]); + let offset_buf2 = offset_buf.slice(1); + + let size_buf = Buffer::from_slice_ref([0_u64]); + let size_buf2 = size_buf.slice(1); + + let values: [i32; 8] = [0; 8]; + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from_slice_ref(values)) + .build_unchecked() + }; + + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .add_buffer(offset_buf2) + .add_buffer(size_buf2) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn list_array_equality() { + // test scaffold + fn do_comparison( + lhs_data: Vec>>>, + rhs_data: Vec>>>, + should_equal: bool, + ) { + let lhs = ListViewArray::from_iter_primitive::(lhs_data.clone()); + let rhs = ListViewArray::from_iter_primitive::(rhs_data.clone()); + assert_eq!(lhs == rhs, should_equal); + + let lhs = LargeListViewArray::from_iter_primitive::(lhs_data); + let rhs = LargeListViewArray::from_iter_primitive::(rhs_data); + assert_eq!(lhs == rhs, should_equal); + } + + do_comparison( + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + true, + ); + + do_comparison( + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + false, + ); + + do_comparison( + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(0), Some(0)]), + ], + false, + ); + + do_comparison( + vec![None, None, Some(vec![Some(1)])], + vec![None, None, Some(vec![Some(2)])], + false, + ); + } + + #[test] + fn test_empty_offsets() { + let f = Arc::new(Field::new("element", DataType::Int32, true)); + let string = ListViewArray::from( + ArrayData::builder(DataType::ListView(f.clone())) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + assert_eq!(string.value_sizes(), &[0]); + + let string = LargeListViewArray::from( + ArrayData::builder(DataType::LargeListView(f)) + .buffers(vec![Buffer::from(&[]),Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + assert_eq!(string.value_sizes(), &[0]); + } + + #[test] + fn test_try_new() { + let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); + let sizes = SizeBuffer::new(vec![1, 3, 1, 0 ].into()); + let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); + let values = Arc::new(values) as ArrayRef; + + let field = Arc::new(Field::new("element", DataType::Int32, false)); + ListViewArray::new(field.clone(), offsets.clone(), sizes.clone(), values.clone(), None); + + let nulls = NullBuffer::new_null(4); + ListViewArray::new(field.clone(), offsets, sizes.clone(), values.clone(), Some(nulls)); + + let nulls = NullBuffer::new_null(4); + let offsets = OffsetBuffer::new(vec![0, 1, 2, 3, 4].into()); + let sizes = SizeBuffer::new(vec![1, 1, 1, 1, 0].into()); + let err = LargeListViewArray::try_new(field, offsets.clone(), sizes.clone(), values.clone(), Some(nulls)) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for LargeListViewArray, expected 5 got 4" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = LargeListViewArray::try_new(field.clone(), offsets.clone(), sizes.clone(), values.clone(), None) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: LargeListViewArray expected data type Int64 got Int32 for \"element\"" + ); + + let nulls = NullBuffer::new_null(7); + let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); + let values = Arc::new(values); + + let err = + LargeListViewArray::try_new(field, offsets.clone(), sizes.clone(), values.clone(),None).unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Non-nullable field of LargeListViewArray \"element\" cannot contain nulls" + ); + + } + + #[test] + fn test_from_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[0, 0, 0]); + builder.append(false); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + let list: ListViewArray = builder.finish().into(); + + let values: Vec<_> = list + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) + } +} \ No newline at end of file diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index b115ff9c14cc..f981ba4021a3 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer, SizeBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -65,13 +65,14 @@ mod union_array; pub use union_array::*; mod run_array; - pub use run_array::*; mod byte_view_array; - pub use byte_view_array::*; +mod list_view_array; +pub use list_view_array::*; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -519,6 +520,12 @@ impl PartialEq for GenericListArray { } } +impl PartialEq for GenericListViewArray { + fn eq(&self, other: &Self) -> bool { + self.to_data().eq(&other.to_data()) + } +} + impl PartialEq for MapArray { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) @@ -606,7 +613,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef, DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, + DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef, DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, + DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef, DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, @@ -687,6 +696,22 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { } } +/// Helper function that gets size from an [`ArrayData`] +/// +/// # Safety +unsafe fn get_sizes(data: &ArrayData) -> SizeBuffer { + match data.is_empty() && data.buffers()[1].is_empty() { + true => SizeBuffer::new_empty(), + false => { + let buffer = + ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + // Safety: + // ArrayData is valid + SizeBuffer::new(buffer) + } + } +} + /// Helper function for printing potentially long arrays. fn print_long_array(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result where diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 25903bcf546b..d6fff12e1ab8 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -548,7 +548,7 @@ mod tests { } #[test] - fn test_boxed_primitive_aray_builder() { + fn test_boxed_primitive_array_builder() { let values_builder = make_builder(&DataType::Int32, 5); let mut builder = ListBuilder::new(values_builder); diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs new file mode 100644 index 000000000000..59ec9592f5aa --- /dev/null +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -0,0 +1,705 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +use std::any::Any; +use std::sync::Arc; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer}; +use arrow_schema::{Field, FieldRef}; +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; + +#[derive(Debug)] +pub struct GenericListViewBuilder { + offsets_builder: BufferBuilder, + sizes_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: T, + field: Option, +} + + + + +impl Default for GenericListViewBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + +impl GenericListViewBuilder { + /// Creates a new [`GenericListBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let offsets_builder = BufferBuilder::::new(capacity); + let sizes_builder = BufferBuilder::::new(capacity); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + sizes_builder, + field: None, + } + } + + /// Override the field passed to [`GenericListArray::new`] + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + +impl ArrayBuilder +for GenericListViewBuilder + where + T: 'static, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericListViewBuilder + where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call [`append`](#method.append) to delimit each distinct list value. + pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Returns the child array builder as an immutable reference + pub fn values_ref(&self) -> &T { + &self.values_builder + } + + /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + pub fn append(&mut self, is_valid: bool, size: usize) { + self.offsets_builder.append(OffsetSize::from_usize(self.values_builder.len() - size).unwrap()); + let size = OffsetSize::from_usize(size).unwrap(); + self.sizes_builder.append(size); + self.null_buffer_builder.append(is_valid); + } + + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + /// Append a null to this [`GenericListBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder.append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.sizes_builder.append(OffsetSize::from_usize(0).unwrap()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + + /// Builds the [`GenericListViewArray`] and reset this builder. + pub fn finish(&mut self) -> GenericListViewArray { + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); + + let offsets = self.offsets_builder.finish(); + // Safety: Safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + let sizes = self.sizes_builder.finish(); + let sizes = SizeBuffer::new(sizes.into()); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + GenericListViewArray::new(field, offsets, sizes, values ,nulls) + } + + /// Builds the [`GenericListArray`] without resetting the builder. + pub fn finish_cloned(&self) -> GenericListViewArray { + let values = self.values_builder.finish_cloned(); + let nulls = self.null_buffer_builder.finish_cloned(); + + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice()); + let sizes = SizeBuffer::new(sizes.into()); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } +} + +impl Extend> for GenericListViewBuilder + where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + let origin_size = self.values_builder.len(); + self.values_builder.extend(elements); + let size = self.values_builder.len() - origin_size; + self.append(true, size); + } + None => self.append(false, 0), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{Int32Builder, ListViewBuilder, make_builder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Array, Int32Array}; + use arrow_schema::DataType; + + fn _test_generic_list_view_array_builder() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true,3); + builder.values().append_value(3); + builder.values().append_value(4); + builder.values().append_value(5); + builder.append(true,3); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true,2); + let list_array = builder.finish(); + + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6].map(O::usize_as)); + assert_eq!(list_array.value_sizes(), [3, 3, 2].map(O::usize_as)); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_sizes()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_length(2)); + for i in 0..2 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_builder() { + _test_generic_list_view_array_builder::() + } + + #[test] + fn test_large_list_view_array_builder() { + _test_generic_list_view_array_builder::() + } + + fn _test_generic_list_view_array_builder_nulls() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true,3); + builder.append(false,0); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true,3); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true,2); + + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_sizes()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2)); + + } + + #[test] + fn test_list_view_array_builder_nulls() { + _test_generic_list_view_array_builder_nulls::() + } + + #[test] + fn test_large_list_view_array_builder_nulls() { + _test_generic_list_view_array_builder_nulls::() + } + + #[test] + fn test_list_view_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true,1); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true,1); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert!(builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true,1); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true,1); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true,1); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true,1); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_list_view_array_builder() { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListViewBuilder::new(primitive_builder); + let mut builder = ListViewBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true,2); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true,2); + builder.append(true,2); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true,3); + builder.values().append(false,0); + builder.values().values().append_value(8); + builder.values().append(true,1); + builder.append(true,3); + + builder.append(false,0); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true,2); + builder.append(true,1); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5]); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1]); + + let l2 = l1.values().as_list_view::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8]); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_extend() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6]); + assert_eq!(array.value_sizes(), [4, 0, 2, 0]); + assert_eq!(array.null_count(), 1); + assert!(array.is_null(3)); + let elements = array.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert!(elements.is_null(3)); + } + + #[test] + fn test_boxed_primitive_array_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListViewBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true,1); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true,1); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_view_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_view_large_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + fn test_boxed_generic_list_view_generic_list_view_array_builder( + values_builder: Box, + ) { + let mut builder: GenericListViewBuilder> = + GenericListViewBuilder::>::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,2); + builder.append(true,2); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an (Large)ListViewBuilder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(false,0); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,1); + builder.append(true,3); + + builder.append(false,0); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,2); + builder.append(true,1); + + let l1 = builder.finish(); + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5].map(O::usize_as)); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1].map(O::usize_as)); + + let l2 = l1.values().as_list_view::(); + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8].map(O::usize_as)); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2].map(O::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::ListView(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::ListView(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic(expected = "Non-nullable field of ListViewArray \\\"item\\\" cannot contain nulls")] + fn test_checks_nullability() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListViewArray expected data type Int64 got Int32")] + fn test_checks_data_type() { + let field = Arc::new(Field::new("item", DataType::Int64, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } +} + diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index e4ab7ae4ba23..109492de18ea 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -181,6 +181,8 @@ pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; mod union_builder; +mod generic_list_view_builder; +pub use generic_list_view_builder::*; pub use union_builder::*; @@ -315,3 +317,9 @@ pub type StringBuilder = GenericStringBuilder; /// Builder for [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringBuilder = GenericStringBuilder; + +/// Builder for [`ListViewArray`](crate::array::ListViewArray) +pub type ListViewBuilder = GenericListViewBuilder; + +/// Builder for [`LargeListViewArray`](crate::array::LargeListViewArray) +pub type LargeListViewBuilder = GenericListViewBuilder; diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 1e2e402f745f..52ed42cab6cf 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -253,6 +253,14 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::LargeListView(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new(LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } DataType::Map(field, _) => match field.data_type() { DataType::Struct(fields) => { let map_field_names = MapFieldNames { diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 2e21f3e7e640..12b18e9817f5 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -795,6 +795,14 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`GenericListViewArray`] returning `None` if not possible + fn as_list_view_opt(&self) -> Option<&GenericListViewArray>; + + /// Downcast this to a [`GenericListViewArray`] panicking if not possible + fn as_list_view(&self) -> &GenericListViewArray { + self.as_list_view_opt().expect("list view array") + } + /// Downcast this to a [`FixedSizeBinaryArray`] returning `None` if not possible fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray>; @@ -860,6 +868,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_any().downcast_ref() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_any().downcast_ref() } @@ -907,6 +919,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_ref().as_list_view_opt() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_ref().as_fixed_size_binary_opt() } diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 3f9cc0d525c1..5b36d19359eb 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -19,7 +19,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, - GenericStringArray, PrimitiveArray, + GenericListViewArray, GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; use arrow_buffer::NullBuffer; @@ -141,6 +141,8 @@ pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; /// an iterator that returns Some(T) or None, that can be used on any ListArray pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; +/// an iterator that returns Some(T) or None, that can be used on any ListArray +pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray>; /// an iterator that returns Some(T) or None, that can be used on any MapArray pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index c56b1fd308cf..5fcfaa0b2709 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -626,9 +626,7 @@ mod tests { use std::collections::HashMap; use super::*; - use crate::{ - BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray, - }; + use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index d33e68795e4e..c00bd326b3cb 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -32,4 +32,7 @@ pub use boolean::*; mod null; pub use null::*; mod run; +mod size; +pub use size::*; + pub use run::*; diff --git a/arrow-buffer/src/buffer/size.rs b/arrow-buffer/src/buffer/size.rs new file mode 100644 index 000000000000..9d2e5889ca1c --- /dev/null +++ b/arrow-buffer/src/buffer/size.rs @@ -0,0 +1,91 @@ +use std::ops::Deref; +use crate::{ArrowNativeType, MutableBuffer, ScalarBuffer}; + +#[derive(Debug, Clone)] +pub struct SizeBuffer(ScalarBuffer); + +impl SizeBuffer { + + /// Create a new [`SizeBuffer`] containing `len` `0` values + pub fn new_zeroed(len: usize) -> Self { + let len_bytes = len.checked_mul(std::mem::size_of::()).expect("overflow"); + let buffer = MutableBuffer::from_len_zeroed(len_bytes); + Self(buffer.into_buffer().into()) + } + + /// Create a new [`SizeBuffer`] from the provided [`ScalarBuffer`] + pub fn new(buffer: ScalarBuffer) -> Self { + assert!(!buffer.is_empty(), "offsets cannot be empty"); + assert!( + buffer[0] >= O::usize_as(0), + "offsets must be greater than 0" + ); + Self(buffer) + } + + /// Create a new [`SizeBuffer`] containing a single 0 value + pub fn new_empty() -> Self { + let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); + Self(buffer.into_buffer().into()) + } + + /// Create a new [`SizeBuffer`] from the iterator of slice lengths + /// + /// ``` + /// # use arrow_buffer::SizeBuffer; + /// let offsets = SizeBuffer::::from_lengths([1, 3, 5]); + /// assert_eq!(offsets.as_ref(), &[1, 3, 5]); + /// ``` + pub fn from_lengths(lengths: I) -> Self + where + I: IntoIterator, + { + let iter = lengths.into_iter(); + let mut out = Vec::with_capacity(iter.size_hint().0); + + for size in iter { + out.push(O::usize_as(size)) + } + Self(out.into()) + } + + /// Returns the inner [`ScalarBuffer`] + pub fn inner(&self) -> &ScalarBuffer { + &self.0 + } + + /// Returns the inner [`ScalarBuffer`], consuming self + pub fn into_inner(self) -> ScalarBuffer { + self.0 + } + + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self(self.0.slice(offset, len.saturating_add(1))) + } + + /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.0.ptr_eq(&other.0) + } +} + + +impl Deref for SizeBuffer { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef<[T]> for SizeBuffer { + #[inline] + fn as_ref(&self) -> &[T] { + self + } +} diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index e227b168eee5..0d832a453693 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -119,17 +119,21 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff [buffer, empty_buffer] } DataType::ListView(_) => [ + // offset buffer MutableBuffer::new(capacity * mem::size_of::()), + // size buffer MutableBuffer::new(capacity * mem::size_of::()), ], DataType::LargeList(_) => { - // offset buffer always starts with a zero + // offset buffer let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); buffer.push(0i64); [buffer, empty_buffer] } DataType::LargeListView(_) => [ + // offset buffer MutableBuffer::new(capacity * mem::size_of::()), + // size buffer MutableBuffer::new(capacity * mem::size_of::()), ], DataType::FixedSizeBinary(size) => { @@ -856,6 +860,17 @@ impl ArrayData { self.typed_buffer(0, self.len + 1) } + /// Returns a reference to the data in `buffer` as a typed slice + /// after validating. The returned slice is guaranteed to have at + /// least `len` entries. + fn typed_sizes(&self) -> Result<&[T], ArrowError> { + // An empty list-like array can have 0 sizes + if self.len == 0 && self.buffers[1].is_empty() { + return Ok(&[]); + } + self.typed_buffer(1, self.len) + } + /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating fn typed_buffer( &self, @@ -929,6 +944,26 @@ impl ArrayData { Ok(()) } + fn validate_sizes( + &self, + values_length: usize, + ) -> Result<(), ArrowError> { + let sizes = self.typed_sizes::()?; + if sizes.is_empty() { + return Ok(()); + } + + let total_size: usize = sizes.iter().map(|x| x.to_usize().unwrap()).sum(); + if total_size > values_length { + return Err(ArrowError::InvalidArgumentError(format!( + "Total size of {} is larger than values length {}", + total_size, values_length, + ))); + } + + Ok(()) + } + /// Validates the layout of `child_data` ArrayData structures fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { @@ -937,11 +972,23 @@ impl ArrayData { self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::ListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(values_data.len)?; + self.validate_sizes::(values_data.len)?; + Ok(()) + } DataType::LargeList(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::LargeListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(values_data.len)?; + self.validate_sizes::(values_data.len)?; + Ok(()) + } DataType::FixedSizeList(field, list_size) => { let values_data = self.get_single_valid_child_data(field.data_type())?; @@ -1546,9 +1593,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } + DataType::ListView(_) => DataTypeLayout::new_list_view::(), + DataType::LargeListView(_) => DataTypeLayout::new_list_view::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, @@ -1652,6 +1698,21 @@ impl DataTypeLayout { variadic: true, } } + + /// Describes a list view type + pub fn new_list_view() -> Self { + Self { + buffers: vec![BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + },BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }], + can_contain_null_mask: true, + variadic: true, + } + } } /// Layout specification for a single data type buffer diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs new file mode 100644 index 000000000000..b6a5d7e945fa --- /dev/null +++ b/arrow-data/src/equal/list_view.rs @@ -0,0 +1,52 @@ +use num::Integer; +use arrow_buffer::ArrowNativeType; +use crate::ArrayData; +use crate::data::count_nulls; + +use super::equal_range; + +pub(super) fn list_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_offsets = lhs.buffer::(0); + let rhs_offsets = rhs.buffer::(0); + let lhs_sizes = lhs.buffer::(1); + let rhs_sizes = rhs.buffer::(1); + for i in 0..len { + + // compare offsets and sizes + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + let lhs_offset_start = lhs_offsets[lhs_pos].to_usize().unwrap(); + let rhs_offset_start = rhs_offsets[rhs_pos].to_usize().unwrap(); + let lhs_size = lhs_sizes[lhs_pos].to_usize().unwrap(); + let rhs_size = rhs_sizes[rhs_pos].to_usize().unwrap(); + if lhs_size != rhs_size { + return false; + } + + // compare nulls + let lhs_null_count = count_nulls(lhs.nulls(), lhs_offset_start, lhs_size); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_offset_start, lhs_size); + if lhs_null_count != rhs_null_count { + return false; + } + + print!("lhs_offset_start: {}, rhs_offset_start: {}, lhs_size: {}\n", lhs_offset_start, rhs_offset_start, lhs_size); + // compare values + if !equal_range( + &lhs.child_data()[0], + &rhs.child_data()[0], + lhs_offset_start, + lhs_offset_start, + lhs_size, + ) { + return false; + } + } + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index dba6a0186a56..0c721a595eb4 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -37,6 +37,7 @@ mod structure; mod union; mod utils; mod variable_size; +mod list_view; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used @@ -52,6 +53,7 @@ use primitive::primitive_equal; use structure::struct_equal; use union::union_equal; use variable_size::variable_sized_equal; +use crate::equal::list_view::list_view_equal; use self::run::run_equal; @@ -102,10 +104,9 @@ fn equal_values( byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not yet implemented") - } + DataType::ListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), From d8aba1bd892cb5d660e0cc7db4fec2617b99cc46 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 31 Mar 2024 21:39:05 +0800 Subject: [PATCH 2/5] chore: cargo clippy & fmt --- arrow-array/src/array/list_view_array.rs | 150 ++++++++---- arrow-array/src/array/mod.rs | 3 +- .../src/builder/generic_list_view_builder.rs | 214 +++++++++--------- arrow-array/src/builder/mod.rs | 2 +- arrow-array/src/builder/struct_builder.rs | 4 +- arrow-array/src/record_batch.rs | 4 +- arrow-buffer/src/buffer/size.rs | 27 ++- arrow-data/src/data.rs | 19 +- arrow-data/src/equal/list_view.rs | 27 ++- arrow-data/src/equal/mod.rs | 4 +- 10 files changed, 272 insertions(+), 182 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index b0e3a0f26898..bd54aa28716c 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -17,15 +17,20 @@ use crate::array::{get_offsets, get_sizes, make_array, print_long_array}; use crate::builder::{GenericListViewBuilder, PrimitiveBuilder}; -use crate::{new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, FixedSizeListArray}; +use crate::iterator::GenericListViewArrayIter; +use crate::{ + new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, + OffsetSizeTrait, +}; use arrow_buffer::{NullBuffer, OffsetBuffer, SizeBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; use std::any::Any; use std::ops::Add; use std::sync::Arc; -use crate::iterator::GenericListViewArrayIter; +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i32`. +/// // See [`ListViewBuilder`](crate::builder::ListViewBuilder) for how to construct a [`ListViewArray`] pub type ListViewArray = GenericListViewArray; @@ -48,7 +53,6 @@ pub struct GenericListViewArray { len: usize, } - impl Clone for GenericListViewArray { fn clone(&self) -> Self { Self { @@ -72,6 +76,7 @@ impl GenericListViewArray { DataType::ListView }; + /// Returns the data type of the list view array pub fn try_new( field: FieldRef, offsets: OffsetBuffer, @@ -79,8 +84,6 @@ impl GenericListViewArray { values: ArrayRef, nulls: Option, ) -> Result { - - //todo: check illegal offset and size values let mut len = 0usize; for i in 0..offsets.len() { let offset = offsets[i].as_usize(); @@ -95,7 +98,7 @@ impl GenericListViewArray { } let len = offsets.len(); - if len != sizes.len() { + if len != sizes.len() { return Err(ArrowError::InvalidArgumentError(format!( "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", OffsetSize::PREFIX, len, sizes.len() @@ -129,7 +132,6 @@ impl GenericListViewArray { ))); } - Ok(Self { data_type: Self::DATA_TYPE_CONSTRUCTOR(field), nulls, @@ -164,7 +166,7 @@ impl GenericListViewArray { value_offsets: OffsetBuffer::new_zeroed(len), value_sizes: SizeBuffer::new_zeroed(len), values, - len: 0usize + len: 0usize, } } @@ -182,7 +184,13 @@ impl GenericListViewArray { DataType::ListView(f) | DataType::LargeListView(f) => f, _ => unreachable!(), }; - (f, self.value_offsets, self.value_sizes, self.values, self.nulls) + ( + f, + self.value_offsets, + self.value_sizes, + self.values, + self.nulls, + ) } /// Returns a reference to the offsets of this list @@ -209,7 +217,6 @@ impl GenericListViewArray { &self.value_sizes } - /// Returns a clone of the value type of this list. pub fn value_type(&self) -> DataType { self.values.data_type().clone() @@ -276,11 +283,13 @@ impl GenericListViewArray { } } + /// Creates a [`GenericListViewArray`] from an iterator of primitive values + /// pub fn from_iter_primitive(iter: I) -> Self - where - T: ArrowPrimitiveType, - P: IntoIterator::Native>>, - I: IntoIterator>, + where + T: ArrowPrimitiveType, + P: IntoIterator::Native>>, + I: IntoIterator>, { let iter = iter.into_iter(); let size_hint = iter.size_hint().0; @@ -316,7 +325,6 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray } } - impl Array for GenericListViewArray { fn as_any(&self) -> &dyn Any { self @@ -373,7 +381,7 @@ impl Array for GenericListViewArray { } } -impl std::fmt::Debug for GenericListViewArray { +impl std::fmt::Debug for GenericListViewArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let prefix = OffsetSize::PREFIX; write!(f, "{prefix}ListViewArray\n[\n")?; @@ -390,7 +398,10 @@ impl From> for Arr let builder = ArrayDataBuilder::new(array.data_type) .len(len) .nulls(array.nulls) - .buffers(vec![array.value_offsets.into_inner().into_inner(), array.value_sizes.into_inner().into_inner()]) + .buffers(vec![ + array.value_offsets.into_inner().into_inner(), + array.value_sizes.into_inner().into_inner(), + ]) .child_data(vec![array.values.to_data()]); unsafe { builder.build_unchecked() } @@ -475,17 +486,16 @@ impl GenericListViewArray { } } - #[cfg(test)] mod tests { use super::*; + use crate::builder::{FixedSizeListBuilder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Int32Array, Int64Array}; use arrow_buffer::{bit_util, Buffer, ScalarBuffer}; use arrow_schema::DataType::LargeListView; use arrow_schema::Field; - use crate::builder::{FixedSizeListBuilder, Int32Builder, ListViewBuilder}; - use crate::cast::AsArray; fn create_from_buffers() -> ListViewArray { // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -494,10 +504,9 @@ mod tests { let field = Arc::new(Field::new("item", DataType::Int32, true)); let sizes = SizeBuffer::new(ScalarBuffer::from(vec![3, 3, 2])); - ListViewArray::new(field, offsets, sizes,Arc::new(values), None) + ListViewArray::new(field, offsets, sizes, Arc::new(values), None) } - #[test] fn test_from_iter_primitive() { let data = vec![ @@ -525,7 +534,8 @@ mod tests { let value_sizes = Buffer::from([]); // Construct a list view array from the above two - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(0) .add_buffer(value_offsets) @@ -538,7 +548,6 @@ mod tests { assert_eq!(list_array.len(), 0) } - #[test] fn test_list_view_array() { // Construct a value array @@ -555,7 +564,8 @@ mod tests { let value_sizes = Buffer::from_slice_ref([3, 3, 2, 0]); // Construct a list view array from the above two - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -651,7 +661,7 @@ mod tests { let value_sizes = Buffer::from_slice_ref([3i64, 3, 2, 0]); // Construct a list view array from the above two - let list_data_type = DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -730,7 +740,6 @@ mod tests { ); } - #[test] fn test_list_view_array_slice() { // Construct a value array @@ -754,7 +763,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list view array from the above two - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -787,7 +797,10 @@ mod tests { } // Check offset and length for each non-null value. - let sliced_list_array = sliced_array.as_any().downcast_ref::().unwrap(); + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); assert_eq!(2, sliced_list_array.value_offsets()[2]); assert_eq!(2, sliced_list_array.value_sizes()[2]); assert_eq!(2, sliced_list_array.value_length(2)); @@ -824,7 +837,7 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list view array from the above two - let list_data_type = DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -913,7 +926,9 @@ mod tests { list_array.value(10); } #[test] - #[should_panic(expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0")] + #[should_panic( + expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0" + )] #[cfg(not(feature = "force_validate"))] fn test_list_view_array_invalid_buffer_len() { let value_data = unsafe { @@ -922,7 +937,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build_unchecked() }; - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -933,11 +949,14 @@ mod tests { } #[test] - #[should_panic(expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1")] + #[should_panic( + expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1" + )] #[cfg(not(feature = "force_validate"))] fn test_list_view_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -948,7 +967,9 @@ mod tests { } #[test] - #[should_panic(expected = "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is ListView")] + #[should_panic( + expected = "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is ListView" + )] fn test_from_array_data_validation() { let mut builder = ListViewBuilder::new(Int32Builder::new()); builder.values().append_value(1); @@ -968,7 +989,8 @@ mod tests { let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); let value_sizes = Buffer::from_slice_ref([0, 0, 3, 2]); - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1000,7 +1022,8 @@ mod tests { .build_unchecked() }; - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .add_buffer(offset_buf2) @@ -1098,7 +1121,7 @@ mod tests { let string = LargeListViewArray::from( ArrayData::builder(DataType::LargeListView(f)) - .buffers(vec![Buffer::from(&[]),Buffer::from(&[])]) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) .add_child_data(ArrayData::new_empty(&DataType::Int32)) .build() .unwrap(), @@ -1111,21 +1134,39 @@ mod tests { #[test] fn test_try_new() { let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); - let sizes = SizeBuffer::new(vec![1, 3, 1, 0 ].into()); + let sizes = SizeBuffer::new(vec![1, 3, 1, 0].into()); let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); let values = Arc::new(values) as ArrayRef; let field = Arc::new(Field::new("element", DataType::Int32, false)); - ListViewArray::new(field.clone(), offsets.clone(), sizes.clone(), values.clone(), None); + ListViewArray::new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ); let nulls = NullBuffer::new_null(4); - ListViewArray::new(field.clone(), offsets, sizes.clone(), values.clone(), Some(nulls)); + ListViewArray::new( + field.clone(), + offsets, + sizes.clone(), + values.clone(), + Some(nulls), + ); let nulls = NullBuffer::new_null(4); let offsets = OffsetBuffer::new(vec![0, 1, 2, 3, 4].into()); let sizes = SizeBuffer::new(vec![1, 1, 1, 1, 0].into()); - let err = LargeListViewArray::try_new(field, offsets.clone(), sizes.clone(), values.clone(), Some(nulls)) - .unwrap_err(); + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + Some(nulls), + ) + .unwrap_err(); assert_eq!( err.to_string(), @@ -1133,8 +1174,14 @@ mod tests { ); let field = Arc::new(Field::new("element", DataType::Int64, false)); - let err = LargeListViewArray::try_new(field.clone(), offsets.clone(), sizes.clone(), values.clone(), None) - .unwrap_err(); + let err = LargeListViewArray::try_new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); assert_eq!( err.to_string(), @@ -1145,14 +1192,19 @@ mod tests { let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); let values = Arc::new(values); - let err = - LargeListViewArray::try_new(field, offsets.clone(), sizes.clone(), values.clone(),None).unwrap_err(); + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); assert_eq!( err.to_string(), "Invalid argument error: Non-nullable field of LargeListViewArray \"element\" cannot contain nulls" ); - } #[test] @@ -1172,4 +1224,4 @@ mod tests { .collect(); assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) } -} \ No newline at end of file +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index f981ba4021a3..debee567e822 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -703,8 +703,7 @@ unsafe fn get_sizes(data: &ArrayData) -> SizeBuffer { match data.is_empty() && data.buffers()[1].is_empty() { true => SizeBuffer::new_empty(), false => { - let buffer = - ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + let buffer = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); // Safety: // ArrayData is valid SizeBuffer::new(buffer) diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs index 59ec9592f5aa..1fb55bb5e11a 100644 --- a/arrow-array/src/builder/generic_list_view_builder.rs +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -15,14 +15,15 @@ // specific language governing permissions and limitations // under the License. - -use std::any::Any; -use std::sync::Arc; -use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer}; -use arrow_schema::{Field, FieldRef}; use crate::builder::ArrayBuilder; use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer}; +use arrow_schema::{Field, FieldRef}; +use std::any::Any; +use std::sync::Arc; +/// Builder for [`GenericListViewArray`] +/// #[derive(Debug)] pub struct GenericListViewBuilder { offsets_builder: BufferBuilder, @@ -32,54 +33,16 @@ pub struct GenericListViewBuilder field: Option, } - - - impl Default for GenericListViewBuilder { fn default() -> Self { Self::new(T::default()) } } -impl GenericListViewBuilder { - /// Creates a new [`GenericListBuilder`] from a given values array builder - pub fn new(values_builder: T) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, capacity) - } - - /// Creates a new [`GenericListBuilder`] from a given values array builder - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, capacity: usize) -> Self { - let offsets_builder = BufferBuilder::::new(capacity); - let sizes_builder = BufferBuilder::::new(capacity); - Self { - offsets_builder, - null_buffer_builder: NullBufferBuilder::new(capacity), - values_builder, - sizes_builder, - field: None, - } - } - - /// Override the field passed to [`GenericListArray::new`] - /// - /// By default a nullable field is created with the name `item` - /// - /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the - /// field's data type does not match that of `T` - pub fn with_field(self, field: impl Into) -> Self { - Self { - field: Some(field.into()), - ..self - } - } -} - impl ArrayBuilder -for GenericListViewBuilder - where - T: 'static, + for GenericListViewBuilder +where + T: 'static, { /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &dyn Any { @@ -112,9 +75,44 @@ for GenericListViewBuilder } } +impl GenericListViewBuilder { + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let offsets_builder = BufferBuilder::::new(capacity); + let sizes_builder = BufferBuilder::::new(capacity); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + sizes_builder, + field: None, + } + } + + /// Override the field passed to [`GenericListViewArray::new`] + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + impl GenericListViewBuilder - where - T: 'static, +where + T: 'static, { /// Returns the child array builder as a mutable reference. /// @@ -136,39 +134,43 @@ impl GenericListViewBuilder(&mut self, i: I) - where - T: Extend>, - I: IntoIterator>, + where + T: Extend>, + I: IntoIterator>, { self.extend(std::iter::once(Some(i))) } - /// Append a null to this [`GenericListBuilder`] + /// Append a null to this [`GenericListViewBuilder`] /// /// See [`Self::append_value`] for an example use. #[inline] pub fn append_null(&mut self) { - self.offsets_builder.append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); - self.sizes_builder.append(OffsetSize::from_usize(0).unwrap()); + self.offsets_builder + .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.sizes_builder + .append(OffsetSize::from_usize(0).unwrap()); self.null_buffer_builder.append_null(); } - /// Appends an optional value into this [`GenericListBuilder`] + /// Appends an optional value into this [`GenericListViewBuilder`] /// /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] #[inline] pub fn append_option(&mut self, i: Option) - where - T: Extend>, - I: IntoIterator>, + where + T: Extend>, + I: IntoIterator>, { match i { Some(i) => self.append_value(i), @@ -192,10 +194,10 @@ impl GenericListViewBuilder f.clone(), None => Arc::new(Field::new("item", values.data_type().clone(), true)), }; - GenericListViewArray::new(field, offsets, sizes, values ,nulls) + GenericListViewArray::new(field, offsets, sizes, values, nulls) } - /// Builds the [`GenericListArray`] without resetting the builder. + /// Builds the [`GenericListViewArray`] without resetting the builder. pub fn finish_cloned(&self) -> GenericListViewArray { let values = self.values_builder.finish_cloned(); let nulls = self.null_buffer_builder.finish_cloned(); @@ -222,10 +224,10 @@ impl GenericListViewBuilder Extend> for GenericListViewBuilder - where - O: OffsetSizeTrait, - B: ArrayBuilder + Extend, - V: IntoIterator, +where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, { #[inline] fn extend>>(&mut self, iter: T) { @@ -246,7 +248,7 @@ impl Extend> for GenericListViewBuilder #[cfg(test)] mod tests { use super::*; - use crate::builder::{Int32Builder, ListViewBuilder, make_builder}; + use crate::builder::{make_builder, Int32Builder, ListViewBuilder}; use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Array, Int32Array}; @@ -260,14 +262,14 @@ mod tests { builder.values().append_value(0); builder.values().append_value(1); builder.values().append_value(2); - builder.append(true,3); + builder.append(true, 3); builder.values().append_value(3); builder.values().append_value(4); builder.values().append_value(5); - builder.append(true,3); + builder.append(true, 3); builder.values().append_value(6); builder.values().append_value(7); - builder.append(true,2); + builder.append(true, 2); let list_array = builder.finish(); let list_values = list_array.values().as_primitive::(); @@ -304,15 +306,15 @@ mod tests { builder.values().append_value(0); builder.values().append_value(1); builder.values().append_value(2); - builder.append(true,3); - builder.append(false,0); + builder.append(true, 3); + builder.append(false, 0); builder.values().append_value(3); builder.values().append_null(); builder.values().append_value(5); - builder.append(true,3); + builder.append(true, 3); builder.values().append_value(6); builder.values().append_value(7); - builder.append(true,2); + builder.append(true, 2); let list_array = builder.finish(); @@ -322,7 +324,6 @@ mod tests { assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); assert_eq!(O::from_usize(3).unwrap(), list_array.value_sizes()[2]); assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2)); - } #[test] @@ -341,16 +342,16 @@ mod tests { let mut builder = ListViewBuilder::new(values_builder); builder.values().append_slice(&[1, 2, 3]); - builder.append(true,1); + builder.append(true, 1); builder.values().append_slice(&[4, 5, 6]); - builder.append(true,1); + builder.append(true, 1); let mut arr = builder.finish(); assert_eq!(2, arr.len()); assert!(builder.is_empty()); builder.values().append_slice(&[7, 8, 9]); - builder.append(true,1); + builder.append(true, 1); arr = builder.finish(); assert_eq!(1, arr.len()); assert!(builder.is_empty()); @@ -362,16 +363,16 @@ mod tests { let mut builder = ListViewBuilder::new(values_builder); builder.values().append_slice(&[1, 2, 3]); - builder.append(true,1); + builder.append(true, 1); builder.values().append_slice(&[4, 5, 6]); - builder.append(true,1); + builder.append(true, 1); let mut arr = builder.finish_cloned(); assert_eq!(2, arr.len()); assert!(!builder.is_empty()); builder.values().append_slice(&[7, 8, 9]); - builder.append(true,1); + builder.append(true, 1); arr = builder.finish(); assert_eq!(3, arr.len()); assert!(builder.is_empty()); @@ -386,27 +387,27 @@ mod tests { // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] builder.values().values().append_value(1); builder.values().values().append_value(2); - builder.values().append(true,2); + builder.values().append(true, 2); builder.values().values().append_value(3); builder.values().values().append_value(4); - builder.values().append(true,2); - builder.append(true,2); + builder.values().append(true, 2); + builder.append(true, 2); builder.values().values().append_value(5); builder.values().values().append_value(6); builder.values().values().append_value(7); - builder.values().append(true,3); - builder.values().append(false,0); + builder.values().append(true, 3); + builder.values().append(false, 0); builder.values().values().append_value(8); - builder.values().append(true,1); - builder.append(true,3); + builder.values().append(true, 1); + builder.append(true, 3); - builder.append(false,0); + builder.append(false, 0); builder.values().values().append_value(9); builder.values().values().append_value(10); - builder.values().append(true,2); - builder.append(true,1); + builder.values().append(true, 2); + builder.append(true, 1); let l1 = builder.finish(); @@ -461,7 +462,7 @@ mod tests { .downcast_mut::() .expect("should be an Int32Builder") .append_slice(&[1, 2, 3]); - builder.append(true,1); + builder.append(true, 1); builder .values() @@ -469,7 +470,7 @@ mod tests { .downcast_mut::() .expect("should be an Int32Builder") .append_slice(&[4, 5, 6]); - builder.append(true,1); + builder.append(true, 1); let arr = builder.finish(); assert_eq!(2, arr.len()); @@ -498,7 +499,9 @@ mod tests { test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); } - fn test_boxed_generic_list_view_generic_list_view_array_builder( + fn test_boxed_generic_list_view_generic_list_view_array_builder< + O: OffsetSizeTrait + PartialEq, + >( values_builder: Box, ) { let mut builder: GenericListViewBuilder> = @@ -530,7 +533,7 @@ mod tests { .as_any_mut() .downcast_mut::>>() .expect("should be an (Large)ListViewBuilder") - .append(true,2); + .append(true, 2); builder .values() .as_any_mut() @@ -556,8 +559,8 @@ mod tests { .as_any_mut() .downcast_mut::>>() .expect("should be an (Large)ListViewBuilder") - .append(true,2); - builder.append(true,2); + .append(true, 2); + builder.append(true, 2); builder .values() @@ -594,13 +597,13 @@ mod tests { .as_any_mut() .downcast_mut::>>() .expect("should be an (Large)ListViewBuilder") - .append(true,3); + .append(true, 3); builder .values() .as_any_mut() .downcast_mut::>>() .expect("should be an (Large)ListViewBuilder") - .append(false,0); + .append(false, 0); builder .values() .as_any_mut() @@ -616,10 +619,10 @@ mod tests { .as_any_mut() .downcast_mut::>>() .expect("should be an (Large)ListViewBuilder") - .append(true,1); - builder.append(true,3); + .append(true, 1); + builder.append(true, 3); - builder.append(false,0); + builder.append(false, 0); builder .values() @@ -646,8 +649,8 @@ mod tests { .as_any_mut() .downcast_mut::>>() .expect("should be an (Large)ListViewBuilder") - .append(true,2); - builder.append(true,1); + .append(true, 2); + builder.append(true, 1); let l1 = builder.finish(); assert_eq!(4, l1.len()); @@ -685,7 +688,9 @@ mod tests { } #[test] - #[should_panic(expected = "Non-nullable field of ListViewArray \\\"item\\\" cannot contain nulls")] + #[should_panic( + expected = "Non-nullable field of ListViewArray \\\"item\\\" cannot contain nulls" + )] fn test_checks_nullability() { let field = Arc::new(Field::new("item", DataType::Int32, false)); let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); @@ -702,4 +707,3 @@ mod tests { builder.finish(); } } - diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 109492de18ea..dccba9b35f83 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -180,8 +180,8 @@ mod generic_byte_run_builder; pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; -mod union_builder; mod generic_list_view_builder; +mod union_builder; pub use generic_list_view_builder::*; pub use union_builder::*; diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 52ed42cab6cf..b3adea27e6eb 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -259,7 +259,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { let builder = make_builder(field.data_type(), capacity); - Box::new(LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + Box::new( + LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone()), + ) } DataType::Map(field, _) => match field.data_type() { DataType::Struct(fields) => { diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 5fcfaa0b2709..c56b1fd308cf 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -626,7 +626,9 @@ mod tests { use std::collections::HashMap; use super::*; - use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray}; + use crate::{ + BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray, + }; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; diff --git a/arrow-buffer/src/buffer/size.rs b/arrow-buffer/src/buffer/size.rs index 9d2e5889ca1c..ff314cc2d975 100644 --- a/arrow-buffer/src/buffer/size.rs +++ b/arrow-buffer/src/buffer/size.rs @@ -1,11 +1,27 @@ -use std::ops::Deref; +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::{ArrowNativeType, MutableBuffer, ScalarBuffer}; +use std::ops::Deref; #[derive(Debug, Clone)] pub struct SizeBuffer(ScalarBuffer); impl SizeBuffer { - /// Create a new [`SizeBuffer`] containing `len` `0` values pub fn new_zeroed(len: usize) -> Self { let len_bytes = len.checked_mul(std::mem::size_of::()).expect("overflow"); @@ -37,8 +53,8 @@ impl SizeBuffer { /// assert_eq!(offsets.as_ref(), &[1, 3, 5]); /// ``` pub fn from_lengths(lengths: I) -> Self - where - I: IntoIterator, + where + I: IntoIterator, { let iter = lengths.into_iter(); let mut out = Vec::with_capacity(iter.size_hint().0); @@ -64,7 +80,7 @@ impl SizeBuffer { Self(self.0.slice(offset, len.saturating_add(1))) } - /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons + /// Returns true if this [`ScalarBuffer`] is equal to `other`, using pointer comparisons /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may /// return false when the arrays are logically equal #[inline] @@ -73,7 +89,6 @@ impl SizeBuffer { } } - impl Deref for SizeBuffer { type Target = [T]; diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 0d832a453693..5c752459d1ab 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -948,7 +948,7 @@ impl ArrayData { &self, values_length: usize, ) -> Result<(), ArrowError> { - let sizes = self.typed_sizes::()?; + let sizes = self.typed_sizes::()?; if sizes.is_empty() { return Ok(()); } @@ -1702,13 +1702,16 @@ impl DataTypeLayout { /// Describes a list view type pub fn new_list_view() -> Self { Self { - buffers: vec![BufferSpec::FixedWidth { - byte_width: mem::size_of::(), - alignment: mem::align_of::(), - },BufferSpec::FixedWidth { - byte_width: mem::size_of::(), - alignment: mem::align_of::(), - }], + buffers: vec![ + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + ], can_contain_null_mask: true, variadic: true, } diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs index b6a5d7e945fa..ffa3ff4e8d1c 100644 --- a/arrow-data/src/equal/list_view.rs +++ b/arrow-data/src/equal/list_view.rs @@ -1,7 +1,24 @@ -use num::Integer; -use arrow_buffer::ArrowNativeType; -use crate::ArrayData; +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::data::count_nulls; +use crate::ArrayData; +use arrow_buffer::ArrowNativeType; +use num::Integer; use super::equal_range; @@ -17,7 +34,6 @@ pub(super) fn list_view_equal( let lhs_sizes = lhs.buffer::(1); let rhs_sizes = rhs.buffer::(1); for i in 0..len { - // compare offsets and sizes let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; @@ -28,15 +44,12 @@ pub(super) fn list_view_equal( if lhs_size != rhs_size { return false; } - // compare nulls let lhs_null_count = count_nulls(lhs.nulls(), lhs_offset_start, lhs_size); let rhs_null_count = count_nulls(rhs.nulls(), rhs_offset_start, lhs_size); if lhs_null_count != rhs_null_count { return false; } - - print!("lhs_offset_start: {}, rhs_offset_start: {}, lhs_size: {}\n", lhs_offset_start, rhs_offset_start, lhs_size); // compare values if !equal_range( &lhs.child_data()[0], diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 0c721a595eb4..65bd6eea704c 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -20,6 +20,7 @@ //! depend on dynamic casting of `Array`. use crate::data::ArrayData; +use crate::equal::list_view::list_view_equal; use arrow_buffer::i256; use arrow_schema::{DataType, IntervalUnit}; use half::f16; @@ -30,6 +31,7 @@ mod dictionary; mod fixed_binary; mod fixed_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -37,7 +39,6 @@ mod structure; mod union; mod utils; mod variable_size; -mod list_view; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used @@ -53,7 +54,6 @@ use primitive::primitive_equal; use structure::struct_equal; use union::union_equal; use variable_size::variable_sized_equal; -use crate::equal::list_view::list_view_equal; use self::run::run_equal; From 955eb2b1ae36ff495f54c4dbd81a72d61d48d66e Mon Sep 17 00:00:00 2001 From: kikkon Date: Sat, 6 Apr 2024 21:54:41 +0800 Subject: [PATCH 3/5] fix: remove offsetbuffer --- arrow-array/src/array/list_view_array.rs | 111 +++++++++++------- arrow-array/src/array/mod.rs | 29 +++-- .../src/builder/generic_list_view_builder.rs | 10 +- arrow-buffer/src/buffer/mod.rs | 2 - arrow-buffer/src/buffer/scalar.rs | 13 ++ arrow-buffer/src/buffer/size.rs | 106 ----------------- 6 files changed, 107 insertions(+), 164 deletions(-) delete mode 100644 arrow-buffer/src/buffer/size.rs diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index bd54aa28716c..d7ad2590c3e1 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{get_offsets, get_sizes, make_array, print_long_array}; +use crate::array::{get_view_offsets, get_view_sizes, make_array, print_long_array}; use crate::builder::{GenericListViewBuilder, PrimitiveBuilder}; use crate::iterator::GenericListViewArrayIter; use crate::{ new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, OffsetSizeTrait, }; -use arrow_buffer::{NullBuffer, OffsetBuffer, SizeBuffer}; +use arrow_buffer::{NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; use std::any::Any; @@ -48,8 +48,8 @@ pub struct GenericListViewArray { data_type: DataType, nulls: Option, values: ArrayRef, - value_offsets: OffsetBuffer, - value_sizes: SizeBuffer, + value_offsets: ScalarBuffer, + value_sizes: ScalarBuffer, len: usize, } @@ -76,35 +76,47 @@ impl GenericListViewArray { DataType::ListView }; - /// Returns the data type of the list view array + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Errors + /// + /// Errors if + /// + /// * `offsets.len() != sizes.len()` + /// * `offsets.len() != nulls.len()` + /// * `offsets.last() > values.len()` + /// * `!field.is_nullable() && values.is_nullable()` + /// * `field.data_type() != values.data_type()` + /// * `0 <= offsets[i] <= length of the child array` + /// * `0 <= offsets[i] + size[i] <= length of the child array` pub fn try_new( field: FieldRef, - offsets: OffsetBuffer, - sizes: SizeBuffer, + offsets: ScalarBuffer, + sizes: ScalarBuffer, values: ArrayRef, nulls: Option, ) -> Result { - let mut len = 0usize; for i in 0..offsets.len() { + let length = values.len(); let offset = offsets[i].as_usize(); let size = sizes[i].as_usize(); + if offset > length { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset value for {}ListViewArray, offset: {}, length: {}", + OffsetSize::PREFIX, + offset, + length + ))); + } if offset + size > values.len() { return Err(ArrowError::InvalidArgumentError(format!( "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", OffsetSize::PREFIX, offset, size, values.len() ))); } - len = len.add(size); } let len = offsets.len(); - if len != sizes.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", - OffsetSize::PREFIX, len, sizes.len() - ))); - } - if let Some(n) = nulls.as_ref() { if n.len() != len { return Err(ArrowError::InvalidArgumentError(format!( @@ -114,6 +126,13 @@ impl GenericListViewArray { ))); } } + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", + OffsetSize::PREFIX, len, sizes.len() + ))); + } + if !field.is_nullable() && values.is_nullable() { return Err(ArrowError::InvalidArgumentError(format!( "Non-nullable field of {}ListViewArray {:?} cannot contain nulls", @@ -149,8 +168,8 @@ impl GenericListViewArray { /// Panics if [`Self::try_new`] returns an error pub fn new( field: FieldRef, - offsets: OffsetBuffer, - sizes: SizeBuffer, + offsets: ScalarBuffer, + sizes: ScalarBuffer, values: ArrayRef, nulls: Option, ) -> Self { @@ -163,8 +182,8 @@ impl GenericListViewArray { Self { data_type: Self::DATA_TYPE_CONSTRUCTOR(field), nulls: Some(NullBuffer::new_null(len)), - value_offsets: OffsetBuffer::new_zeroed(len), - value_sizes: SizeBuffer::new_zeroed(len), + value_offsets: ScalarBuffer::new_zeroed(len), + value_sizes: ScalarBuffer::new_zeroed(len), values, len: 0usize, } @@ -175,8 +194,8 @@ impl GenericListViewArray { self, ) -> ( FieldRef, - OffsetBuffer, - SizeBuffer, + ScalarBuffer, + ScalarBuffer, ArrayRef, Option, ) { @@ -195,10 +214,10 @@ impl GenericListViewArray { /// Returns a reference to the offsets of this list /// - /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] + /// Unlike [`Self::value_offsets`] this returns the [`ScalarBuffer`] /// allowing for zero-copy cloning #[inline] - pub fn offsets(&self) -> &OffsetBuffer { + pub fn offsets(&self) -> &ScalarBuffer { &self.value_offsets } @@ -213,7 +232,7 @@ impl GenericListViewArray { /// Unlike [`Self::value_sizes`] this returns the [`SizeBuffer`] /// allowing for zero-copy cloning #[inline] - pub fn sizes(&self) -> &SizeBuffer { + pub fn sizes(&self) -> &ScalarBuffer { &self.value_sizes } @@ -364,7 +383,7 @@ impl Array for GenericListViewArray { fn get_buffer_memory_size(&self) -> usize { let mut size = self.values.get_buffer_memory_size(); - size += self.value_offsets.inner().inner().capacity(); + size += self.value_offsets.inner().capacity(); if let Some(n) = self.nulls.as_ref() { size += n.buffer().capacity(); } @@ -373,7 +392,7 @@ impl Array for GenericListViewArray { fn get_array_memory_size(&self) -> usize { let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); - size += self.value_offsets.inner().inner().capacity(); + size += self.value_offsets.inner().capacity(); if let Some(n) = self.nulls.as_ref() { size += n.buffer().capacity(); } @@ -399,8 +418,8 @@ impl From> for Arr .len(len) .nulls(array.nulls) .buffers(vec![ - array.value_offsets.into_inner().into_inner(), - array.value_sizes.into_inner().into_inner(), + array.value_offsets.into_inner(), + array.value_sizes.into_inner(), ]) .child_data(vec![array.values.to_data()]); @@ -421,9 +440,20 @@ impl From for GenericListViewAr DataType::FixedSizeList(f, size) => (f, *size as usize), _ => unreachable!(), }; - - let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); - let sizes = SizeBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + let iter = std::iter::repeat(size).take(value.len()); + let mut offsets = Vec::with_capacity(iter.size_hint().0); + offsets.push(OffsetSize::usize_as(0)); + let mut acc = 0_usize; + let iter = std::iter::repeat(size).take(value.len()); + let mut sizes = Vec::with_capacity(iter.size_hint().0); + for size in iter { + acc = acc.checked_add(size).expect("usize overflow"); + offsets.push(OffsetSize::usize_as(acc)); + sizes.push(OffsetSize::usize_as(size)); + } + OffsetSize::from_usize(acc).expect("offset overflow"); + let sizes = ScalarBuffer::from(sizes); + let offsets = ScalarBuffer::from(offsets); Self { data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), nulls: value.nulls().cloned(), @@ -472,8 +502,8 @@ impl GenericListViewArray { let values = make_array(values); // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { get_offsets(&data) }; - let value_sizes = unsafe { get_sizes(&data) }; + let value_offsets = get_view_offsets(&data); + let value_sizes = get_view_sizes(&data); Ok(Self { data_type: data.data_type().clone(), @@ -500,9 +530,9 @@ mod tests { fn create_from_buffers() -> ListViewArray { // [[0, 1, 2], [3, 4, 5], [6, 7]] let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); - let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6])); + let offsets = ScalarBuffer::from(vec![0, 3, 6]); let field = Arc::new(Field::new("item", DataType::Int32, true)); - let sizes = SizeBuffer::new(ScalarBuffer::from(vec![3, 3, 2])); + let sizes = ScalarBuffer::from(vec![3, 3, 2]); ListViewArray::new(field, offsets, sizes, Arc::new(values), None) } @@ -1133,8 +1163,8 @@ mod tests { #[test] fn test_try_new() { - let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); - let sizes = SizeBuffer::new(vec![1, 3, 1, 0].into()); + let offsets = ScalarBuffer::from(vec![0, 1, 4, 5]); + let sizes = ScalarBuffer::from(vec![1, 3, 1, 0]); let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); let values = Arc::new(values) as ArrayRef; @@ -1157,8 +1187,8 @@ mod tests { ); let nulls = NullBuffer::new_null(4); - let offsets = OffsetBuffer::new(vec![0, 1, 2, 3, 4].into()); - let sizes = SizeBuffer::new(vec![1, 1, 1, 1, 0].into()); + let offsets = ScalarBuffer::from(vec![0, 1, 2, 3, 4]); + let sizes = ScalarBuffer::from(vec![1, 1, 1, 1, 0]); let err = LargeListViewArray::try_new( field, offsets.clone(), @@ -1217,7 +1247,6 @@ mod tests { builder.values().append_slice(&[4, 5, 6]); builder.append(true); let list: ListViewArray = builder.finish().into(); - let values: Vec<_> = list .iter() .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index debee567e822..957fcb50b1c6 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer, SizeBuffer}; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -696,18 +696,27 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { } } -/// Helper function that gets size from an [`ArrayData`] +/// Helper function that gets list view offset from an [`ArrayData`] /// /// # Safety -unsafe fn get_sizes(data: &ArrayData) -> SizeBuffer { +/// +/// ArrayData must contain a valid [`ScalarBuffer`] as its first buffer +fn get_view_offsets(data: &ArrayData) -> ScalarBuffer { + match data.is_empty() && data.buffers()[0].is_empty() { + true => ScalarBuffer::new_empty(), + false => ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1), + } +} + +/// Helper function that gets list view size from an [`ArrayData`] +/// +/// # Safety +/// +/// ArrayData must contain a valid [`ScalarBuffer`] as its second buffer +fn get_view_sizes(data: &ArrayData) -> ScalarBuffer { match data.is_empty() && data.buffers()[1].is_empty() { - true => SizeBuffer::new_empty(), - false => { - let buffer = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); - // Safety: - // ArrayData is valid - SizeBuffer::new(buffer) - } + true => ScalarBuffer::new_empty(), + false => ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()), } } diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs index 1fb55bb5e11a..024ff6908713 100644 --- a/arrow-array/src/builder/generic_list_view_builder.rs +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -17,7 +17,7 @@ use crate::builder::ArrayBuilder; use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; -use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer}; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; use arrow_schema::{Field, FieldRef}; use std::any::Any; use std::sync::Arc; @@ -185,10 +185,10 @@ where let offsets = self.offsets_builder.finish(); // Safety: Safe by construction - let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + let offsets = ScalarBuffer::from(offsets); let sizes = self.sizes_builder.finish(); - let sizes = SizeBuffer::new(sizes.into()); + let sizes = ScalarBuffer::from(sizes); let field = match &self.field { Some(f) => f.clone(), @@ -204,10 +204,10 @@ where let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); // Safety: safe by construction - let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + let offsets = ScalarBuffer::from(offsets); let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice()); - let sizes = SizeBuffer::new(sizes.into()); + let sizes = ScalarBuffer::from(sizes); let field = match &self.field { Some(f) => f.clone(), diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index c00bd326b3cb..0278dc2c3320 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -32,7 +32,5 @@ pub use boolean::*; mod null; pub use null::*; mod run; -mod size; -pub use size::*; pub use run::*; diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 2019cc79830d..549cd891eff0 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -94,6 +94,19 @@ impl ScalarBuffer { pub fn ptr_eq(&self, other: &Self) -> bool { self.buffer.ptr_eq(&other.buffer) } + + /// Returns the length of this buffer in units of `T` + pub fn new_zeroed(len: usize) -> Self { + let len_bytes = len.checked_mul(std::mem::size_of::()).expect("overflow"); + let buffer = MutableBuffer::from_len_zeroed(len_bytes); + Self::from(buffer) + } + + /// Create a new [`ScalarBuffer`] containing a single 0 value + pub fn new_empty() -> Self { + let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); + Self::from(buffer.into_buffer()) + } } impl Deref for ScalarBuffer { diff --git a/arrow-buffer/src/buffer/size.rs b/arrow-buffer/src/buffer/size.rs deleted file mode 100644 index ff314cc2d975..000000000000 --- a/arrow-buffer/src/buffer/size.rs +++ /dev/null @@ -1,106 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::{ArrowNativeType, MutableBuffer, ScalarBuffer}; -use std::ops::Deref; - -#[derive(Debug, Clone)] -pub struct SizeBuffer(ScalarBuffer); - -impl SizeBuffer { - /// Create a new [`SizeBuffer`] containing `len` `0` values - pub fn new_zeroed(len: usize) -> Self { - let len_bytes = len.checked_mul(std::mem::size_of::()).expect("overflow"); - let buffer = MutableBuffer::from_len_zeroed(len_bytes); - Self(buffer.into_buffer().into()) - } - - /// Create a new [`SizeBuffer`] from the provided [`ScalarBuffer`] - pub fn new(buffer: ScalarBuffer) -> Self { - assert!(!buffer.is_empty(), "offsets cannot be empty"); - assert!( - buffer[0] >= O::usize_as(0), - "offsets must be greater than 0" - ); - Self(buffer) - } - - /// Create a new [`SizeBuffer`] containing a single 0 value - pub fn new_empty() -> Self { - let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); - Self(buffer.into_buffer().into()) - } - - /// Create a new [`SizeBuffer`] from the iterator of slice lengths - /// - /// ``` - /// # use arrow_buffer::SizeBuffer; - /// let offsets = SizeBuffer::::from_lengths([1, 3, 5]); - /// assert_eq!(offsets.as_ref(), &[1, 3, 5]); - /// ``` - pub fn from_lengths(lengths: I) -> Self - where - I: IntoIterator, - { - let iter = lengths.into_iter(); - let mut out = Vec::with_capacity(iter.size_hint().0); - - for size in iter { - out.push(O::usize_as(size)) - } - Self(out.into()) - } - - /// Returns the inner [`ScalarBuffer`] - pub fn inner(&self) -> &ScalarBuffer { - &self.0 - } - - /// Returns the inner [`ScalarBuffer`], consuming self - pub fn into_inner(self) -> ScalarBuffer { - self.0 - } - - /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self(self.0.slice(offset, len.saturating_add(1))) - } - - /// Returns true if this [`ScalarBuffer`] is equal to `other`, using pointer comparisons - /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may - /// return false when the arrays are logically equal - #[inline] - pub fn ptr_eq(&self, other: &Self) -> bool { - self.0.ptr_eq(&other.0) - } -} - -impl Deref for SizeBuffer { - type Target = [T]; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl AsRef<[T]> for SizeBuffer { - #[inline] - fn as_ref(&self) -> &[T] { - self - } -} From af38d1460f560bdf86e80021eefcda4962dfd821 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sat, 6 Apr 2024 22:09:08 +0800 Subject: [PATCH 4/5] fix: doc --- arrow-array/src/array/list_view_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index d7ad2590c3e1..2504fd217d81 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -229,7 +229,7 @@ impl GenericListViewArray { /// Returns a reference to the sizes of this list /// - /// Unlike [`Self::value_sizes`] this returns the [`SizeBuffer`] + /// Unlike [`Self::value_sizes`] this returns the [`ScalarBuffer`] /// allowing for zero-copy cloning #[inline] pub fn sizes(&self) -> &ScalarBuffer { From 82b6095038230b16ed6906f4e362b9c765fe9338 Mon Sep 17 00:00:00 2001 From: kikkon Date: Wed, 17 Apr 2024 22:54:56 +0800 Subject: [PATCH 5/5] chore: remove unused code --- arrow-array/src/array/list_view_array.rs | 56 +++++++++---------- arrow-array/src/array/mod.rs | 24 -------- .../src/builder/generic_list_view_builder.rs | 3 +- arrow-data/src/data.rs | 15 +---- 4 files changed, 29 insertions(+), 69 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 2504fd217d81..905d837bfd05 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{get_view_offsets, get_view_sizes, make_array, print_long_array}; +use crate::array::{make_array, print_long_array}; use crate::builder::{GenericListViewBuilder, PrimitiveBuilder}; use crate::iterator::GenericListViewArrayIter; use crate::{ @@ -50,7 +50,6 @@ pub struct GenericListViewArray { values: ArrayRef, value_offsets: ScalarBuffer, value_sizes: ScalarBuffer, - len: usize, } impl Clone for GenericListViewArray { @@ -61,7 +60,6 @@ impl Clone for GenericListViewArray { values: self.values.clone(), value_offsets: self.value_offsets.clone(), value_sizes: self.value_sizes.clone(), - len: self.len, } } } @@ -96,6 +94,24 @@ impl GenericListViewArray { values: ArrayRef, nulls: Option, ) -> Result { + + let len = offsets.len(); + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", + OffsetSize::PREFIX, len, sizes.len() + ))); + } + for i in 0..offsets.len() { let length = values.len(); let offset = offsets[i].as_usize(); @@ -108,7 +124,7 @@ impl GenericListViewArray { length ))); } - if offset + size > values.len() { + if offset.saturating_add(size) > values.len() { return Err(ArrowError::InvalidArgumentError(format!( "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", OffsetSize::PREFIX, offset, size, values.len() @@ -116,22 +132,7 @@ impl GenericListViewArray { } } - let len = offsets.len(); - if let Some(n) = nulls.as_ref() { - if n.len() != len { - return Err(ArrowError::InvalidArgumentError(format!( - "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", - OffsetSize::PREFIX, - n.len(), - ))); - } - } - if len != sizes.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", - OffsetSize::PREFIX, len, sizes.len() - ))); - } + if !field.is_nullable() && values.is_nullable() { return Err(ArrowError::InvalidArgumentError(format!( @@ -157,7 +158,6 @@ impl GenericListViewArray { values, value_offsets: offsets, value_sizes: sizes, - len, }) } @@ -185,7 +185,6 @@ impl GenericListViewArray { value_offsets: ScalarBuffer::new_zeroed(len), value_sizes: ScalarBuffer::new_zeroed(len), values, - len: 0usize, } } @@ -298,7 +297,6 @@ impl GenericListViewArray { values: self.values.clone(), value_offsets: self.value_offsets.slice(offset, length), value_sizes: self.value_sizes.slice(offset, length), - len: length, } } @@ -366,11 +364,11 @@ impl Array for GenericListViewArray { } fn len(&self) -> usize { - self.len + self.values.len() } fn is_empty(&self) -> bool { - self.value_offsets.len() <= 1 + self.values.is_empty() } fn offset(&self) -> usize { @@ -384,6 +382,7 @@ impl Array for GenericListViewArray { fn get_buffer_memory_size(&self) -> usize { let mut size = self.values.get_buffer_memory_size(); size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); if let Some(n) = self.nulls.as_ref() { size += n.buffer().capacity(); } @@ -393,6 +392,7 @@ impl Array for GenericListViewArray { fn get_array_memory_size(&self) -> usize { let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); if let Some(n) = self.nulls.as_ref() { size += n.buffer().capacity(); } @@ -460,7 +460,6 @@ impl From for GenericListViewAr values: value.values().clone(), value_offsets: offsets, value_sizes: sizes, - len: value.len(), } } } @@ -502,8 +501,8 @@ impl GenericListViewArray { let values = make_array(values); // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = get_view_offsets(&data); - let value_sizes = get_view_sizes(&data); + let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); Ok(Self { data_type: data.data_type().clone(), @@ -511,7 +510,6 @@ impl GenericListViewArray { values, value_offsets, value_sizes, - len: data.len(), }) } } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 957fcb50b1c6..041e598560b8 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -696,30 +696,6 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { } } -/// Helper function that gets list view offset from an [`ArrayData`] -/// -/// # Safety -/// -/// ArrayData must contain a valid [`ScalarBuffer`] as its first buffer -fn get_view_offsets(data: &ArrayData) -> ScalarBuffer { - match data.is_empty() && data.buffers()[0].is_empty() { - true => ScalarBuffer::new_empty(), - false => ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1), - } -} - -/// Helper function that gets list view size from an [`ArrayData`] -/// -/// # Safety -/// -/// ArrayData must contain a valid [`ScalarBuffer`] as its second buffer -fn get_view_sizes(data: &ArrayData) -> ScalarBuffer { - match data.is_empty() && data.buffers()[1].is_empty() { - true => ScalarBuffer::new_empty(), - false => ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()), - } -} - /// Helper function for printing potentially long arrays. fn print_long_array(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result where diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs index 024ff6908713..1e8c8fe125d7 100644 --- a/arrow-array/src/builder/generic_list_view_builder.rs +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -136,8 +136,7 @@ where pub fn append(&mut self, is_valid: bool, size: usize) { self.offsets_builder .append(OffsetSize::from_usize(self.values_builder.len() - size).unwrap()); - let size = OffsetSize::from_usize(size).unwrap(); - self.sizes_builder.append(size); + self.sizes_builder.append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); self.null_buffer_builder.append(is_valid); } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 5c752459d1ab..630b31d18548 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -860,17 +860,6 @@ impl ArrayData { self.typed_buffer(0, self.len + 1) } - /// Returns a reference to the data in `buffer` as a typed slice - /// after validating. The returned slice is guaranteed to have at - /// least `len` entries. - fn typed_sizes(&self) -> Result<&[T], ArrowError> { - // An empty list-like array can have 0 sizes - if self.len == 0 && self.buffers[1].is_empty() { - return Ok(&[]); - } - self.typed_buffer(1, self.len) - } - /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating fn typed_buffer( &self, @@ -948,7 +937,7 @@ impl ArrayData { &self, values_length: usize, ) -> Result<(), ArrowError> { - let sizes = self.typed_sizes::()?; + let sizes: &[T] = self.typed_buffer(1, self.len)?; if sizes.is_empty() { return Ok(()); } @@ -974,7 +963,6 @@ impl ArrayData { } DataType::ListView(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; - self.validate_offsets::(values_data.len)?; self.validate_sizes::(values_data.len)?; Ok(()) } @@ -985,7 +973,6 @@ impl ArrayData { } DataType::LargeListView(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; - self.validate_offsets::(values_data.len)?; self.validate_sizes::(values_data.len)?; Ok(()) }