diff --git a/arrow-array/src/array/bytes_view_array.rs b/arrow-array/src/array/byte_view_array.rs similarity index 67% rename from arrow-array/src/array/bytes_view_array.rs rename to arrow-array/src/array/byte_view_array.rs index dd21c3e7dbc7..0217ce89779c 100644 --- a/arrow-array/src/array/bytes_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -16,21 +16,26 @@ // under the License. use crate::array::print_long_array; -use crate::builder::GenericBytesViewBuilder; +use crate::builder::GenericByteViewBuilder; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; -use crate::types::BytesViewType; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{Array, ArrayAccessor, ArrayRef}; use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; -use arrow_data::{ArrayData, ArrayDataBuilder, BytesView}; +use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; -/// An array of variable length bytes view arrays -pub struct GenericBytesViewArray { +/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays. +/// +/// Different than [`GenericByteArray`] as it stores both an offset and length +/// meaning that take / filter operations can be implemented without copying the underlying data. 
+/// +/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout +pub struct GenericByteViewArray { data_type: DataType, views: ScalarBuffer, buffers: Vec, @@ -38,7 +43,7 @@ pub struct GenericBytesViewArray { nulls: Option, } -impl Clone for GenericBytesViewArray { +impl Clone for GenericByteViewArray { fn clone(&self) -> Self { Self { data_type: T::DATA_TYPE, @@ -50,22 +55,22 @@ impl Clone for GenericBytesViewArray { } } -impl GenericBytesViewArray { - /// Create a new [`GenericBytesViewArray`] from the provided parts, panicking on failure +impl GenericByteViewArray { + /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure /// /// # Panics /// - /// Panics if [`GenericBytesViewArray::try_new`] returns an error + /// Panics if [`GenericByteViewArray::try_new`] returns an error pub fn new(views: ScalarBuffer, buffers: Vec, nulls: Option) -> Self { Self::try_new(views, buffers, nulls).unwrap() } - /// Create a new [`GenericBytesViewArray`] from the provided parts, returning an error on failure + /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure /// /// # Errors /// /// * `views.len() != nulls.len()` - /// * [BytesViewType::validate] fails + /// * [ByteViewType::validate] fails pub fn try_new( views: ScalarBuffer, buffers: Vec, @@ -93,7 +98,7 @@ impl GenericBytesViewArray { }) } - /// Create a new [`GenericBytesViewArray`] from the provided parts, without validation + /// Create a new [`GenericByteViewArray`] from the provided parts, without validation /// /// # Safety /// @@ -112,7 +117,7 @@ impl GenericBytesViewArray { } } - /// Create a new [`GenericBytesViewArray`] of length `len` where all values are null + /// Create a new [`GenericByteViewArray`] of length `len` where all values are null pub fn new_null(len: usize) -> Self { Self { data_type: T::DATA_TYPE, @@ -123,14 +128,14 @@ impl GenericBytesViewArray { } } - /// 
Creates a [`GenericBytesViewArray`] based on an iterator of values without nulls + /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls pub fn from_iter_values(iter: I) -> Self where Ptr: AsRef, I: IntoIterator, { let iter = iter.into_iter(); - let mut builder = GenericBytesViewBuilder::::with_capacity(iter.size_hint().0); + let mut builder = GenericByteViewBuilder::::with_capacity(iter.size_hint().0); for v in iter { builder.append_value(v); } @@ -179,7 +184,7 @@ impl GenericBytesViewArray { let ptr = self.views.as_ptr() as *const u8; std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize) } else { - let view = BytesView::from(*v); + let view = ByteView::from(*v); let data = self.buffers.get_unchecked(view.buffer_index as usize); let offset = view.offset as usize; data.get_unchecked(offset..offset + len as usize) @@ -204,7 +209,7 @@ impl GenericBytesViewArray { } } -impl Debug for GenericBytesViewArray { +impl Debug for GenericByteViewArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{}ViewArray\n[\n", T::PREFIX)?; print_long_array(self, f, |array, index, f| { @@ -214,7 +219,7 @@ impl Debug for GenericBytesViewArray { } } -impl Array for GenericBytesViewArray { +impl Array for GenericByteViewArray { fn as_any(&self) -> &dyn Any { self } @@ -265,19 +270,19 @@ impl Array for GenericBytesViewArray { } } -impl<'a, T: BytesViewType + ?Sized> ArrayAccessor for &'a GenericBytesViewArray { +impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray { type Item = &'a T::Native; fn value(&self, index: usize) -> Self::Item { - GenericBytesViewArray::value(self, index) + GenericByteViewArray::value(self, index) } unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - GenericBytesViewArray::value_unchecked(self, index) + GenericByteViewArray::value_unchecked(self, index) } } -impl<'a, T: BytesViewType + ?Sized> IntoIterator for &'a GenericBytesViewArray { +impl<'a, T: 
ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray { type Item = Option<&'a T::Native>; type IntoIter = ArrayIter; @@ -286,7 +291,7 @@ impl<'a, T: BytesViewType + ?Sized> IntoIterator for &'a GenericBytesViewArray From for GenericBytesViewArray { +impl From for GenericByteViewArray { fn from(value: ArrayData) -> Self { let views = value.buffers()[0].clone(); let views = ScalarBuffer::new(views, value.offset(), value.len()); @@ -301,8 +306,8 @@ impl From for GenericBytesViewArray { } } -impl From> for ArrayData { - fn from(mut array: GenericBytesViewArray) -> Self { +impl From> for ArrayData { + fn from(mut array: GenericByteViewArray) -> Self { let len = array.len(); array.buffers.insert(0, array.views.into_inner()); let builder = ArrayDataBuilder::new(T::DATA_TYPE) @@ -314,22 +319,22 @@ impl From> for ArrayData { } } -impl FromIterator> for GenericBytesViewArray +impl FromIterator> for GenericByteViewArray where Ptr: AsRef, { fn from_iter>>(iter: I) -> Self { let iter = iter.into_iter(); - let mut builder = GenericBytesViewBuilder::::with_capacity(iter.size_hint().0); + let mut builder = GenericByteViewBuilder::::with_capacity(iter.size_hint().0); builder.extend(iter); builder.finish() } } -/// A [`GenericBytesViewArray`] of `[u8]` -pub type BinaryViewArray = GenericBytesViewArray<[u8]>; +/// A [`GenericByteViewArray`] of `[u8]` +pub type BinaryViewArray = GenericByteViewArray; -/// A [`GenericBytesViewArray`] of `str` +/// A [`GenericByteViewArray`] of `str` /// /// ``` /// use arrow_array::StringViewArray; @@ -337,7 +342,7 @@ pub type BinaryViewArray = GenericBytesViewArray<[u8]>; /// assert_eq!(array.value(0), "hello"); /// assert_eq!(array.value(3), "large payload over 12 bytes"); /// ``` -pub type StringViewArray = GenericBytesViewArray; +pub type StringViewArray = GenericByteViewArray; impl From> for StringViewArray { fn from(v: Vec<&str>) -> Self { @@ -348,8 +353,9 @@ impl From> for StringViewArray { #[cfg(test)] mod tests { use 
crate::builder::StringViewBuilder; - use crate::types::BytesViewType; use crate::{Array, BinaryViewArray, StringViewArray}; + use arrow_buffer::{Buffer, ScalarBuffer}; + use arrow_data::ByteView; #[test] fn try_new() { @@ -363,20 +369,22 @@ mod tests { assert_eq!(array.value(3), "large payload over 12 bytes"); let array = BinaryViewArray::from_iter_values(vec![ - b"hello".to_bytes(), - b"world".to_bytes(), - b"lulu".to_bytes(), - b"large payload over 12 bytes".to_bytes(), + b"hello".as_slice(), + b"world".as_slice(), + b"lulu".as_slice(), + b"large payload over 12 bytes".as_slice(), ]); assert_eq!(array.value(0), b"hello"); assert_eq!(array.value(3), b"large payload over 12 bytes"); + // test empty array let array = { let mut builder = StringViewBuilder::new(); builder.finish() }; assert!(array.is_empty()); + // test builder append let array = { let mut builder = StringViewBuilder::new(); builder.append_value("hello"); @@ -387,5 +395,48 @@ mod tests { assert_eq!(array.value(0), "hello"); assert!(array.is_null(1)); assert_eq!(array.value(2), "large payload over 12 bytes"); + + // test builder's in_progress re-created + let array = { + // make a builder with small block size. 
+ let mut builder = StringViewBuilder::new().with_block_size(14); + builder.append_value("large payload over 12 bytes"); + builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created")); + builder.finish() + }; + assert_eq!(array.value(0), "large payload over 12 bytes"); + assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"); + } + + #[test] + #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")] + fn new_with_invalid_view_data() { + let v = "large payload over 12 bytes"; + let view = ByteView { + length: 13, + prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()), + buffer_index: 3, + offset: 1, + }; + let views = ScalarBuffer::from(vec![view.into()]); + let buffers = vec![Buffer::from_slice_ref(v)]; + StringViewArray::new(views, buffers, None); + } + + #[test] + #[should_panic( + expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0" + )] + fn new_with_invalid_utf8_data() { + let v: Vec = vec![0xf0, 0x80, 0x80, 0x80]; + let view = ByteView { + length: v.len() as u32, + prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), + buffer_index: 0, + offset: 0, + }; + let views = ScalarBuffer::from(vec![view.into()]); + let buffers = vec![Buffer::from_slice_ref(v)]; + StringViewArray::new(views, buffers, None); } } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 8513b4050cce..b115ff9c14cc 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -68,9 +68,9 @@ mod run_array; pub use run_array::*; -mod bytes_view_array; +mod byte_view_array; -pub use bytes_view_array::*; +pub use byte_view_array::*; /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: 
std::fmt::Debug + Send + Sync { diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 9e45f2c2345f..66154ce56c46 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -16,20 +16,20 @@ // under the License. use crate::builder::ArrayBuilder; -use crate::types::BytesViewType; -use crate::{ArrayRef, GenericBytesViewArray}; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; +use crate::{ArrayRef, GenericByteViewArray}; use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; -use arrow_data::BytesView; +use arrow_data::ByteView; use std::any::Any; use std::marker::PhantomData; use std::sync::Arc; const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024; -/// A builder for [`GenericBytesViewArray`] +/// A builder for [`GenericByteViewArray`] /// /// See [`Self::append_value`] for the allocation strategy -pub struct GenericBytesViewBuilder { +pub struct GenericByteViewBuilder { views_builder: BufferBuilder, null_buffer_builder: NullBufferBuilder, completed: Vec, @@ -38,7 +38,7 @@ pub struct GenericBytesViewBuilder { phantom: PhantomData, } -impl GenericBytesViewBuilder { +impl GenericByteViewBuilder { /// Creates a new [`GenericByteViewBuilder`]. 
pub fn new() -> Self { Self::with_capacity(1024) @@ -93,7 +93,7 @@ impl GenericBytesViewBuilder { let offset = self.in_progress.len() as u32; self.in_progress.extend_from_slice(v); - let view = BytesView { + let view = ByteView { length, prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), buffer_index: self.completed.len() as u32, @@ -119,8 +119,8 @@ impl GenericBytesViewBuilder { self.views_builder.append(0); } - /// Builds the [`GenericBytesViewArray`] and reset this builder - pub fn finish(&mut self) -> GenericBytesViewArray { + /// Builds the [`GenericByteViewArray`] and reset this builder + pub fn finish(&mut self) -> GenericByteViewArray { let mut completed = std::mem::take(&mut self.completed); if !self.in_progress.is_empty() { completed.push(std::mem::take(&mut self.in_progress).into()); @@ -129,11 +129,11 @@ impl GenericBytesViewBuilder { let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); let nulls = self.null_buffer_builder.finish(); // SAFETY: valid by construction - unsafe { GenericBytesViewArray::new_unchecked(views, completed, nulls) } + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } } - /// Builds the [`GenericBytesViewArray`] without resetting the builder - pub fn finish_cloned(&self) -> GenericBytesViewArray { + /// Builds the [`GenericByteViewArray`] without resetting the builder + pub fn finish_cloned(&self) -> GenericByteViewArray { let mut completed = self.completed.clone(); if !self.in_progress.is_empty() { completed.push(Buffer::from_slice_ref(&self.in_progress)); @@ -143,17 +143,17 @@ impl GenericBytesViewBuilder { let views = ScalarBuffer::new(views, 0, len); let nulls = self.null_buffer_builder.finish_cloned(); // SAFETY: valid by construction - unsafe { GenericBytesViewArray::new_unchecked(views, completed, nulls) } + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } } } -impl Default for GenericBytesViewBuilder { +impl Default for GenericByteViewBuilder { fn default() 
-> Self { Self::new() } } -impl std::fmt::Debug for GenericBytesViewBuilder { +impl std::fmt::Debug for GenericByteViewBuilder { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}ViewBuilder", T::PREFIX)?; f.debug_struct("") @@ -165,7 +165,7 @@ impl std::fmt::Debug for GenericBytesViewBuilder { } } -impl ArrayBuilder for GenericBytesViewBuilder { +impl ArrayBuilder for GenericByteViewBuilder { fn len(&self) -> usize { self.null_buffer_builder.len() } @@ -191,8 +191,8 @@ impl ArrayBuilder for GenericBytesViewBuilder { } } -impl> Extend> - for GenericBytesViewBuilder +impl> Extend> + for GenericByteViewBuilder { #[inline] fn extend>>(&mut self, iter: I) { @@ -206,10 +206,10 @@ impl> Extend> /// /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with /// [`GenericByteViewBuilder::append_null`] as normal. -pub type StringViewBuilder = GenericBytesViewBuilder; +pub type StringViewBuilder = GenericByteViewBuilder; /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] /// /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with /// [`GenericByteViewBuilder::append_null`] as normal. 
-pub type BinaryViewBuilder = GenericBytesViewBuilder<[u8]>; +pub type BinaryViewBuilder = GenericByteViewBuilder; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 65ba46e5cd79..e33f7bde7cba 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1546,14 +1546,16 @@ pub type BinaryType = GenericBinaryType; /// An arrow binary array with i64 offsets pub type LargeBinaryType = GenericBinaryType; -mod bytes_view { +mod byte_view { + use crate::types::{BinaryViewType, StringViewType}; + pub trait Sealed: Send + Sync {} - impl Sealed for str {} - impl Sealed for [u8] {} + impl Sealed for StringViewType {} + impl Sealed for BinaryViewType {} } /// A trait over the variable length bytes view array types -pub trait BytesViewType: bytes_view::Sealed + 'static + PartialEq + AsRef { +pub trait ByteViewType: byte_view::Sealed + 'static + PartialEq + Send + Sync { /// If element in array is utf8 encoded string. const IS_UTF8: bool; @@ -1573,69 +1575,38 @@ pub trait BytesViewType: bytes_view::Sealed + 'static + PartialEq + AsRef type Native: bytes::ByteArrayNativeType + AsRef + AsRef<[u8]> + ?Sized; /// Type for owned corresponding to `Native` - type Owned: Debug + Clone + Sync + Send + AsRef; - - /// # Safety - /// The caller must ensure `index < self.len()`. - unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self; - - /// To bytes slice. 
- fn to_bytes(&self) -> &[u8]; - - /// To owned type - #[allow(clippy::wrong_self_convention)] - fn into_owned(&self) -> Self::Owned; + type Owned: Debug + Clone + Sync + Send + AsRef; /// Verifies that the provided buffers are valid for this array type fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError>; } -impl BytesViewType for str { +/// [`ByteViewType`] for string arrays +#[derive(PartialEq)] +pub struct StringViewType {} + +impl ByteViewType for StringViewType { const IS_UTF8: bool = true; const PREFIX: &'static str = "String"; type Native = str; type Owned = String; - #[inline(always)] - unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { - std::str::from_utf8_unchecked(slice) - } - - #[inline(always)] - fn to_bytes(&self) -> &[u8] { - self.as_bytes() - } - - fn into_owned(&self) -> Self::Owned { - self.to_string() - } - fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { validate_string_view(views, buffers) } } -impl BytesViewType for [u8] { +/// [`ByteViewType`] for binary arrays +#[derive(PartialEq)] +pub struct BinaryViewType {} + +impl ByteViewType for BinaryViewType { const IS_UTF8: bool = false; const PREFIX: &'static str = "Binary"; type Native = [u8]; type Owned = Vec; - #[inline(always)] - unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { - slice - } - - #[inline(always)] - fn to_bytes(&self) -> &[u8] { - self - } - - fn into_owned(&self) -> Self::Owned { - self.to_vec() - } - fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { validate_binary_view(views, buffers) } diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index e7a897240676..c83f867523d5 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -51,7 +51,6 @@ arrow-schema = { workspace = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } -simdutf8 = { version = "0.1.4", default-features = false, features = 
["std", "aarch64_neon"] } [dev-dependencies] diff --git a/arrow-data/src/bytes_view.rs b/arrow-data/src/byte_view.rs similarity index 93% rename from arrow-data/src/bytes_view.rs rename to arrow-data/src/byte_view.rs index 40cc7c2798a6..5e65c1139001 100644 --- a/arrow-data/src/bytes_view.rs +++ b/arrow-data/src/byte_view.rs @@ -20,7 +20,7 @@ use arrow_schema::ArrowError; #[derive(Debug, Copy, Clone, Default)] #[repr(C)] -pub struct BytesView { +pub struct ByteView { /// The length of the string/bytes. pub length: u32, /// First 4 bytes of string/bytes data. @@ -31,23 +31,23 @@ pub struct BytesView { pub offset: u32, } -impl BytesView { +impl ByteView { #[inline(always)] pub fn as_u128(self) -> u128 { unsafe { std::mem::transmute(self) } } } -impl From for BytesView { +impl From for ByteView { #[inline] fn from(value: u128) -> Self { unsafe { std::mem::transmute(value) } } } -impl From for u128 { +impl From for u128 { #[inline] - fn from(value: BytesView) -> Self { + fn from(value: ByteView) -> Self { value.as_u128() } } @@ -60,7 +60,7 @@ pub fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<(), Ar /// Validates the combination of `views` and `buffers` is a valid StringView pub fn validate_string_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { validate_view_impl(views, buffers, |idx, b| { - simdutf8::basic::from_utf8(b).map_err(|e| { + std::str::from_utf8(b).map_err(|e| { ArrowError::InvalidArgumentError(format!( "Encountered non-UTF-8 data at index {idx}: {e}" )) @@ -83,7 +83,7 @@ where } f(idx, &v.to_le_bytes()[4..4 + len as usize])?; } else { - let view = BytesView::from(*v); + let view = ByteView::from(*v); let data = buffers.get(view.buffer_index as usize).ok_or_else(|| { ArrowError::InvalidArgumentError(format!( "Invalid buffer index at {idx}: got index {} but only has {} buffers", diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 6e4f1e283713..a0430394f092 100644 --- a/arrow-data/src/data.rs +++ 
b/arrow-data/src/data.rs @@ -1581,7 +1581,7 @@ pub struct DataTypeLayout { /// Can contain a null bitmask pub can_contain_null_mask: bool, - /// This field only applies to the view type,[`DataType::BinaryView`] and [`DataType::Utf8View`] + /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`] /// If `variadic` is true, the number of buffers expected is only lower-bounded by /// buffers.len(). Buffers that exceed the lower bound are legal. pub variadic: bool, diff --git a/arrow-data/src/equal/bytes_view.rs b/arrow-data/src/equal/byte_view.rs similarity index 93% rename from arrow-data/src/equal/bytes_view.rs rename to arrow-data/src/equal/byte_view.rs index b225030b02f3..def395125366 100644 --- a/arrow-data/src/equal/bytes_view.rs +++ b/arrow-data/src/equal/byte_view.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::{ArrayData, BytesView}; +use crate::{ArrayData, ByteView}; -pub(super) fn bytes_view_equal( +pub(super) fn byte_view_equal( lhs: &ArrayData, rhs: &ArrayData, lhs_start: usize, @@ -53,8 +53,8 @@ pub(super) fn bytes_view_equal( } // check buffers - let l_view = BytesView::from(*l); - let r_view = BytesView::from(*r); + let l_view = ByteView::from(*l); + let r_view = ByteView::from(*r); let l_buffer = &lhs_buffers[l_view.buffer_index as usize]; let r_buffer = &rhs_buffers[r_view.buffer_index as usize]; @@ -69,3 +69,6 @@ pub(super) fn bytes_view_equal( } true } + +#[cfg(test)] +mod tests {} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index be33c7c0c2ad..d08f62ac0539 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -25,7 +25,7 @@ use arrow_schema::{DataType, IntervalUnit}; use half::f16; mod boolean; -mod bytes_view; +mod byte_view; mod dictionary; mod fixed_binary; mod fixed_list; @@ -42,7 +42,7 @@ mod variable_size; // For this reason, they are not exposed and are instead used // to build the 
generic functions below (`equal_range` and `equal`). use boolean::boolean_equal; -use bytes_view::bytes_view_equal; +use byte_view::byte_view_equal; use dictionary::dictionary_equal; use fixed_binary::fixed_binary_equal; use fixed_list::fixed_list_equal; @@ -99,7 +99,7 @@ fn equal_values( } DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::BinaryView | DataType::Utf8View => { - bytes_view_equal(lhs, rhs, lhs_start, rhs_start, len) + byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index 4399d0f3eca2..59a049fe96cf 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -31,5 +31,5 @@ pub mod decimal; #[cfg(feature = "ffi")] pub mod ffi; -mod bytes_view; -pub use bytes_view::*; +mod byte_view; +pub use byte_view::*; diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 9105c522dbda..8afbe11fdd1c 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use super::{data::new_buffers, ArrayData, ArrayDataBuilder, BytesView}; +use super::{data::new_buffers, ArrayData, ArrayDataBuilder, ByteView}; use crate::bit_mask::set_bits; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; @@ -182,7 +182,7 @@ fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend { if len <= 12 { return *v; // Stored inline } - let mut view = BytesView::from(*v); + let mut view = ByteView::from(*v); view.buffer_index += buffer_offset; view.into() })) diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 9bd276428880..15011c547284 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -22,8 +22,8 @@ use arrow::array::{ StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, }; use arrow::datatypes::{Int16Type, Int32Type}; -use arrow_array::builder::{StringBuilder, StructBuilder}; -use arrow_array::{DictionaryArray, FixedSizeListArray}; +use arrow_array::builder::{StringBuilder, StringViewBuilder, StructBuilder}; +use arrow_array::{DictionaryArray, FixedSizeListArray, StringViewArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, Field, Fields}; @@ -307,6 +307,50 @@ fn test_fixed_size_binary_array() { test_equal(&a, &b, true); } +#[test] +fn test_string_view_equal() { + let a1 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + let a2 = StringViewArray::from(vec![ + "a very long string over 12 bytes", + "foo", + "very long string over 12 bytes", + "bar", + ]); + test_equal(&a1, &a2.slice(1, 3), true); + + let a1 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + let a2 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + test_equal(&a1, &a2, true); + + let a1_s = a1.slice(1, 1); + let a2_s = a2.slice(1, 1); + test_equal(&a1_s, &a2_s, true); + + 
let a1_s = a1.slice(2, 1); + let a2_s = a2.slice(0, 1); + test_equal(&a1_s, &a2_s, false); + + // test with null value. + let a1 = StringViewArray::from(vec!["foo", "very long string over 12 bytes", "bar"]); + let a2 = { + let mut builder = StringViewBuilder::new(); + builder.append_value("foo"); + builder.append_null(); + builder.append_option(Some("very long string over 12 bytes")); + builder.append_value("bar"); + builder.finish() + }; + test_equal(&a1, &a2, false); + + let a1_s = a1.slice(1, 2); + let a2_s = a2.slice(1, 3); + test_equal(&a1_s, &a2_s, false); + + let a1_s = a1.slice(1, 2); + let a2_s = a2.slice(2, 2); + test_equal(&a1_s, &a2_s, true); +} + #[test] fn test_string_offset() { let a = StringArray::from(vec![Some("a"), None, Some("b")]);