diff --git a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs index 325966d5ecdf..ff4baafef2de 100644 --- a/crates/polars-arrow/src/array/binview/mod.rs +++ b/crates/polars-arrow/src/array/binview/mod.rs @@ -23,19 +23,22 @@ mod private { impl Sealed for str {} impl Sealed for [u8] {} } +pub use iterator::BinaryViewValueIter; pub use mutable::MutableBinaryViewArray; use private::Sealed; -use crate::array::binview::iterator::BinaryViewValueIter; -use crate::array::binview::view::{ - validate_binary_view, validate_utf8_only_view, validate_utf8_view, -}; +use crate::array::binview::view::{validate_binary_view, validate_utf8_only, validate_utf8_view}; use crate::array::iterator::NonNullValuesIter; use crate::bitmap::utils::{BitmapIter, ZipValidity}; - pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; pub type Utf8ViewArray = BinaryViewArrayGeneric; +pub type MutablePlString = MutableBinaryViewArray; +pub type MutablePlBinary = MutableBinaryViewArray<[u8]>; + +static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView; +static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View; + pub trait ViewType: Sealed + 'static + PartialEq + AsRef { const IS_UTF8: bool; const DATA_TYPE: ArrowDataType; @@ -49,6 +52,8 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef { #[allow(clippy::wrong_self_convention)] fn into_owned(&self) -> Self::Owned; + + fn dtype() -> &'static ArrowDataType; } impl ViewType for str { @@ -69,6 +74,9 @@ impl ViewType for str { fn into_owned(&self) -> Self::Owned { self.to_string() } + fn dtype() -> &'static ArrowDataType { + &UTF8_VIEW_TYPE + } } impl ViewType for [u8] { @@ -89,6 +97,10 @@ impl ViewType for [u8] { fn into_owned(&self) -> Self::Owned { self.to_vec() } + + fn dtype() -> &'static ArrowDataType { + &BIN_VIEW_TYPE + } } pub struct BinaryViewArrayGeneric { @@ -105,6 +117,12 @@ pub struct BinaryViewArrayGeneric { total_buffer_len: usize, } +impl PartialEq for BinaryViewArrayGeneric { + fn eq(&self, other: &Self) -> bool { + self.into_iter().zip(other).all(|(l, r)| l == r) + } +} + impl Clone for BinaryViewArrayGeneric { fn clone(&self) -> Self { Self { @@ -262,7 +280,7 @@ impl BinaryViewArrayGeneric { // data: 12 bytes let bytes = if len <= 12 { - let ptr = self.views.storage_ptr() as *const u8; + let ptr = self.views.as_ptr() as *const u8; std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize) } else { let buffer_idx = (v >> 64) as u32; @@ -285,6 +303,10 @@ impl BinaryViewArrayGeneric { BinaryViewValueIter::new(self) } + pub fn len_iter(&self) -> impl Iterator + '_ { + self.views.iter().map(|v| *v as u32) + } + /// Returns an iterator of the non-null values. pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric> { NonNullValuesIter::new(self, self.validity()) @@ -299,13 +321,19 @@ impl BinaryViewArrayGeneric { impl_mut_validity!(); impl_into_array!(); - pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + pub fn from_slice, P: AsRef<[Option]>>(slice: P) -> Self { let mutable = MutableBinaryViewArray::from_iterator( slice.as_ref().iter().map(|opt_v| opt_v.as_ref()), ); mutable.into() } + pub fn from_slice_values, P: AsRef<[S]>>(slice: P) -> Self { + let mutable = + MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref())); + mutable.into() + } + /// Get the total length of bytes that it would take to concatenate all binary/str values in this array. pub fn total_bytes_len(&self) -> usize { self.total_bytes_len @@ -320,12 +348,40 @@ impl BinaryViewArrayGeneric { pub fn len(&self) -> usize { self.views.len() } + + /// Garbage collect + pub fn gc(self) -> Self { + if self.buffers.is_empty() { + return self; + } + let mut mutable = MutableBinaryViewArray::with_capacity(self.len()); + let buffers = self.raw_buffers.as_ref(); + + for view in self.views.as_ref() { + unsafe { mutable.push_view(*view, buffers) } + } + mutable.freeze().with_validity(self.validity) + } + + pub fn maybe_gc(self) -> Self { + if self.total_buffer_len == 0 { + return self; + } + // Subtract the maximum amount of inlined strings. + let min_in_buffer = self.total_bytes_len.saturating_sub(self.len() * 12); + let frac = (min_in_buffer as f64) / ((self.total_buffer_len() + 1) as f64); + + if frac < 0.25 { + return self.gc(); + } + self + } } impl BinaryViewArray { /// Validate the underlying bytes on UTF-8. pub fn validate_utf8(&self) -> PolarsResult<()> { - validate_utf8_only_view(&self.views, &self.buffers) + validate_utf8_only(&self.views, &self.buffers) } /// Convert [`BinaryViewArray`] to [`Utf8ViewArray`]. @@ -381,7 +437,7 @@ impl Array for BinaryViewArrayGeneric { } fn data_type(&self) -> &ArrowDataType { - &self.data_type + T::dtype() } fn validity(&self) -> Option<&Bitmap> { @@ -397,12 +453,14 @@ impl Array for BinaryViewArrayGeneric { } unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { + debug_assert!(offset + length <= self.len()); self.validity = self .validity .take() .map(|bitmap| bitmap.sliced_unchecked(offset, length)) .filter(|bitmap| bitmap.unset_bits() > 0); self.views.slice_unchecked(offset, length); + self.total_bytes_len = self.len_iter().map(|v| v as usize).sum::(); } fn with_validity(&self, validity: Option) -> Box { diff --git a/crates/polars-arrow/src/array/binview/mutable.rs b/crates/polars-arrow/src/array/binview/mutable.rs index f205e19f4478..e2ded543a5b0 100644 --- a/crates/polars-arrow/src/array/binview/mutable.rs +++ b/crates/polars-arrow/src/array/binview/mutable.rs @@ -1,16 +1,21 @@ +use std::any::Any; +use std::fmt::{Debug, Formatter}; use std::sync::Arc; use polars_error::PolarsResult; use polars_utils::slice::GetSaferUnchecked; -use crate::array::binview::view::validate_utf8_only_view; +use crate::array::binview::view::validate_utf8_only; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; +use crate::array::{Array, MutableArray}; use crate::bitmap::MutableBitmap; use crate::buffer::Buffer; +use crate::datatypes::ArrowDataType; +use crate::legacy::trusted_len::TrustedLenPush; +use crate::trusted_len::TrustedLen; const DEFAULT_BLOCK_SIZE: usize = 8 * 1024; -#[derive(Debug, Clone)] pub struct MutableBinaryViewArray { views: Vec, completed_buffers: Vec>, @@ -23,6 +28,26 @@ pub struct MutableBinaryViewArray { total_buffer_len: usize, } +impl Clone for MutableBinaryViewArray { + fn clone(&self) -> Self { + Self { + views: self.views.clone(), + completed_buffers: self.completed_buffers.clone(), + in_progress_buffer: self.in_progress_buffer.clone(), + validity: self.validity.clone(), + phantom: Default::default(), + total_bytes_len: self.total_bytes_len, + total_buffer_len: self.total_buffer_len, + } + } +} + +impl Debug for MutableBinaryViewArray { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "mutable-binview{:?}", T::DATA_TYPE) + } +} + impl Default for MutableBinaryViewArray { fn default() -> Self { Self::with_capacity(0) @@ -75,17 +100,48 @@ impl MutableBinaryViewArray { self.views.reserve(additional); } + #[inline] pub fn len(&self) -> usize { self.views.len() } - fn init_validity(&mut self) { + #[inline] + pub fn capacity(&self) -> usize { + self.views.capacity() + } + + fn init_validity(&mut self, unset_last: bool) { let mut validity = MutableBitmap::with_capacity(self.views.capacity()); validity.extend_constant(self.len(), true); - validity.set(self.len() - 1, false); + if unset_last { + validity.set(self.len() - 1, false); + } self.validity = Some(validity); } + /// # Safety + /// - caller must allocate enough capacity + /// - caller must ensure the view and buffers match. + #[inline] + pub unsafe fn push_view(&mut self, v: u128, buffers: &[(*const u8, usize)]) { + let len = v as u32; + self.total_bytes_len += len as usize; + if len <= 12 { + debug_assert!(self.views.capacity() > self.views.len()); + self.views.push_unchecked(v) + } else { + self.total_buffer_len += len as usize; + let buffer_idx = (v >> 64) as u32; + let offset = (v >> 96) as u32; + let (data_ptr, data_len) = *buffers.get_unchecked_release(buffer_idx as usize); + let data = std::slice::from_raw_parts(data_ptr, data_len); + let offset = offset as usize; + let bytes = data.get_unchecked(offset..offset + len as usize); + let t = T::from_bytes_unchecked(bytes); + self.push_value_ignore_validity(t) + } + } + pub fn push_value_ignore_validity>(&mut self, value: V) { let value = value.as_ref(); let bytes = value.to_bytes(); @@ -140,10 +196,42 @@ impl MutableBinaryViewArray { self.views.push(0); match &mut self.validity { Some(validity) => validity.push(false), - None => self.init_validity(), + None => self.init_validity(true), } } + pub fn extend_null(&mut self, additional: usize) { + if self.validity.is_none() && additional > 0 { + self.init_validity(false); + } + self.views.extend(std::iter::repeat(0).take(additional)); + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, false); + } + } + + pub fn extend_constant>(&mut self, additional: usize, value: Option) { + if value.is_none() && self.validity.is_none() { + self.init_validity(false); + } + + if let Some(validity) = &mut self.validity { + validity.extend_constant(additional, value.is_some()) + } + + // Push and pop to get the properly encoded value. + // For long string this leads to a dictionary encoding, + // as we push the string only once in the buffers + let view_value = value + .map(|v| { + self.push_value_ignore_validity(v); + self.views.pop().unwrap() + }) + .unwrap_or(0); + self.views + .extend(std::iter::repeat(view_value).take(additional)); + } + impl_mutable_array_mut_validity!(); #[inline] @@ -158,6 +246,15 @@ impl MutableBinaryViewArray { } } + #[inline] + pub fn extend_trusted_len_values(&mut self, iterator: I) + where + I: TrustedLen, + P: AsRef, + { + self.extend_values(iterator) + } + #[inline] pub fn extend(&mut self, iterator: I) where @@ -170,6 +267,16 @@ impl MutableBinaryViewArray { } } + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + I: TrustedLen>, + P: AsRef, + { + self.extend(iterator) + } + + #[inline] pub fn from_iterator(iterator: I) -> Self where I: Iterator>, @@ -200,11 +307,16 @@ impl MutableBinaryViewArray { .push(std::mem::take(&mut self.in_progress_buffer).into()); } } + + #[inline] + pub fn freeze(self) -> BinaryViewArrayGeneric { + self.into() + } } impl MutableBinaryViewArray<[u8]> { pub fn validate_utf8(&mut self) -> PolarsResult<()> { - validate_utf8_only_view(&self.views, &self.completed_buffers) + validate_utf8_only(&self.views, &self.completed_buffers) } } @@ -221,3 +333,43 @@ impl> FromIterator> for MutableBinar Self::from_iterator(iter.into_iter()) } } + +impl MutableArray for MutableBinaryViewArray { + fn data_type(&self) -> &ArrowDataType { + T::dtype() + } + + fn len(&self) -> usize { + MutableBinaryViewArray::len(self) + } + + fn validity(&self) -> Option<&MutableBitmap> { + self.validity.as_ref() + } + + fn as_box(&mut self) -> Box { + let mutable = std::mem::take(self); + let arr: BinaryViewArrayGeneric = mutable.into(); + arr.boxed() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn push_null(&mut self) { + MutableBinaryViewArray::push_null(self) + } + + fn reserve(&mut self, additional: usize) { + MutableBinaryViewArray::reserve(self, additional) + } + + fn shrink_to_fit(&mut self) { + self.views.shrink_to_fit() + } +} diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index 55df59a3ac8a..4152ec01fa35 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -2,6 +2,7 @@ use polars_error::*; use crate::buffer::Buffer; +#[derive(Debug)] pub struct View { /// The length of the string/bytes. pub length: u32, @@ -18,7 +19,7 @@ impl From for View { fn from(value: u128) -> Self { Self { length: value as u32, - prefix: (value >> 64) as u32, + prefix: (value >> 32) as u32, buffer_idx: (value >> 64) as u32, offset: (value >> 96) as u32, } @@ -84,7 +85,7 @@ pub(super) fn validate_utf8_view(views: &[u128], buffers: &[Buffer]) -> Pola validate_view(views, buffers, validate_utf8) } -pub(super) fn validate_utf8_only_view(views: &[u128], buffers: &[Buffer]) -> PolarsResult<()> { +pub(super) fn validate_utf8_only(views: &[u128], buffers: &[Buffer]) -> PolarsResult<()> { for view in views { let len = *view as u32; if len <= 12 { diff --git a/crates/polars-arrow/src/array/growable/binview.rs b/crates/polars-arrow/src/array/growable/binview.rs index d13474d99cd0..a4f4b1099ed8 100644 --- a/crates/polars-arrow/src/array/growable/binview.rs +++ b/crates/polars-arrow/src/array/growable/binview.rs @@ -15,6 +15,7 @@ pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { validity: Option, views: Vec, buffers: Vec>, + buffers_offsets: Vec, total_bytes_len: usize, total_buffer_len: usize, } @@ -36,9 +37,24 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { use_validity = true; }; - let n_buffers = arrays + let mut cum_sum = 0; + let cum_offset = arrays .iter() - .map(|binview| binview.data_buffers().len()) + .map(|binview| { + let out = cum_sum; + cum_sum += binview.data_buffers().len() as u32; + out + }) + .collect::>(); + + let buffers = arrays + .iter() + .flat_map(|array| array.data_buffers().as_ref()) + .cloned() + .collect::>(); + let total_buffer_len = arrays + .iter() + .map(|arr| arr.data_buffers().len()) .sum::(); Self { @@ -46,9 +62,10 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { data_type, validity: prepare_validity(use_validity, capacity), views: Vec::with_capacity(capacity), - buffers: Vec::with_capacity(n_buffers), + buffers, + buffers_offsets: cum_offset, total_bytes_len: 0, - total_buffer_len: 0, + total_buffer_len, } } @@ -65,33 +82,39 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { self.total_bytes_len, self.total_buffer_len, ) + .maybe_gc() } } -} -impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { - fn extend(&mut self, index: usize, start: usize, len: usize) { - let array = self.arrays[index]; - extend_validity(&mut self.validity, array, start, len); + /// # Safety + /// doesn't check bounds + pub unsafe fn extend_unchecked(&mut self, index: usize, start: usize, len: usize) { + let array = *self.arrays.get_unchecked(index); - let buffer_offset: u32 = self.buffers.len().try_into().expect("unsupported"); - let buffer_offset = (buffer_offset as u128) << 64; + extend_validity(&mut self.validity, array, start, len); let range = start..start + len; - let buffers_range = &array.data_buffers()[range.clone()]; - self.buffers.extend_from_slice(buffers_range); - for b in buffers_range { - self.total_buffer_len += b.len(); - } - - self.views.extend(array.views()[range].iter().map(|&view| { - self.total_bytes_len += (view as u32) as usize; + self.views + .extend(array.views().get_unchecked(range).iter().map(|&view| { + let len = (view as u32) as usize; + self.total_bytes_len += len; + + if len > 12 { + let buffer_offset = *self.buffers_offsets.get_unchecked(index); + let mask = (u32::MAX as u128) << 64; + (view & !mask) | ((buffer_offset as u128) << 64) + } else { + view + } + })); + } +} - // If null the buffer index is ignored because the length is 0, - // so we can just do this - view + buffer_offset - })); +impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { + fn extend(&mut self, index: usize, start: usize, len: usize) { + assert!(index < self.arrays.len()); + unsafe { self.extend_unchecked(index, start, len) } } fn extend_validity(&mut self, additional: usize) { @@ -126,6 +149,7 @@ impl<'a, T: ViewType + ?Sized> From> for BinaryVi val.total_bytes_len, val.total_buffer_len, ) + .maybe_gc() } } } diff --git a/crates/polars-arrow/src/array/growable/mod.rs b/crates/polars-arrow/src/array/growable/mod.rs index 88ecc49606a8..4799d0384ed5 100644 --- a/crates/polars-arrow/src/array/growable/mod.rs +++ b/crates/polars-arrow/src/array/growable/mod.rs @@ -32,6 +32,7 @@ mod dictionary; pub use dictionary::GrowableDictionary; mod binview; +pub use binview::GrowableBinaryViewArray; mod utils; /// Describes a struct that can be extended from slices of other pre-existing [`Array`]s. diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index a8a99893007f..d80ecb69b171 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -698,7 +698,8 @@ mod values; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use binview::{ - BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, Utf8ViewArray, ViewType, + BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, MutablePlBinary, + MutablePlString, Utf8ViewArray, ViewType, }; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; diff --git a/crates/polars-arrow/src/array/primitive/mod.rs b/crates/polars-arrow/src/array/primitive/mod.rs index 80b36c2ccceb..c85cbc8420b3 100644 --- a/crates/polars-arrow/src/array/primitive/mod.rs +++ b/crates/polars-arrow/src/array/primitive/mod.rs @@ -93,6 +93,20 @@ impl PrimitiveArray { }) } + /// # Safety + /// Doesn't check invariants + pub unsafe fn new_unchecked( + data_type: ArrowDataType, + values: Buffer, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } + /// Returns a new [`PrimitiveArray`] with a different logical type. /// /// This function is useful to assign a different [`ArrowDataType`] to the array. diff --git a/crates/polars-arrow/src/array/primitive/mutable.rs b/crates/polars-arrow/src/array/primitive/mutable.rs index 986dc5d00060..9bf303800a9d 100644 --- a/crates/polars-arrow/src/array/primitive/mutable.rs +++ b/crates/polars-arrow/src/array/primitive/mutable.rs @@ -283,6 +283,10 @@ impl MutablePrimitiveArray { pub fn capacity(&self) -> usize { self.values.capacity() } + + pub fn freeze(self) -> PrimitiveArray { + self.into() + } } /// Accessors diff --git a/crates/polars-arrow/src/array/static_array.rs b/crates/polars-arrow/src/array/static_array.rs index bf1e81053e15..4ae0f44212ba 100644 --- a/crates/polars-arrow/src/array/static_array.rs +++ b/crates/polars-arrow/src/array/static_array.rs @@ -1,9 +1,11 @@ use bytemuck::Zeroable; +use crate::array::binview::BinaryViewValueIter; use crate::array::static_array_collect::ArrayFromIterDtype; use crate::array::{ - Array, ArrayValuesIter, BinaryArray, BinaryValueIter, BooleanArray, FixedSizeListArray, - ListArray, ListValuesIter, PrimitiveArray, Utf8Array, Utf8ValuesIter, + Array, ArrayValuesIter, BinaryArray, BinaryValueIter, BinaryViewArray, BooleanArray, + FixedSizeListArray, ListArray, ListValuesIter, PrimitiveArray, Utf8Array, Utf8ValuesIter, + Utf8ViewArray, }; use crate::bitmap::utils::{BitmapIter, ZipValidity}; use crate::bitmap::Bitmap; @@ -239,6 +241,70 @@ impl ParameterFreeDtypeStaticArray for BinaryArray { } } +impl StaticArray for BinaryViewArray { + type ValueT<'a> = &'a [u8]; + type ZeroableValueT<'a> = Option<&'a [u8]>; + type ValueIterT<'a> = BinaryViewValueIter<'a, [u8]>; + + unsafe fn value_unchecked(&self, idx: usize) -> Self::ValueT<'_> { + self.value_unchecked(idx) + } + + fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { + self.iter() + } + + fn values_iter(&self) -> Self::ValueIterT<'_> { + self.values_iter() + } + + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } + + fn full_null(length: usize, dtype: ArrowDataType) -> Self { + Self::new_null(dtype, length) + } +} + +impl ParameterFreeDtypeStaticArray for BinaryViewArray { + fn get_dtype() -> ArrowDataType { + ArrowDataType::BinaryView + } +} + +impl StaticArray for Utf8ViewArray { + type ValueT<'a> = &'a str; + type ZeroableValueT<'a> = Option<&'a str>; + type ValueIterT<'a> = BinaryViewValueIter<'a, str>; + + unsafe fn value_unchecked(&self, idx: usize) -> Self::ValueT<'_> { + self.value_unchecked(idx) + } + + fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { + self.iter() + } + + fn values_iter(&self) -> Self::ValueIterT<'_> { + self.values_iter() + } + + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } + + fn full_null(length: usize, dtype: ArrowDataType) -> Self { + Self::new_null(dtype, length) + } +} + +impl ParameterFreeDtypeStaticArray for Utf8ViewArray { + fn get_dtype() -> ArrowDataType { + ArrowDataType::Utf8View + } +} + impl StaticArray for ListArray { type ValueT<'a> = Box; type ZeroableValueT<'a> = Option>; diff --git a/crates/polars-arrow/src/array/static_array_collect.rs b/crates/polars-arrow/src/array/static_array_collect.rs index b48f44f48d72..27f86fec1f5b 100644 --- a/crates/polars-arrow/src/array/static_array_collect.rs +++ b/crates/polars-arrow/src/array/static_array_collect.rs @@ -3,15 +3,17 @@ use std::sync::Arc; use crate::array::static_array::{ParameterFreeDtypeStaticArray, StaticArray}; use crate::array::{ - Array, BinaryArray, BooleanArray, FixedSizeListArray, ListArray, MutableBinaryArray, - MutableBinaryValuesArray, PrimitiveArray, Utf8Array, + Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeListArray, ListArray, + MutableBinaryArray, MutableBinaryValuesArray, MutableBinaryViewArray, PrimitiveArray, + Utf8Array, Utf8ViewArray, }; use crate::bitmap::Bitmap; use crate::datatypes::ArrowDataType; #[cfg(feature = "dtype-array")] use crate::legacy::prelude::fixed_size_list::AnonymousBuilder as AnonymousFixedSizeListArrayBuilder; use crate::legacy::prelude::list::AnonymousBuilder as AnonymousListArrayBuilder; -use crate::legacy::trusted_len::{TrustedLen, TrustedLenPush}; +use crate::legacy::trusted_len::TrustedLenPush; +use crate::trusted_len::TrustedLen; use crate::types::NativeType; pub trait ArrayFromIterDtype: Sized { @@ -439,10 +441,12 @@ impl ArrayFromIter for BinaryArray { } impl ArrayFromIter> for BinaryArray { + #[inline] fn arr_from_iter>>(iter: I) -> Self { BinaryArray::from_iter(iter.into_iter().map(|s| Some(s?.into_bytes()))) } + #[inline] fn arr_from_iter_trusted(iter: I) -> Self where I: IntoIterator>, @@ -484,6 +488,70 @@ impl ArrayFromIter> for BinaryArray { } } +impl ArrayFromIter for BinaryViewArray { + #[inline] + fn arr_from_iter>(iter: I) -> Self { + MutableBinaryViewArray::from_values_iter(iter.into_iter().map(|a| a.into_bytes())).into() + } + + #[inline] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + Self::arr_from_iter(iter) + } + + fn try_arr_from_iter>>(iter: I) -> Result { + let mut iter = iter.into_iter(); + let mut arr = MutableBinaryViewArray::with_capacity(iter.size_hint().0); + iter.try_for_each(|x| -> Result<(), E> { + arr.push_value_ignore_validity(x?.into_bytes()); + Ok(()) + })?; + Ok(arr.into()) + } + + // No faster implementation than this available, fall back to default. + // fn try_arr_from_iter_trusted(iter: I) -> Result +} + +impl ArrayFromIter> for BinaryViewArray { + #[inline] + fn arr_from_iter>>(iter: I) -> Self { + MutableBinaryViewArray::from_iter( + iter.into_iter().map(|opt_a| opt_a.map(|a| a.into_bytes())), + ) + .into() + } + + #[inline] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + Self::arr_from_iter(iter) + } + + fn try_arr_from_iter, E>>>( + iter: I, + ) -> Result { + let mut iter = iter.into_iter(); + let mut arr = MutableBinaryViewArray::with_capacity(iter.size_hint().0); + iter.try_for_each(|x| -> Result<(), E> { + let x = x?; + arr.push(x.map(|x| x.into_bytes())); + Ok(()) + })?; + Ok(arr.into()) + } + + // No faster implementation than this available, fall back to default. + // fn try_arr_from_iter_trusted(iter: I) -> Result +} + /// We use this to re-use the binary collect implementation for strings. /// # Safety /// The array must be valid UTF-8. @@ -499,6 +567,54 @@ impl StrIntoBytes for String {} impl<'a> StrIntoBytes for &'a str {} impl<'a> StrIntoBytes for Cow<'a, str> {} +impl ArrayFromIter for Utf8ViewArray { + #[inline] + fn arr_from_iter>(iter: I) -> Self { + unsafe { BinaryViewArray::arr_from_iter(iter).to_utf8view_unchecked() } + } + + #[inline] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + Self::arr_from_iter(iter) + } + + fn try_arr_from_iter>>(iter: I) -> Result { + unsafe { BinaryViewArray::try_arr_from_iter(iter).map(|arr| arr.to_utf8view_unchecked()) } + } + + // No faster implementation than this available, fall back to default. + // fn try_arr_from_iter_trusted(iter: I) -> Result +} + +impl ArrayFromIter> for Utf8ViewArray { + #[inline] + fn arr_from_iter>>(iter: I) -> Self { + unsafe { BinaryViewArray::arr_from_iter(iter).to_utf8view_unchecked() } + } + + #[inline] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + Self::arr_from_iter(iter) + } + + fn try_arr_from_iter, E>>>( + iter: I, + ) -> Result { + unsafe { BinaryViewArray::try_arr_from_iter(iter).map(|arr| arr.to_utf8view_unchecked()) } + } + + // No faster implementation than this available, fall back to default. + // fn try_arr_from_iter_trusted(iter: I) -> Result +} + impl ArrayFromIter for Utf8Array { #[inline(always)] fn arr_from_iter>(iter: I) -> Self { diff --git a/crates/polars-arrow/src/array/values.rs b/crates/polars-arrow/src/array/values.rs index 78fd14927187..9864e4f4c129 100644 --- a/crates/polars-arrow/src/array/values.rs +++ b/crates/polars-arrow/src/array/values.rs @@ -1,4 +1,6 @@ -use crate::array::{ArrayRef, BinaryArray, FixedSizeListArray, ListArray, Utf8Array}; +use crate::array::{ + ArrayRef, BinaryArray, BinaryViewArray, FixedSizeListArray, ListArray, Utf8Array, Utf8ViewArray, +}; use crate::datatypes::ArrowDataType; use crate::offset::Offset; @@ -73,6 +75,16 @@ impl ValueSize for ArrayRef { .downcast_ref::>() .unwrap() .get_values_size(), + ArrowDataType::Utf8View => self + .as_any() + .downcast_ref::() + .unwrap() + .total_bytes_len(), + ArrowDataType::BinaryView => self + .as_any() + .downcast_ref::() + .unwrap() + .total_bytes_len(), _ => unimplemented!(), } } diff --git a/crates/polars-arrow/src/compute/arithmetics/basic/mod.rs b/crates/polars-arrow/src/compute/arithmetics/basic/mod.rs index faa55af6bbd9..0b384f1767f7 100644 --- a/crates/polars-arrow/src/compute/arithmetics/basic/mod.rs +++ b/crates/polars-arrow/src/compute/arithmetics/basic/mod.rs @@ -36,6 +36,7 @@ impl NativeArithmetics for i8 {} impl NativeArithmetics for i16 {} impl NativeArithmetics for i32 {} impl NativeArithmetics for i64 {} +impl NativeArithmetics for i128 {} impl NativeArithmetics for f32 {} impl NativeArithmetics for f64 {} diff --git a/crates/polars-arrow/src/compute/cast/binary_to.rs b/crates/polars-arrow/src/compute/cast/binary_to.rs index 548912e7a1e7..e75be2d54d49 100644 --- a/crates/polars-arrow/src/compute/cast/binary_to.rs +++ b/crates/polars-arrow/src/compute/cast/binary_to.rs @@ -177,6 +177,11 @@ pub fn fixed_size_binary_binary( ) } +pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray { + let mutable = MutableBinaryViewArray::from_values_iter(from.values_iter()); + mutable.freeze().with_validity(from.validity().cloned()) +} + /// Conversion of binary pub fn binary_to_list( from: &BinaryArray, diff --git a/crates/polars-arrow/src/compute/cast/binview_to.rs b/crates/polars-arrow/src/compute/cast/binview_to.rs index cf1759669f5e..f3c0a7de2b7c 100644 --- a/crates/polars-arrow/src/compute/cast/binview_to.rs +++ b/crates/polars-arrow/src/compute/cast/binview_to.rs @@ -1,9 +1,21 @@ +use chrono::Datelike; +use polars_error::PolarsResult; + use crate::array::*; +use crate::compute::cast::binary_to::Parse; +use crate::compute::cast::CastOptions; +use crate::datatypes::{ArrowDataType, TimeUnit}; +#[cfg(feature = "dtype-decimal")] +use crate::legacy::compute::decimal::deserialize_decimal; use crate::offset::Offset; +use crate::temporal_conversions::EPOCH_DAYS_FROM_CE; +use crate::types::NativeType; + +pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray { let len: usize = Array::len(array); - let mut mutable = MutableBinaryValuesArray::::with_capacities(len, len * 12); + let mut mutable = MutableBinaryValuesArray::::with_capacities(len, array.total_bytes_len()); for slice in array.values_iter() { mutable.push(slice) } @@ -11,7 +23,7 @@ pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray< out.with_validity(array.validity().cloned()) } -pub(super) fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { +pub fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { let array = array.to_binview(); let out = view_to_binary::(&array); @@ -25,3 +37,76 @@ pub(super) fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array ) } } +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. +pub(super) fn binview_to_primitive( + from: &BinaryViewArray, + to: &ArrowDataType, +) -> PrimitiveArray +where + T: NativeType + Parse, +{ + let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); + + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +pub(super) fn binview_to_primitive_dyn( + from: &dyn Array, + to: &ArrowDataType, + options: CastOptions, +) -> PolarsResult> +where + T: NativeType + Parse, +{ + let from = from.as_any().downcast_ref().unwrap(); + if options.partial { + unimplemented!() + } else { + Ok(Box::new(binview_to_primitive::(from, to))) + } +} + +#[cfg(feature = "dtype-decimal")] +pub fn binview_to_decimal( + array: &BinaryViewArray, + precision: Option, + scale: usize, +) -> PrimitiveArray { + let precision = precision.map(|p| p as u8); + array + .iter() + .map(|val| val.and_then(|val| deserialize_decimal(val, precision, scale as u8))) + .collect() +} + +pub(super) fn utf8view_to_naive_timestamp_dyn( + from: &dyn Array, + time_unit: TimeUnit, +) -> PolarsResult> { + let from = from.as_any().downcast_ref().unwrap(); + Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit))) +} + +/// [`crate::temporal_conversions::utf8view_to_timestamp`] applied for RFC3339 formatting +pub fn utf8view_to_naive_timestamp( + from: &Utf8ViewArray, + time_unit: TimeUnit, +) -> PrimitiveArray { + crate::temporal_conversions::utf8view_to_naive_timestamp(from, RFC3339, time_unit) +} + +pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray { + let iter = from.iter().map(|x| { + x.and_then(|x| { + x.parse::() + .ok() + .map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + }) + }); + PrimitiveArray::::from_trusted_len_iter(iter).to(ArrowDataType::Date32) +} + +pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult> { + let from = from.as_any().downcast_ref().unwrap(); + Ok(Box::new(utf8view_to_date32(from))) +} diff --git a/crates/polars-arrow/src/compute/cast/mod.rs b/crates/polars-arrow/src/compute/cast/mod.rs index 74eb33808f8a..70989f287918 100644 --- a/crates/polars-arrow/src/compute/cast/mod.rs +++ b/crates/polars-arrow/src/compute/cast/mod.rs @@ -9,6 +9,10 @@ mod primitive_to; mod utf8_to; pub use binary_to::*; +#[cfg(feature = "dtype-decimal")] +pub use binview_to::binview_to_decimal; +use binview_to::binview_to_primitive_dyn; +pub use binview_to::utf8view_to_utf8; pub use boolean_to::*; pub use decimal_to::*; pub use dictionary_to::*; @@ -17,9 +21,14 @@ pub use primitive_to::*; pub use utf8_to::*; use crate::array::*; +use crate::compute::cast::binview_to::{ + utf8view_to_date32_dyn, utf8view_to_naive_timestamp_dyn, view_to_binary, +}; use crate::datatypes::*; +use crate::legacy::index::IdxSize; use crate::match_integer_type; use crate::offset::{Offset, Offsets}; +use crate::temporal_conversions::utf8view_to_timestamp; /// options defining how Cast kernels behave #[derive(Clone, Copy, Debug, Default)] @@ -33,6 +42,15 @@ pub struct CastOptions { pub partial: bool, } +impl CastOptions { + pub fn unchecked() -> Self { + Self { + wrapped: true, + partial: false, + } + } +} + impl CastOptions { fn with_wrapped(&self, v: bool) -> Self { let mut option = *self; @@ -172,7 +190,7 @@ fn cast_list_to_fixed_size_list( // Build take indices for the values. This is used to fill in the null slots. let mut indices = - MutablePrimitiveArray::::with_capacity(list.values().len() + null_cnt * size); + MutablePrimitiveArray::::with_capacity(list.values().len() + null_cnt * size); for i in 0..list.len() { if list.is_null(i) { indices.extend_constant(size, None) @@ -180,11 +198,15 @@ fn cast_list_to_fixed_size_list( // SAFETY: we know the index is in bound. let current_offset = unsafe { *offsets.get_unchecked(i) }; for j in 0..size { - indices.push(Some(current_offset + O::from_as_usize(j))); + indices.push(Some( + (current_offset + O::from_as_usize(j)).to_usize() as IdxSize + )); } } } - let take_values = crate::compute::take::take(list.values().as_ref(), &indices.into())?; + let take_values = unsafe { + crate::legacy::compute::take::take_unchecked(list.values().as_ref(), &indices.freeze()) + }; cast(take_values.as_ref(), inner.data_type(), options)? }; @@ -196,6 +218,14 @@ fn cast_list_to_fixed_size_list( .map_err(|_| polars_err!(ComputeError: "not all elements have the specified width {size}")) } +pub fn cast_default(array: &dyn Array, to_type: &ArrowDataType) -> PolarsResult> { + cast(array, to_type, Default::default()) +} + +pub fn cast_unchecked(array: &dyn Array, to_type: &ArrowDataType) -> PolarsResult> { + cast(array, to_type, CastOptions::unchecked()) +} + /// Cast `array` to the provided data type and return a new [`Array`] with /// type `to_type`, if possible. /// @@ -238,13 +268,14 @@ pub fn cast( (Struct(_), _) | (_, Struct(_)) => polars_bail!(InvalidOperation: "Cannot cast from struct to other types" ), - (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( - array.as_any().downcast_ref().unwrap(), - inner.as_ref(), - *size, - options, - ) - .map(|x| x.boxed()), + // not supported by polars + // (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( + // array.as_any().downcast_ref().unwrap(), + // inner.as_ref(), + // *size, + // options, + // ) + // .map(|x| x.boxed()), (LargeList(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( array.as_any().downcast_ref().unwrap(), inner.as_ref(), @@ -264,9 +295,39 @@ pub fn cast( options, ) .map(|x| x.boxed()), - (List(_), List(_)) => { - cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) - .map(|x| x.boxed()) + // not supported by polars + // (List(_), List(_)) => { + // cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) + // .map(|x| x.boxed()) + // }, + (BinaryView, _) => match to_type { + Utf8View => array + .as_any() + .downcast_ref::() + .unwrap() + .to_utf8view() + .map(|arr| arr.boxed()), + LargeBinary => Ok(binview_to::view_to_binary::( + array.as_any().downcast_ref().unwrap(), + ) + .boxed()), + UInt8 => binview_to_primitive_dyn::(array, to_type, options), + UInt16 => binview_to_primitive_dyn::(array, to_type, options), + UInt32 => binview_to_primitive_dyn::(array, to_type, options), + UInt64 => binview_to_primitive_dyn::(array, to_type, options), + Int8 => binview_to_primitive_dyn::(array, to_type, options), + Int16 => binview_to_primitive_dyn::(array, to_type, options), + Int32 => binview_to_primitive_dyn::(array, to_type, options), + Int64 => binview_to_primitive_dyn::(array, to_type, options), + Float32 => binview_to_primitive_dyn::(array, to_type, options), + Float64 => binview_to_primitive_dyn::(array, to_type, options), + LargeList(inner) if matches!(inner.data_type, ArrowDataType::UInt8) => { + let bin_array = view_to_binary::(array.as_any().downcast_ref().unwrap()); + Ok(binary_to_list(&bin_array, to_type.clone()).boxed()) + }, + _ => polars_bail!(InvalidOperation: + "casting from {from_type:?} to {to_type:?} not supported", + ), }, (LargeList(_), LargeList(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) @@ -310,6 +371,40 @@ pub fn cast( Ok(Box::new(list_array)) }, + (Utf8View, _) => { + let arr = array.as_any().downcast_ref::().unwrap(); + + match to_type { + BinaryView => Ok(arr.to_binview().boxed()), + LargeUtf8 => Ok(binview_to::utf8view_to_utf8::(arr).boxed()), + UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float32 + | Float64 + | Decimal(_, _) => cast(&arr.to_binview(), to_type, options), + Timestamp(time_unit, None) => { + utf8view_to_naive_timestamp_dyn(array, time_unit.to_owned()) + }, + Timestamp(time_unit, Some(time_zone)) => utf8view_to_timestamp( + array.as_any().downcast_ref().unwrap(), + RFC3339, + time_zone.clone(), + time_unit.to_owned(), + ) + .map(|arr| arr.boxed()), + Date32 => utf8view_to_date32_dyn(array), + _ => polars_bail!(InvalidOperation: + "casting from {from_type:?} to {to_type:?} not supported", + ), + } + }, + (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), @@ -348,35 +443,17 @@ pub fn cast( "casting from {from_type:?} to {to_type:?} not supported", ), }, - (Utf8View, _) => match to_type { - BinaryView => Ok(array - .as_any() - .downcast_ref::() - .unwrap() - .to_binview() - .boxed()), - LargeUtf8 => Ok(binview_to::utf8view_to_utf8::( - array.as_any().downcast_ref().unwrap(), + (_, BinaryView) => from_to_binview(array, from_type, to_type).map(|arr| arr.boxed()), + (_, Utf8View) => match from_type { + LargeUtf8 => Ok(utf8_to_utf8view( + array.as_any().downcast_ref::>().unwrap(), ) .boxed()), - _ => polars_bail!(InvalidOperation: - "casting from {from_type:?} to {to_type:?} not supported", - ), - }, - (BinaryView, _) => match to_type { - BinaryView => array - .as_any() - .downcast_ref::() - .unwrap() - .to_utf8view() - .map(|arr| arr.boxed()), - LargeBinary => Ok(binview_to::view_to_binary::( - array.as_any().downcast_ref().unwrap(), - ) - .boxed()), - _ => polars_bail!(InvalidOperation: - "casting from {from_type:?} to {to_type:?} not supported", + Utf8 => Ok( + utf8_to_utf8view(array.as_any().downcast_ref::>().unwrap()).boxed(), ), + _ => from_to_binview(array, from_type, to_type) + .map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed()), }, (Utf8, _) => match to_type { LargeUtf8 => Ok(Box::new(utf8_to_large_utf8( @@ -387,95 +464,27 @@ pub fn cast( ), }, (LargeUtf8, _) => match to_type { - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => { - let binary = utf8_to_binary::( - array.as_any().downcast_ref().unwrap(), - ArrowDataType::LargeBinary, - ); - cast(&binary, to_type, options) - }, - Date32 => utf8_to_date32_dyn::(array), - Date64 => utf8_to_date64_dyn::(array), - Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()).map(|x| x.boxed()), LargeBinary => Ok(utf8_to_binary::( array.as_any().downcast_ref().unwrap(), to_type.clone(), ) .boxed()), - Timestamp(time_unit, None) => { - utf8_to_naive_timestamp_dyn::(array, time_unit.to_owned()) - }, - Timestamp(time_unit, Some(time_zone)) => { - utf8_to_timestamp_dyn::(array, time_zone.clone(), time_unit.to_owned()) - }, _ => polars_bail!(InvalidOperation: "casting from {from_type:?} to {to_type:?} not supported", ), }, - - (_, Utf8) => match from_type { - UInt8 => primitive_to_utf8_dyn::(array), - UInt16 => primitive_to_utf8_dyn::(array), - UInt32 => primitive_to_utf8_dyn::(array), - UInt64 => primitive_to_utf8_dyn::(array), - Int8 => primitive_to_utf8_dyn::(array), - Int16 => primitive_to_utf8_dyn::(array), - Int32 => primitive_to_utf8_dyn::(array), - Int64 => primitive_to_utf8_dyn::(array), - Float32 => primitive_to_utf8_dyn::(array), - Float64 => primitive_to_utf8_dyn::(array), - Timestamp(from_unit, Some(tz)) => { - let from = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(timestamp_to_utf8::(from, *from_unit, tz)?)) - }, - Timestamp(from_unit, None) => { - let from = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(naive_timestamp_to_utf8::(from, *from_unit))) - }, - _ => polars_bail!(InvalidOperation: - "casting from {from_type:?} to {to_type:?} not supported", - ), - }, - (_, LargeUtf8) => match from_type { UInt8 => primitive_to_utf8_dyn::(array), - UInt16 => primitive_to_utf8_dyn::(array), - UInt32 => primitive_to_utf8_dyn::(array), - UInt64 => primitive_to_utf8_dyn::(array), - Int8 => primitive_to_utf8_dyn::(array), - Int16 => primitive_to_utf8_dyn::(array), - Int32 => primitive_to_utf8_dyn::(array), - Int64 => primitive_to_utf8_dyn::(array), - Float32 => primitive_to_utf8_dyn::(array), - Float64 => primitive_to_utf8_dyn::(array), LargeBinary => { binary_to_utf8::(array.as_any().downcast_ref().unwrap(), to_type.clone()) .map(|x| x.boxed()) }, - Timestamp(from_unit, Some(tz)) => { - let from = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(timestamp_to_utf8::(from, *from_unit, tz)?)) - }, - Timestamp(from_unit, None) => { - let from = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(naive_timestamp_to_utf8::(from, *from_unit))) - }, _ => polars_bail!(InvalidOperation: "casting from {from_type:?} to {to_type:?} not supported", ), }, (Binary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type, options), - UInt16 => binary_to_primitive_dyn::(array, to_type, options), - UInt32 => binary_to_primitive_dyn::(array, to_type, options), - UInt64 => binary_to_primitive_dyn::(array, to_type, options), - Int8 => binary_to_primitive_dyn::(array, to_type, options), - Int16 => binary_to_primitive_dyn::(array, to_type, options), - Int32 => binary_to_primitive_dyn::(array, to_type, options), - Int64 => binary_to_primitive_dyn::(array, to_type, options), - Float32 => binary_to_primitive_dyn::(array, to_type, options), - Float64 => binary_to_primitive_dyn::(array, to_type, options), LargeBinary => Ok(Box::new(binary_to_large_binary( array.as_any().downcast_ref().unwrap(), to_type.clone(), @@ -504,10 +513,6 @@ pub fn cast( binary_to_utf8::(array.as_any().downcast_ref().unwrap(), to_type.clone()) .map(|x| x.boxed()) }, - LargeList(inner) if matches!(inner.data_type, ArrowDataType::UInt8) => Ok( - binary_to_list::(array.as_any().downcast_ref().unwrap(), to_type.clone()) - .boxed(), - ), _ => polars_bail!(InvalidOperation: "casting from {from_type:?} to {to_type:?} not supported", ), @@ -527,39 +532,6 @@ pub fn cast( "casting from {from_type:?} to {to_type:?} not supported", ), }, - - (_, Binary) => match from_type { - UInt8 => primitive_to_binary_dyn::(array), - UInt16 => primitive_to_binary_dyn::(array), - UInt32 => primitive_to_binary_dyn::(array), - UInt64 => primitive_to_binary_dyn::(array), - Int8 => primitive_to_binary_dyn::(array), - Int16 => primitive_to_binary_dyn::(array), - Int32 => primitive_to_binary_dyn::(array), - Int64 => primitive_to_binary_dyn::(array), - Float32 => primitive_to_binary_dyn::(array), - Float64 => primitive_to_binary_dyn::(array), - _ => polars_bail!(InvalidOperation: - "casting from {from_type:?} to {to_type:?} not supported", - ), - }, - - (_, LargeBinary) => match from_type { - UInt8 => primitive_to_binary_dyn::(array), - UInt16 => primitive_to_binary_dyn::(array), - UInt32 => primitive_to_binary_dyn::(array), - UInt64 => primitive_to_binary_dyn::(array), - Int8 => primitive_to_binary_dyn::(array), - Int16 => primitive_to_binary_dyn::(array), - Int32 => primitive_to_binary_dyn::(array), - Int64 => primitive_to_binary_dyn::(array), - Float32 => primitive_to_binary_dyn::(array), - Float64 => primitive_to_binary_dyn::(array), - _ => polars_bail!(InvalidOperation: - "casting from {from_type:?} to {to_type:?} not supported", - ), - }, - // start numeric casts (UInt8, UInt16) => primitive_to_primitive_dyn::(array, to_type, as_options), (UInt8, UInt32) => primitive_to_primitive_dyn::(array, to_type, as_options), @@ -742,13 +714,13 @@ pub fn cast( (Int64, Duration(_)) => primitive_to_same_primitive_dyn::(array, to_type), (Duration(_), Int64) => primitive_to_same_primitive_dyn::(array, to_type), - (Interval(IntervalUnit::DayTime), Interval(IntervalUnit::MonthDayNano)) => { - primitive_dyn!(array, days_ms_to_months_days_ns) - }, - (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => { - primitive_dyn!(array, months_to_months_days_ns) - }, - + // Not supported by Polars. + // (Interval(IntervalUnit::DayTime), Interval(IntervalUnit::MonthDayNano)) => { + // primitive_dyn!(array, days_ms_to_months_days_ns) + // }, + // (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => { + // primitive_dyn!(array, months_to_months_days_ns) + // }, _ => polars_bail!(InvalidOperation: "casting from {from_type:?} to {to_type:?} not supported", ), @@ -785,3 +757,30 @@ fn cast_to_dictionary( ), } } + +fn from_to_binview( + array: &dyn Array, + from_type: &ArrowDataType, + to_type: &ArrowDataType, +) -> PolarsResult { + use ArrowDataType::*; + let binview = match from_type { + UInt8 => primitive_to_binview_dyn::(array), + UInt16 => primitive_to_binview_dyn::(array), + UInt32 => primitive_to_binview_dyn::(array), + UInt64 => primitive_to_binview_dyn::(array), + Int8 => primitive_to_binview_dyn::(array), + Int16 => primitive_to_binview_dyn::(array), + Int32 => primitive_to_binview_dyn::(array), + Int64 => primitive_to_binview_dyn::(array), + Float32 => primitive_to_binview_dyn::(array), + Float64 => primitive_to_binview_dyn::(array), + Binary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), + FixedSizeBinary(_) => fixed_size_binary_to_binview(array.as_any().downcast_ref().unwrap()), + LargeBinary => binary_to_binview::(array.as_any().downcast_ref().unwrap()), + _ => polars_bail!(InvalidOperation: + "casting from {from_type:?} to {to_type:?} not supported", + ), + }; + Ok(binview) +} diff --git a/crates/polars-arrow/src/compute/cast/primitive_to.rs b/crates/polars-arrow/src/compute/cast/primitive_to.rs index 3db6cfa142f7..1522729e8f3f 100644 --- a/crates/polars-arrow/src/compute/cast/primitive_to.rs +++ b/crates/polars-arrow/src/compute/cast/primitive_to.rs @@ -92,29 +92,6 @@ fn primitive_to_values_and_offsets( } } -/// Returns a [`BinaryArray`] where every element is the binary representation of the number. -pub(super) fn primitive_to_binary( - from: &PrimitiveArray, -) -> BinaryArray { - let (values, offsets) = primitive_to_values_and_offsets(from); - - BinaryArray::::new( - BinaryArray::::default_data_type(), - offsets.into(), - values.into(), - from.validity().cloned(), - ) -} - -pub(super) fn primitive_to_binary_dyn(from: &dyn Array) -> PolarsResult> -where - O: Offset, - T: NativeType + SerPrimitive, -{ - let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(primitive_to_binary::(from))) -} - /// Returns a [`BooleanArray`] where every element is different from zero. /// Validity is preserved. pub fn primitive_to_boolean( @@ -646,3 +623,27 @@ pub fn months_to_months_days_ns(from: &PrimitiveArray) -> PrimitiveArray) -> PrimitiveArray { unary(from, |x| x.to_f32(), ArrowDataType::Float32) } + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. +pub(super) fn primitive_to_binview( + from: &PrimitiveArray, +) -> BinaryViewArray { + let mut mutable = MutableBinaryViewArray::with_capacity(from.len()); + + let mut scratch = vec![]; + for &x in from.values().iter() { + unsafe { scratch.set_len(0) }; + T::write(&mut scratch, x); + mutable.push_value_ignore_validity(&scratch) + } + + mutable.freeze().with_validity(from.validity().cloned()) +} + +pub(super) fn primitive_to_binview_dyn(from: &dyn Array) -> BinaryViewArray +where + T: NativeType + SerPrimitive, +{ + let from = from.as_any().downcast_ref().unwrap(); + primitive_to_binview::(from) +} diff --git a/crates/polars-arrow/src/compute/cast/utf8_to.rs b/crates/polars-arrow/src/compute/cast/utf8_to.rs index 79e970e82280..f3c8d980c2ce 100644 --- a/crates/polars-arrow/src/compute/cast/utf8_to.rs +++ b/crates/polars-arrow/src/compute/cast/utf8_to.rs @@ -1,49 +1,14 @@ -use chrono::Datelike; +use std::sync::Arc; + use polars_error::PolarsResult; +use polars_utils::slice::GetSaferUnchecked; +use polars_utils::vec::PushUnchecked; use crate::array::*; -use crate::datatypes::{ArrowDataType, TimeUnit}; +use crate::datatypes::ArrowDataType; use crate::offset::Offset; -use crate::temporal_conversions::{ - utf8_to_naive_timestamp as utf8_to_naive_timestamp_, utf8_to_timestamp as utf8_to_timestamp_, - EPOCH_DAYS_FROM_CE, -}; - -const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; - -/// Casts a [`Utf8Array`] to a Date32 primitive, making any uncastable value a Null. -pub fn utf8_to_date32(from: &Utf8Array) -> PrimitiveArray { - let iter = from.iter().map(|x| { - x.and_then(|x| { - x.parse::() - .ok() - .map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - }) - }); - PrimitiveArray::::from_trusted_len_iter(iter).to(ArrowDataType::Date32) -} - -pub(super) fn utf8_to_date32_dyn(from: &dyn Array) -> PolarsResult> { - let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_date32::(from))) -} -/// Casts a [`Utf8Array`] to a Date64 primitive, making any uncastable value a Null. -pub fn utf8_to_date64(from: &Utf8Array) -> PrimitiveArray { - let iter = from.iter().map(|x| { - x.and_then(|x| { - x.parse::() - .ok() - .map(|x| (x.num_days_from_ce() - EPOCH_DAYS_FROM_CE) as i64 * 86400000) - }) - }); - PrimitiveArray::from_trusted_len_iter(iter).to(ArrowDataType::Date64) -} - -pub(super) fn utf8_to_date64_dyn(from: &dyn Array) -> PolarsResult> { - let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_date64::(from))) -} +pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; pub(super) fn utf8_to_dictionary_dyn( from: &dyn Array, @@ -65,42 +30,6 @@ pub fn utf8_to_dictionary( Ok(array.into()) } -pub(super) fn utf8_to_naive_timestamp_dyn( - from: &dyn Array, - time_unit: TimeUnit, -) -> PolarsResult> { - let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_naive_timestamp::(from, time_unit))) -} - -/// [`crate::temporal_conversions::utf8_to_timestamp`] applied for RFC3339 formatting -pub fn utf8_to_naive_timestamp( - from: &Utf8Array, - time_unit: TimeUnit, -) -> PrimitiveArray { - utf8_to_naive_timestamp_(from, RFC3339, time_unit) -} - -pub(super) fn utf8_to_timestamp_dyn( - from: &dyn Array, - timezone: String, - time_unit: TimeUnit, -) -> PolarsResult> { - let from = from.as_any().downcast_ref().unwrap(); - utf8_to_timestamp::(from, timezone, time_unit) - .map(Box::new) - .map(|x| x as Box) -} - -/// [`crate::temporal_conversions::utf8_to_timestamp`] applied for RFC3339 formatting -pub fn utf8_to_timestamp( - from: &Utf8Array, - timezone: String, - time_unit: TimeUnit, -) -> PolarsResult> { - utf8_to_timestamp_(from, RFC3339, timezone, time_unit) -} - /// Conversion of utf8 pub fn utf8_to_large_utf8(from: &Utf8Array) -> Utf8Array { let data_type = Utf8Array::::default_data_type(); @@ -138,3 +67,48 @@ pub fn utf8_to_binary( ) } } + +pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { + let buffer_idx = 0_u32; + let base_ptr = arr.values().as_ptr() as usize; + + let mut views = Vec::with_capacity(arr.len()); + let mut uses_buffer = false; + for bytes in arr.values_iter() { + let len: u32 = bytes.len().try_into().unwrap(); + + let mut payload = [0; 16]; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + + if len <= 12 { + payload[4..4 + bytes.len()].copy_from_slice(bytes); + } else { + uses_buffer = true; + unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked_release(0..4)) }; + let offset = (bytes.as_ptr() as usize - base_ptr) as u32; + payload[0..4].copy_from_slice(&len.to_le_bytes()); + payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); + payload[12..16].copy_from_slice(&offset.to_le_bytes()); + } + + let value = u128::from_le_bytes(payload); + unsafe { views.push_unchecked(value) }; + } + let buffers = if uses_buffer { + Arc::from([arr.values().clone()]) + } else { + Arc::from([]) + }; + unsafe { + BinaryViewArray::new_unchecked_unknown_md( + ArrowDataType::BinaryView, + views.into(), + buffers, + arr.validity().cloned(), + ) + } +} + +pub fn utf8_to_utf8view(arr: &Utf8Array) -> Utf8ViewArray { + unsafe { binary_to_binview(&arr.to_binary()).to_utf8view_unchecked() } +} diff --git a/crates/polars-arrow/src/compute/filter.rs b/crates/polars-arrow/src/compute/filter.rs index 647a5a74cec2..abe9fc58bbd9 100644 --- a/crates/polars-arrow/src/compute/filter.rs +++ b/crates/polars-arrow/src/compute/filter.rs @@ -272,6 +272,22 @@ pub fn filter(array: &dyn Array, filter: &BooleanArray) -> PolarsResult(array, filter))) }), + BinaryView => { + let iter = SlicesIterator::new(filter.values()); + let mut mutable = growable::GrowableBinaryViewArray::new( + vec![array.as_any().downcast_ref::().unwrap()], + false, + iter.slots(), + ); + unsafe { + iter.for_each(|(start, len)| mutable.extend_unchecked(0, start, len)); + } + Ok(mutable.as_box()) + }, + // Should go via BinaryView + Utf8View => { + unreachable!() + }, _ => { let iter = SlicesIterator::new(filter.values()); let mut mutable = make_growable(&[array], false, iter.slots()); diff --git a/crates/polars-arrow/src/io/ipc/write/common.rs b/crates/polars-arrow/src/io/ipc/write/common.rs index d4d0fd4a259c..f4b8a1c015e7 100644 --- a/crates/polars-arrow/src/io/ipc/write/common.rs +++ b/crates/polars-arrow/src/io/ipc/write/common.rs @@ -10,6 +10,7 @@ use crate::chunk::Chunk; use crate::datatypes::*; use crate::io::ipc::endianness::is_native_little_endian; use crate::io::ipc::read::Dictionaries; +use crate::legacy::prelude::LargeListArray; use crate::match_integer_type; /// Compression codec @@ -229,6 +230,34 @@ fn serialize_compression( } } +fn set_variadic_buffer_counts(counts: &mut Vec, array: &dyn Array) { + match array.data_type() { + ArrowDataType::Utf8View => { + let array = array.as_any().downcast_ref::().unwrap(); + counts.push(array.data_buffers().len() as i64); + }, + ArrowDataType::BinaryView => { + let array = array.as_any().downcast_ref::().unwrap(); + counts.push(array.data_buffers().len() as i64); + }, + ArrowDataType::Struct(_) => { + let array = array.as_any().downcast_ref::().unwrap(); + for array in array.values() { + set_variadic_buffer_counts(counts, array.as_ref()) + } + }, + ArrowDataType::LargeList(_) => { + let array = array.as_any().downcast_ref::().unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, + ArrowDataType::FixedSizeList(_, _) => { + let array = array.as_any().downcast_ref::().unwrap(); + set_variadic_buffer_counts(counts, array.values().as_ref()) + }, + _ => (), + } +} + /// Write [`Chunk`] into two sets of bytes, one for the header (ipc::Schema::Message) and the /// other for the batch's data fn chunk_to_bytes_amortized( @@ -244,20 +273,7 @@ fn chunk_to_bytes_amortized( let mut offset = 0; let mut variadic_buffer_counts = vec![]; for array in chunk.arrays() { - let dtype = array.data_type(); - if dtype.is_view() { - match dtype { - ArrowDataType::Utf8View => { - let array = array.as_any().downcast_ref::().unwrap(); - variadic_buffer_counts.push(array.data_buffers().len() as i64); - }, - ArrowDataType::BinaryView => { - let array = array.as_any().downcast_ref::().unwrap(); - variadic_buffer_counts.push(array.data_buffers().len() as i64); - }, - _ => {}, - } - } + set_variadic_buffer_counts(&mut variadic_buffer_counts, array.as_ref()); write( array.as_ref(), diff --git a/crates/polars-arrow/src/legacy/array/mod.rs b/crates/polars-arrow/src/legacy/array/mod.rs index a33ab61f215e..1e6d59bb430d 100644 --- a/crates/polars-arrow/src/legacy/array/mod.rs +++ b/crates/polars-arrow/src/legacy/array/mod.rs @@ -1,6 +1,6 @@ use crate::array::{ - new_null_array, Array, BinaryArray, BooleanArray, FixedSizeListArray, ListArray, - PrimitiveArray, StructArray, Utf8Array, + new_null_array, Array, BooleanArray, FixedSizeListArray, ListArray, MutableBinaryViewArray, + PrimitiveArray, StructArray, ViewType, }; use crate::bitmap::MutableBitmap; use crate::datatypes::ArrowDataType; @@ -107,16 +107,16 @@ pub trait ListFromIter { ) } - /// Create a list-array from an iterator. - /// Used in group_by agg-list - /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. - unsafe fn from_iter_utf8_trusted_len(iter: I, n_elements: usize) -> ListArray + unsafe fn from_iter_binview_trusted_len( + iter: I, + n_elements: usize, + ) -> ListArray where I: IntoIterator>, P: IntoIterator>, - Ref: AsRef, + Ref: AsRef, { let iterator = iter.into_iter(); let (lower, _) = iterator.size_hint(); @@ -125,7 +125,8 @@ pub trait ListFromIter { let mut offsets = Vec::::with_capacity(lower + 1); let mut length_so_far = 0i64; offsets.push(length_so_far); - let values: Utf8Array = iterator + + let values: MutableBinaryViewArray = iterator .filter_map(|opt_iter| match opt_iter { Some(x) => { let it = x.into_iter(); @@ -147,13 +148,27 @@ pub trait ListFromIter { // Safety: // offsets are monotonically increasing ListArray::new( - ListArray::::default_datatype(ArrowDataType::LargeUtf8), + ListArray::::default_datatype(T::DATA_TYPE), Offsets::new_unchecked(offsets).into(), - Box::new(values), + values.freeze().boxed(), Some(validity.into()), ) } + /// Create a list-array from an iterator. + /// Used in group_by agg-list + /// + /// # Safety + /// Will produce incorrect arrays if size hint is incorrect. + unsafe fn from_iter_utf8_trusted_len(iter: I, n_elements: usize) -> ListArray + where + I: IntoIterator>, + P: IntoIterator>, + Ref: AsRef, + { + Self::from_iter_binview_trusted_len(iter, n_elements) + } + /// Create a list-array from an iterator. /// Used in group_by agg-list /// @@ -165,40 +180,7 @@ pub trait ListFromIter { P: IntoIterator>, Ref: AsRef<[u8]>, { - let iterator = iter.into_iter(); - let (lower, _) = iterator.size_hint(); - - let mut validity = MutableBitmap::with_capacity(lower); - let mut offsets = Vec::::with_capacity(lower + 1); - let mut length_so_far = 0i64; - offsets.push(length_so_far); - let values: BinaryArray = iterator - .filter_map(|opt_iter| match opt_iter { - Some(x) => { - let it = x.into_iter(); - length_so_far += it.size_hint().0 as i64; - validity.push(true); - offsets.push(length_so_far); - Some(it) - }, - None => { - validity.push(false); - offsets.push(length_so_far); - None - }, - }) - .flatten() - .trust_my_length(n_elements) - .collect(); - - // Safety: - // offsets are monotonically increasing - ListArray::new( - ListArray::::default_datatype(ArrowDataType::LargeBinary), - Offsets::new_unchecked(offsets).into(), - Box::new(values), - Some(validity.into()), - ) + Self::from_iter_binview_trusted_len(iter, n_elements) } } impl ListFromIter for ListArray {} diff --git a/crates/polars-arrow/src/legacy/compute/cast.rs b/crates/polars-arrow/src/legacy/compute/cast.rs deleted file mode 100644 index 84d54edfe453..000000000000 --- a/crates/polars-arrow/src/legacy/compute/cast.rs +++ /dev/null @@ -1,40 +0,0 @@ -use polars_error::PolarsResult; - -use crate::array::Array; -use crate::datatypes::ArrowDataType; - -pub fn cast(array: &dyn Array, to_type: &ArrowDataType) -> PolarsResult> { - match to_type { - #[cfg(feature = "dtype-decimal")] - ArrowDataType::Decimal(precision, scale) - if matches!(array.data_type(), ArrowDataType::LargeUtf8) => - { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Box::new(cast_utf8_to_decimal( - array, - Some(*precision), - *scale, - ))) - }, - _ => crate::compute::cast::cast(array, to_type, Default::default()), - } -} - -#[cfg(feature = "dtype-decimal")] -use super::decimal::*; -#[cfg(feature = "dtype-decimal")] -use crate::array::{PrimitiveArray, Utf8Array}; -#[cfg(feature = "dtype-decimal")] -use crate::legacy::prelude::LargeStringArray; -#[cfg(feature = "dtype-decimal")] -pub fn cast_utf8_to_decimal( - array: &Utf8Array, - precision: Option, - scale: usize, -) -> PrimitiveArray { - let precision = precision.map(|p| p as u8); - array - .iter() - .map(|val| val.and_then(|val| deserialize_decimal(val.as_bytes(), precision, scale as u8))) - .collect() -} diff --git a/crates/polars-arrow/src/legacy/compute/decimal.rs b/crates/polars-arrow/src/legacy/compute/decimal.rs index 690b191ca9da..d3f5d7862cc9 100644 --- a/crates/polars-arrow/src/legacy/compute/decimal.rs +++ b/crates/polars-arrow/src/legacy/compute/decimal.rs @@ -35,7 +35,7 @@ pub fn infer_scale(bytes: &[u8]) -> u8 { /// requires precision >= 7 and scale >= 3. Returns None if the number is not well-formed, or does not /// fit. Only b'.' is allowed as a decimal separator (issue #6698). #[inline] -pub(super) fn deserialize_decimal( +pub(crate) fn deserialize_decimal( mut bytes: &[u8], precision: Option, scale: u8, diff --git a/crates/polars-arrow/src/legacy/compute/mod.rs b/crates/polars-arrow/src/legacy/compute/mod.rs index 95d75f957e53..9bdba88e7d7a 100644 --- a/crates/polars-arrow/src/legacy/compute/mod.rs +++ b/crates/polars-arrow/src/legacy/compute/mod.rs @@ -5,8 +5,6 @@ use crate::types::NativeType; pub mod arithmetics; pub mod bitwise; -#[cfg(feature = "compute_cast")] -pub mod cast; #[cfg(feature = "dtype-decimal")] pub mod decimal; pub mod take; diff --git a/crates/polars-arrow/src/legacy/compute/take/mod.rs b/crates/polars-arrow/src/legacy/compute/take/mod.rs index af2dde7056e8..566dc0c30519 100644 --- a/crates/polars-arrow/src/legacy/compute/take/mod.rs +++ b/crates/polars-arrow/src/legacy/compute/take/mod.rs @@ -3,13 +3,14 @@ mod boolean; #[cfg(feature = "dtype-array")] mod fixed_size_list; +use polars_utils::slice::GetSaferUnchecked; + use crate::array::*; -use crate::bitmap::MutableBitmap; +use crate::bitmap::{Bitmap, MutableBitmap}; use crate::buffer::Buffer; -use crate::datatypes::{ArrowDataType, PhysicalType}; +use crate::datatypes::PhysicalType; use crate::legacy::bit_util::unset_bit_raw; use crate::legacy::prelude::*; -use crate::legacy::trusted_len::{TrustedLen, TrustedLenPush}; use crate::legacy::utils::CustomIterTools; use crate::offset::Offsets; use crate::types::NativeType; @@ -25,16 +26,8 @@ pub unsafe fn take_unchecked(arr: &dyn Array, idx: &IdxArr) -> ArrayRef { match arr.data_type().to_physical_type() { Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { let arr: &PrimitiveArray<$T> = arr.as_any().downcast_ref().unwrap(); - if arr.null_count() > 0 { - take_primitive_unchecked::<$T>(arr, idx) - } else { - take_no_null_primitive_unchecked::<$T>(arr, idx) - } + take_primitive_unchecked::<$T>(arr, idx).boxed() }), - LargeUtf8 => { - let arr = arr.as_any().downcast_ref().unwrap(); - take_utf8_unchecked(arr, idx) - }, Boolean => { let arr = arr.as_any().downcast_ref().unwrap(); Box::new(boolean::take_unchecked(arr, idx)) @@ -44,6 +37,17 @@ pub unsafe fn take_unchecked(arr: &dyn Array, idx: &IdxArr) -> ArrayRef { let arr = arr.as_any().downcast_ref().unwrap(); Box::new(fixed_size_list::take_unchecked(arr, idx)) }, + BinaryView => take_binview_unchecked(arr.as_any().downcast_ref().unwrap(), idx).boxed(), + Utf8View => { + let arr: &Utf8ViewArray = arr.as_any().downcast_ref().unwrap(); + take_binview_unchecked(&arr.to_binview(), idx) + .to_utf8view_unchecked() + .boxed() + }, + Struct => { + let array = arr.as_any().downcast_ref().unwrap(); + take_struct_unchecked(array, idx).boxed() + }, // TODO! implement proper unchecked version #[cfg(feature = "compute")] _ => { @@ -57,637 +61,116 @@ pub unsafe fn take_unchecked(arr: &dyn Array, idx: &IdxArr) -> ArrayRef { } } +unsafe fn take_validity_unchecked(validity: Option<&Bitmap>, indices: &IdxArr) -> Option { + let indices_validity = indices.validity(); + match (validity, indices_validity) { + (None, _) => indices_validity.cloned(), + (Some(validity), None) => { + let iter = indices + .values() + .iter() + .map(|index| validity.get_bit_unchecked(*index as usize)); + MutableBitmap::from_trusted_len_iter(iter).into() + }, + (Some(validity), _) => { + let iter = indices.iter().map(|x| match x { + Some(index) => validity.get_bit_unchecked(*index as usize), + None => false, + }); + MutableBitmap::from_trusted_len_iter(iter).into() + }, + } +} + +/// # Safety +/// No bound checks +pub unsafe fn take_struct_unchecked(array: &StructArray, indices: &IdxArr) -> StructArray { + let values: Vec> = array + .values() + .iter() + .map(|a| take_unchecked(a.as_ref(), indices)) + .collect(); + let validity = take_validity_unchecked(array.validity(), indices); + StructArray::new(array.data_type().clone(), values, validity) +} + +/// # Safety +/// No bound checks +unsafe fn take_binview_unchecked(arr: &BinaryViewArray, indices: &IdxArr) -> BinaryViewArray { + let views = arr.views().clone(); + // PrimitiveArray is not supported, so we go via i128 + let views = std::mem::transmute::, Buffer>(views); + let views = PrimitiveArray::from_data_default(views, arr.validity().cloned()); + let taken_views = take_primitive_unchecked(&views, indices); + let taken_views_values = taken_views.values().clone(); + let taken_views_values = std::mem::transmute::, Buffer>(taken_views_values); + BinaryViewArray::new_unchecked_unknown_md( + arr.data_type().clone(), + taken_views_values, + arr.data_buffers().clone(), + taken_views.validity().cloned(), + ) + .maybe_gc() +} + /// Take kernel for single chunk with nulls and arrow array as index that may have nulls. /// # Safety /// caller must ensure indices are in bounds pub unsafe fn take_primitive_unchecked( arr: &PrimitiveArray, indices: &IdxArr, -) -> Box> { +) -> PrimitiveArray { let array_values = arr.values().as_slice(); let index_values = indices.values().as_slice(); - let validity_values = arr.validity().expect("should have nulls"); // first take the values, these are always needed let values: Vec = index_values .iter() - .map(|idx| { - debug_assert!((*idx as usize) < array_values.len()); - *array_values.get_unchecked(*idx as usize) - }) + .map(|idx| *array_values.get_unchecked_release(*idx as usize)) .collect_trusted(); - // the validity buffer we will fill with all valid. And we unset the ones that are null - // in later checks - // this is in the assumption that most values will be valid. - // Maybe we could add another branch based on the null count - let mut validity = MutableBitmap::with_capacity(indices.len()); - validity.extend_constant(indices.len(), true); - let validity_ptr = validity.as_slice().as_ptr() as *mut u8; - - if let Some(validity_indices) = indices.validity().as_ref() { - index_values.iter().enumerate().for_each(|(i, idx)| { - // i is iteration count - // idx is the index that we take from the values array. - let idx = *idx as usize; - if !validity_indices.get_bit_unchecked(i) || !validity_values.get_bit_unchecked(idx) { - unset_bit_raw(validity_ptr, i); - } - }); - } else { - index_values.iter().enumerate().for_each(|(i, idx)| { - let idx = *idx as usize; - if !validity_values.get_bit_unchecked(idx) { - unset_bit_raw(validity_ptr, i); - } - }); - }; - let arr = PrimitiveArray::new(T::PRIMITIVE.into(), values.into(), Some(validity.into())); - - Box::new(arr) -} - -/// Take kernel for single chunk without nulls and arrow array as index. -/// # Safety -/// caller must ensure indices are in bounds -pub unsafe fn take_no_null_primitive_unchecked( - arr: &PrimitiveArray, - indices: &IdxArr, -) -> Box> { - debug_assert!(arr.null_count() == 0); - let array_values = arr.values().as_slice(); - let index_values = indices.values().as_slice(); - - let iter = index_values.iter().map(|idx| { - debug_assert!((*idx as usize) < array_values.len()); - *array_values.get_unchecked(*idx as usize) - }); - - let values: Buffer<_> = Vec::from_trusted_len_iter(iter).into(); - let validity = indices.validity().cloned(); - Box::new(PrimitiveArray::new(T::PRIMITIVE.into(), values, validity)) -} - -/// Take kernel for single chunk without nulls and an iterator as index. -/// -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_primitive_iter_unchecked>( - arr: &PrimitiveArray, - indices: I, -) -> Box> { - debug_assert!(!arr.has_validity()); - let array_values = arr.values().as_slice(); - - let iter = indices.into_iter().map(|idx| { - debug_assert!((idx) < array_values.len()); - *array_values.get_unchecked(idx) - }); - - let values: Buffer<_> = Vec::from_trusted_len_iter(iter).into(); - Box::new(PrimitiveArray::new(T::PRIMITIVE.into(), values, None)) -} - -/// Take kernel for a single chunk with null values and an iterator as index. -/// -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_primitive_iter_unchecked>( - arr: &PrimitiveArray, - indices: I, -) -> Box> { - let array_values = arr.values().as_slice(); - let validity = arr.validity().expect("should have nulls"); - - let iter = indices.into_iter().map(|idx| { - if validity.get_bit_unchecked(idx) { - Some(*array_values.get_unchecked(idx)) - } else { - None - } - }); - - let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter); - Box::new(arr) -} - -/// Take kernel for a single chunk without nulls and an iterator that can produce None values. -/// This is used in join operations. -/// -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_primitive_opt_iter_unchecked< - T: NativeType, - I: IntoIterator>, ->( - arr: &PrimitiveArray, - indices: I, -) -> Box> { - let array_values = arr.values().as_slice(); - - let iter = indices.into_iter().map(|opt_idx| { - opt_idx.map(|idx| { - debug_assert!(idx < array_values.len()); - *array_values.get_unchecked(idx) - }) - }); - let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter).to(T::PRIMITIVE.into()); - - Box::new(arr) -} - -/// Take kernel for a single chunk and an iterator that can produce None values. -/// This is used in join operations. -/// -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_primitive_opt_iter_unchecked< - T: NativeType, - I: IntoIterator>, ->( - arr: &PrimitiveArray, - indices: I, -) -> Box> { - let array_values = arr.values().as_slice(); - let validity = arr.validity().expect("should have nulls"); - - let iter = indices.into_iter().map(|opt_idx| { - opt_idx.and_then(|idx| { - if validity.get_bit_unchecked(idx) { - debug_assert!(idx < array_values.len()); - Some(*array_values.get_unchecked(idx)) - } else { - None - } - }) - }); - let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter).to(T::PRIMITIVE.into()); - - Box::new(arr) -} - -/// Take kernel for single chunk without nulls and an iterator as index. -/// -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_bool_iter_unchecked>( - arr: &BooleanArray, - indices: I, -) -> Box { - debug_assert!(!arr.has_validity()); - let values = arr.values(); - - let iter = indices.into_iter().map(|idx| { - debug_assert!(idx < values.len()); - values.get_bit_unchecked(idx) - }); - let mutable = MutableBitmap::from_trusted_len_iter_unchecked(iter); - Box::new(BooleanArray::new( - ArrowDataType::Boolean, - mutable.into(), - None, - )) -} - -/// Take kernel for single chunk and an iterator as index. -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_bool_iter_unchecked>( - arr: &BooleanArray, - indices: I, -) -> Box { - let validity = arr.validity().expect("should have nulls"); - - let iter = indices.into_iter().map(|idx| { - if validity.get_bit_unchecked(idx) { - Some(arr.value_unchecked(idx)) - } else { - None - } - }); - - Box::new(BooleanArray::from_trusted_len_iter_unchecked(iter)) -} - -/// Take kernel for single chunk and an iterator as index. -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_bool_opt_iter_unchecked>>( - arr: &BooleanArray, - indices: I, -) -> Box { - let validity = arr.validity().expect("should have nulls"); - let iter = indices.into_iter().map(|opt_idx| { - opt_idx.and_then(|idx| { - if validity.get_bit_unchecked(idx) { - Some(arr.value_unchecked(idx)) - } else { - None - } - }) - }); - - Box::new(BooleanArray::from_trusted_len_iter_unchecked(iter)) -} - -/// Take kernel for single chunk without null values and an iterator as index that may produce None values. -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_bool_opt_iter_unchecked>>( - arr: &BooleanArray, - indices: I, -) -> Box { - let iter = indices - .into_iter() - .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); - - Box::new(BooleanArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_utf8_iter_unchecked>( - arr: &LargeStringArray, - indices: I, -) -> Box { - let iter = indices.into_iter().map(|idx| { - debug_assert!(idx < arr.len()); - arr.value_unchecked(idx) - }); - Box::new(MutableUtf8Array::::from_trusted_len_values_iter_unchecked(iter).into()) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_binary_iter_unchecked>( - arr: &LargeBinaryArray, - indices: I, -) -> Box { - let iter = indices.into_iter().map(|idx| { - debug_assert!(idx < arr.len()); - arr.value_unchecked(idx) - }); - Box::new(MutableBinaryArray::::from_trusted_len_values_iter_unchecked(iter).into()) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_utf8_iter_unchecked>( - arr: &LargeStringArray, - indices: I, -) -> Box { - let validity = arr.validity().expect("should have nulls"); - let iter = indices.into_iter().map(|idx| { - debug_assert!(idx < arr.len()); - if validity.get_bit_unchecked(idx) { - Some(arr.value_unchecked(idx)) - } else { - None - } - }); - - Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_binary_iter_unchecked>( - arr: &LargeBinaryArray, - indices: I, -) -> Box { - let validity = arr.validity().expect("should have nulls"); - let iter = indices.into_iter().map(|idx| { - debug_assert!(idx < arr.len()); - if validity.get_bit_unchecked(idx) { - Some(arr.value_unchecked(idx)) - } else { - None - } - }); - - Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_utf8_opt_iter_unchecked>>( - arr: &LargeStringArray, - indices: I, -) -> Box { - let iter = indices - .into_iter() - .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); - - Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_no_null_binary_opt_iter_unchecked>>( - arr: &LargeBinaryArray, - indices: I, -) -> Box { - let iter = indices - .into_iter() - .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); - - Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_utf8_opt_iter_unchecked>>( - arr: &LargeStringArray, - indices: I, -) -> Box { - let validity = arr.validity().expect("should have nulls"); - let iter = indices.into_iter().map(|opt_idx| { - opt_idx.and_then(|idx| { - if validity.get_bit_unchecked(idx) { - Some(arr.value_unchecked(idx)) - } else { - None - } - }) - }); - Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// - no bounds checks -/// - iterator must be TrustedLen -#[inline] -pub unsafe fn take_binary_opt_iter_unchecked>>( - arr: &LargeBinaryArray, - indices: I, -) -> Box { - let validity = arr.validity().expect("should have nulls"); - let iter = indices.into_iter().map(|opt_idx| { - opt_idx.and_then(|idx| { - if validity.get_bit_unchecked(idx) { - Some(arr.value_unchecked(idx)) - } else { - None - } - }) - }); - Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) -} - -/// # Safety -/// caller must ensure indices are in bounds -pub unsafe fn take_utf8_unchecked( - arr: &LargeStringArray, - indices: &IdxArr, -) -> Box { - let data_len = indices.len(); - - let mut offset_buf = vec![0; data_len + 1]; - let offset_typed = offset_buf.as_mut_slice(); - - let mut length_so_far = 0; - offset_typed[0] = length_so_far; - - let validity; - - // The required size is yet unknown - // Allocate 2.0 times the expected size. - // where expected size is the length of bytes multiplied by the factor (take_len / current_len) - let mut values_capacity = if arr.len() > 0 { - ((arr.len() as f32 * 2.0) as usize) / arr.len() * indices.len() - } else { - 0 - }; - - // 16 bytes per string as default alloc - let mut values_buf = Vec::::with_capacity(values_capacity); - - // both 0 nulls - if !arr.has_validity() && !indices.has_validity() { - offset_typed - .iter_mut() - .skip(1) - .enumerate() - .for_each(|(idx, offset)| { - let index = indices.value_unchecked(idx) as usize; - let s = arr.value_unchecked(index); - length_so_far += s.len() as i64; - *offset = length_so_far; - - if length_so_far as usize >= values_capacity { - values_buf.reserve(values_capacity); - values_capacity *= 2; + let arr = if arr.null_count() > 0 { + let validity_values = arr.validity().unwrap(); + // the validity buffer we will fill with all valid. And we unset the ones that are null + // in later checks + // this is in the assumption that most values will be valid. + // Maybe we could add another branch based on the null count + let mut validity = MutableBitmap::with_capacity(indices.len()); + validity.extend_constant(indices.len(), true); + let validity_ptr = validity.as_slice().as_ptr() as *mut u8; + + if let Some(validity_indices) = indices.validity().as_ref() { + index_values.iter().enumerate().for_each(|(i, idx)| { + // i is iteration count + // idx is the index that we take from the values array. + let idx = *idx as usize; + if !validity_indices.get_bit_unchecked(i) || !validity_values.get_bit_unchecked(idx) + { + unset_bit_raw(validity_ptr, i); } - - values_buf.extend_from_slice(s.as_bytes()) - }); - validity = None; - } else if !arr.has_validity() { - offset_typed - .iter_mut() - .skip(1) - .enumerate() - .for_each(|(idx, offset)| { - if indices.is_valid(idx) { - let index = indices.value_unchecked(idx) as usize; - let s = arr.value_unchecked(index); - length_so_far += s.len() as i64; - - if length_so_far as usize >= values_capacity { - values_buf.reserve(values_capacity); - values_capacity *= 2; - } - - values_buf.extend_from_slice(s.as_bytes()) - } - *offset = length_so_far; - }); - validity = indices.validity().cloned(); - } else { - let mut builder = MutableUtf8Array::with_capacities(data_len, length_so_far as usize); - let validity_arr = arr.validity().expect("should have nulls"); - - if !indices.has_validity() { - (0..data_len).for_each(|idx| { - let index = indices.value_unchecked(idx) as usize; - builder.push(if validity_arr.get_bit_unchecked(index) { - let s = arr.value_unchecked(index); - Some(s) - } else { - None - }); }); } else { - let validity_indices = indices.validity().expect("should have nulls"); - (0..data_len).for_each(|idx| { - if validity_indices.get_bit_unchecked(idx) { - let index = indices.value_unchecked(idx) as usize; - - if validity_arr.get_bit_unchecked(index) { - let s = arr.value_unchecked(index); - builder.push(Some(s)); - } else { - builder.push_null(); - } - } else { - builder.push_null(); + index_values.iter().enumerate().for_each(|(i, idx)| { + let idx = *idx as usize; + if !validity_values.get_bit_unchecked(idx) { + unset_bit_raw(validity_ptr, i); } }); - } - - let array: Utf8Array = builder.into(); - return Box::new(array); - } - - // Safety: all "values" are &str, and thus valid utf8 - Box::new(Utf8Array::::from_data_unchecked_default( - offset_buf.into(), - values_buf.into(), - validity, - )) -} - -/// # Safety -/// caller must ensure indices are in bounds -pub unsafe fn take_binary_unchecked( - arr: &LargeBinaryArray, - indices: &IdxArr, -) -> Box { - let data_len = indices.len(); - - let mut offset_buf = vec![0; data_len + 1]; - let offset_typed = offset_buf.as_mut_slice(); - - let mut length_so_far = 0; - offset_typed[0] = length_so_far; - - let validity; - - // The required size is yet unknown - // Allocate 2.0 times the expected size. - // where expected size is the length of bytes multiplied by the factor (take_len / current_len) - let mut values_capacity = if arr.len() > 0 { - ((arr.len() as f32 * 2.0) as usize) / arr.len() * indices.len() + }; + PrimitiveArray::new_unchecked( + arr.data_type().clone(), + values.into(), + Some(validity.into()), + ) } else { - 0 + PrimitiveArray::new_unchecked( + arr.data_type().clone(), + values.into(), + indices.validity().cloned(), + ) }; - // 16 bytes per string as default alloc - let mut values_buf = Vec::::with_capacity(values_capacity); - - // both 0 nulls - if !arr.has_validity() && !indices.has_validity() { - offset_typed - .iter_mut() - .skip(1) - .enumerate() - .for_each(|(idx, offset)| { - let index = indices.value_unchecked(idx) as usize; - let s = arr.value_unchecked(index); - length_so_far += s.len() as i64; - *offset = length_so_far; - - if length_so_far as usize >= values_capacity { - values_buf.reserve(values_capacity); - values_capacity *= 2; - } - - values_buf.extend_from_slice(s) - }); - validity = None; - } else if !arr.has_validity() { - offset_typed - .iter_mut() - .skip(1) - .enumerate() - .for_each(|(idx, offset)| { - if indices.is_valid(idx) { - let index = indices.value_unchecked(idx) as usize; - let s = arr.value_unchecked(index); - length_so_far += s.len() as i64; - - if length_so_far as usize >= values_capacity { - values_buf.reserve(values_capacity); - values_capacity *= 2; - } - - values_buf.extend_from_slice(s) - } - *offset = length_so_far; - }); - validity = indices.validity().cloned(); - } else { - let mut builder = MutableBinaryArray::with_capacities(data_len, length_so_far as usize); - let validity_arr = arr.validity().expect("should have nulls"); - - if !indices.has_validity() { - (0..data_len).for_each(|idx| { - let index = indices.value_unchecked(idx) as usize; - builder.push(if validity_arr.get_bit_unchecked(index) { - let s = arr.value_unchecked(index); - Some(s) - } else { - None - }); - }); - } else { - let validity_indices = indices.validity().expect("should have nulls"); - (0..data_len).for_each(|idx| { - if validity_indices.get_bit_unchecked(idx) { - let index = indices.value_unchecked(idx) as usize; - - if validity_arr.get_bit_unchecked(index) { - let s = arr.value_unchecked(index); - builder.push(Some(s)); - } else { - builder.push_null(); - } - } else { - builder.push_null(); - } - }); - } - - let array: BinaryArray = builder.into(); - return Box::new(array); - } - - // Safety: all "values" are &str, and thus valid utf8 - Box::new(BinaryArray::::from_data_unchecked_default( - offset_buf.into(), - values_buf.into(), - validity, - )) + arr } /// Forked and adapted from arrow-rs @@ -766,24 +249,3 @@ pub unsafe fn take_value_indices_from_list( ) } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_utf8_kernel() { - let s = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]); - unsafe { - let out = take_utf8_unchecked(&s, &IdxArr::from_slice([1, 2])); - assert!(out.is_null(0)); - assert!(out.is_valid(1)); - let out = take_utf8_unchecked(&s, &IdxArr::from(vec![None, Some(2)])); - assert!(out.is_null(0)); - assert!(out.is_valid(1)); - let out = take_utf8_unchecked(&s, &IdxArr::from(vec![None, None])); - assert!(out.is_null(0)); - assert!(out.is_null(1)); - } - } -} diff --git a/crates/polars-arrow/src/legacy/kernels/ewm/average.rs b/crates/polars-arrow/src/legacy/kernels/ewm/average.rs index a24c0f554494..04be94939cbe 100644 --- a/crates/polars-arrow/src/legacy/kernels/ewm/average.rs +++ b/crates/polars-arrow/src/legacy/kernels/ewm/average.rs @@ -3,8 +3,8 @@ use std::ops::{AddAssign, MulAssign}; use num_traits::Float; use crate::array::PrimitiveArray; -use crate::legacy::trusted_len::TrustedLen; use crate::legacy::utils::CustomIterTools; +use crate::trusted_len::TrustedLen; use crate::types::NativeType; pub fn ewm_mean( diff --git a/crates/polars-arrow/src/legacy/kernels/ewm/variance.rs b/crates/polars-arrow/src/legacy/kernels/ewm/variance.rs index 4a54a77f1c0b..0aabb72c10a3 100644 --- a/crates/polars-arrow/src/legacy/kernels/ewm/variance.rs +++ b/crates/polars-arrow/src/legacy/kernels/ewm/variance.rs @@ -3,8 +3,8 @@ use std::ops::{AddAssign, DivAssign, MulAssign}; use num_traits::Float; use crate::array::PrimitiveArray; -use crate::legacy::trusted_len::TrustedLen; use crate::legacy::utils::CustomIterTools; +use crate::trusted_len::TrustedLen; use crate::types::NativeType; #[allow(clippy::too_many_arguments)] diff --git a/crates/polars-arrow/src/legacy/kernels/string.rs b/crates/polars-arrow/src/legacy/kernels/string.rs index d31a574dadef..84c42c2c4827 100644 --- a/crates/polars-arrow/src/legacy/kernels/string.rs +++ b/crates/polars-arrow/src/legacy/kernels/string.rs @@ -1,20 +1,16 @@ -use crate::array::{ArrayRef, UInt32Array, Utf8Array}; +use crate::array::{Array, ArrayRef, UInt32Array, Utf8ViewArray}; use crate::buffer::Buffer; use crate::datatypes::ArrowDataType; use crate::legacy::trusted_len::TrustedLenPush; -pub fn string_len_bytes(array: &Utf8Array) -> ArrayRef { - let values = array - .offsets() - .as_slice() - .windows(2) - .map(|x| (x[1] - x[0]) as u32); - let values: Buffer<_> = Vec::from_trusted_len_iter(values).into(); +pub fn utf8view_len_bytes(array: &Utf8ViewArray) -> ArrayRef { + let values = array.views().iter().map(|v| *v as u32).collect::>(); + let values: Buffer<_> = values.into(); let array = UInt32Array::new(ArrowDataType::UInt32, values, array.validity().cloned()); Box::new(array) } -pub fn string_len_chars(array: &Utf8Array) -> ArrayRef { +pub fn string_len_chars(array: &Utf8ViewArray) -> ArrayRef { let values = array.values_iter().map(|x| x.chars().count() as u32); let values: Buffer<_> = Vec::from_trusted_len_iter(values).into(); let array = UInt32Array::new(ArrowDataType::UInt32, values, array.validity().cloned()); diff --git a/crates/polars-arrow/src/legacy/kernels/take_agg/mod.rs b/crates/polars-arrow/src/legacy/kernels/take_agg/mod.rs index ce58332de766..77213de4d8bf 100644 --- a/crates/polars-arrow/src/legacy/kernels/take_agg/mod.rs +++ b/crates/polars-arrow/src/legacy/kernels/take_agg/mod.rs @@ -6,7 +6,7 @@ pub use boolean::*; use num_traits::{NumCast, ToPrimitive}; pub use var::*; -use crate::array::{Array, BooleanArray, PrimitiveArray, Utf8Array}; +use crate::array::{Array, BinaryViewArray, BooleanArray, PrimitiveArray}; use crate::legacy::index::IdxSize; use crate::types::NativeType; @@ -98,16 +98,16 @@ pub unsafe fn take_agg_primitive_iter_unchecked_count_nulls< /// # Safety /// caller must ensure iterators indexes are in bounds #[inline] -pub unsafe fn take_agg_utf8_iter_unchecked< +pub unsafe fn take_agg_bin_iter_unchecked< 'a, I: IntoIterator, - F: Fn(&'a str, &'a str) -> &'a str, + F: Fn(&'a [u8], &'a [u8]) -> &'a [u8], >( - arr: &'a Utf8Array, + arr: &'a BinaryViewArray, indices: I, f: F, len: IdxSize, -) -> Option<&str> { +) -> Option<&[u8]> { let mut null_count = 0 as IdxSize; let validity = arr.validity().unwrap(); @@ -139,15 +139,15 @@ pub unsafe fn take_agg_utf8_iter_unchecked< /// # Safety /// caller must ensure iterators indexes are in bounds #[inline] -pub unsafe fn take_agg_utf8_iter_unchecked_no_null< +pub unsafe fn take_agg_bin_iter_unchecked_no_null< 'a, I: IntoIterator, - F: Fn(&'a str, &'a str) -> &'a str, + F: Fn(&'a [u8], &'a [u8]) -> &'a [u8], >( - arr: &'a Utf8Array, + arr: &'a BinaryViewArray, indices: I, f: F, -) -> Option<&str> { +) -> Option<&[u8]> { indices .into_iter() .map(|idx| arr.value_unchecked(idx)) diff --git a/crates/polars-arrow/src/legacy/trusted_len/boolean.rs b/crates/polars-arrow/src/legacy/trusted_len/boolean.rs index 31191bd9cb82..daf5bee2ad1d 100644 --- a/crates/polars-arrow/src/legacy/trusted_len/boolean.rs +++ b/crates/polars-arrow/src/legacy/trusted_len/boolean.rs @@ -3,8 +3,9 @@ use crate::bitmap::MutableBitmap; use crate::datatypes::ArrowDataType; use crate::legacy::array::default_arrays::FromData; use crate::legacy::bit_util::{set_bit_raw, unset_bit_raw}; -use crate::legacy::trusted_len::{FromIteratorReversed, TrustedLen}; +use crate::legacy::trusted_len::FromIteratorReversed; use crate::legacy::utils::FromTrustedLenIterator; +use crate::trusted_len::TrustedLen; impl FromTrustedLenIterator> for BooleanArray { fn from_iter_trusted_length>>(iter: I) -> Self diff --git a/crates/polars-arrow/src/legacy/trusted_len/mod.rs b/crates/polars-arrow/src/legacy/trusted_len/mod.rs index 94d9473cf143..9967ecebc594 100644 --- a/crates/polars-arrow/src/legacy/trusted_len/mod.rs +++ b/crates/polars-arrow/src/legacy/trusted_len/mod.rs @@ -2,91 +2,5 @@ mod boolean; mod push_unchecked; mod rev; -use std::iter::Scan; -use std::slice::Iter; - pub use push_unchecked::*; pub use rev::FromIteratorReversed; - -use crate::array::FixedSizeListArray; -use crate::bitmap::utils::{BitmapIter, ZipValidity, ZipValidityIter}; -use crate::legacy::utils::TrustMyLength; - -/// An iterator of known, fixed size. -/// A trait denoting Rusts' unstable [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). -/// This is re-defined here and implemented for some iterators until `std::iter::TrustedLen` -/// is stabilized. -/// *Implementation from Jorge Leitao on Arrow2 -/// # Safety -/// length of the iterator must be correct -pub unsafe trait TrustedLen: Iterator {} - -unsafe impl TrustedLen for &mut dyn TrustedLen {} -unsafe impl TrustedLen for Box + '_> {} - -unsafe impl TrustedLen for Iter<'_, T> {} - -unsafe impl B> TrustedLen for std::iter::Map {} - -unsafe impl<'a, I, T: 'a> TrustedLen for std::iter::Copied -where - I: TrustedLen, - T: Copy, -{ -} - -unsafe impl TrustedLen for std::iter::Enumerate where I: TrustedLen {} - -unsafe impl TrustedLen for std::iter::Zip -where - A: TrustedLen, - B: TrustedLen, -{ -} - -unsafe impl TrustedLen for std::slice::Windows<'_, T> {} - -unsafe impl TrustedLen for std::iter::Chain -where - A: TrustedLen, - B: TrustedLen, -{ -} - -unsafe impl TrustedLen for std::iter::Once {} - -unsafe impl TrustedLen for std::vec::IntoIter {} - -unsafe impl TrustedLen for std::iter::Repeat {} -unsafe impl A> TrustedLen for std::iter::RepeatWith {} -unsafe impl TrustedLen for std::iter::Take {} - -unsafe impl TrustedLen for std::iter::Rev {} - -unsafe impl, J> TrustedLen for TrustMyLength {} -unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} -unsafe impl TrustedLen for std::ops::RangeInclusive where std::ops::RangeInclusive: Iterator -{} -unsafe impl TrustedLen for crate::array::Utf8ValuesIter<'_, i64> {} -unsafe impl TrustedLen for crate::array::BinaryValueIter<'_, i64> {} -unsafe impl TrustedLen for crate::array::ListValuesIter<'_, i64> {} -unsafe impl TrustedLen for crate::array::ArrayValuesIter<'_, FixedSizeListArray> {} -unsafe impl, V: TrustedLen + Iterator> TrustedLen - for ZipValidityIter -{ -} -unsafe impl, V: TrustedLen + Iterator> TrustedLen - for ZipValidity -{ -} -unsafe impl TrustedLen for BitmapIter<'_> {} -unsafe impl TrustedLen for std::iter::StepBy {} - -unsafe impl TrustedLen for Scan -where - F: FnMut(&mut St, I::Item) -> Option, - I: TrustedLen + Iterator, -{ -} - -unsafe impl TrustedLen for hashbrown::hash_map::IntoIter {} diff --git a/crates/polars-arrow/src/legacy/trusted_len/push_unchecked.rs b/crates/polars-arrow/src/legacy/trusted_len/push_unchecked.rs index f3d830f76fa1..1264f8865ba0 100644 --- a/crates/polars-arrow/src/legacy/trusted_len/push_unchecked.rs +++ b/crates/polars-arrow/src/legacy/trusted_len/push_unchecked.rs @@ -1,4 +1,4 @@ -use super::*; +use crate::trusted_len::TrustedLen; pub trait TrustedLenPush { /// Will push an item and not check if there is enough capacity. diff --git a/crates/polars-arrow/src/legacy/trusted_len/rev.rs b/crates/polars-arrow/src/legacy/trusted_len/rev.rs index 1bbee41f2a60..0677ced9f7df 100644 --- a/crates/polars-arrow/src/legacy/trusted_len/rev.rs +++ b/crates/polars-arrow/src/legacy/trusted_len/rev.rs @@ -1,4 +1,4 @@ -use crate::legacy::trusted_len::TrustedLen; +use crate::trusted_len::TrustedLen; pub trait FromIteratorReversed: Sized { fn from_trusted_len_iter_rev>(iter: I) -> Self; diff --git a/crates/polars-arrow/src/legacy/utils.rs b/crates/polars-arrow/src/legacy/utils.rs index 73f626286a4d..502b03d6087f 100644 --- a/crates/polars-arrow/src/legacy/utils.rs +++ b/crates/polars-arrow/src/legacy/utils.rs @@ -2,55 +2,10 @@ use crate::array::PrimitiveArray; use crate::bitmap::MutableBitmap; use crate::datatypes::ArrowDataType; use crate::legacy::bit_util::unset_bit_raw; -use crate::legacy::trusted_len::{FromIteratorReversed, TrustedLen, TrustedLenPush}; +use crate::legacy::trusted_len::{FromIteratorReversed, TrustedLenPush}; +use crate::trusted_len::{TrustMyLength, TrustedLen}; use crate::types::NativeType; -#[derive(Clone)] -pub struct TrustMyLength, J> { - iter: I, - len: usize, -} - -impl TrustMyLength -where - I: Iterator, -{ - #[inline] - pub fn new(iter: I, len: usize) -> Self { - Self { iter, len } - } -} - -impl Iterator for TrustMyLength -where - I: Iterator, -{ - type Item = J; - - #[inline] - fn next(&mut self) -> Option { - self.iter.next() - } - - fn size_hint(&self) -> (usize, Option) { - (self.len, Some(self.len)) - } -} - -impl ExactSizeIterator for TrustMyLength where I: Iterator {} - -impl DoubleEndedIterator for TrustMyLength -where - I: Iterator + DoubleEndedIterator, -{ - #[inline] - fn next_back(&mut self) -> Option { - self.iter.next_back() - } -} - -unsafe impl crate::trusted_len::TrustedLen for TrustMyLength where I: Iterator {} - pub trait CustomIterTools: Iterator { /// Turn any iterator in a trusted length iterator /// diff --git a/crates/polars-arrow/src/temporal_conversions.rs b/crates/polars-arrow/src/temporal_conversions.rs index dd580259c8a4..8a9a792ed992 100644 --- a/crates/polars-arrow/src/temporal_conversions.rs +++ b/crates/polars-arrow/src/temporal_conversions.rs @@ -1,13 +1,11 @@ //! Conversion methods for dates and times. use chrono::format::{parse, Parsed, StrftimeItems}; -use chrono::{Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; +use chrono::{Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; use polars_error::{polars_err, PolarsResult}; -use crate::array::{PrimitiveArray, Utf8Array}; +use crate::array::{PrimitiveArray, Utf8ViewArray}; use crate::datatypes::{ArrowDataType, TimeUnit}; -use crate::offset::Offset; -use crate::types::months_days_ns; /// Number of seconds in a day pub const SECONDS_IN_DAY: i64 = 86_400; @@ -251,7 +249,10 @@ pub fn timestamp_ns_to_datetime_opt(v: i64) -> Option { /// Converts a timestamp in `time_unit` and `timezone` into [`chrono::DateTime`]. #[inline] -pub fn timestamp_to_naive_datetime(timestamp: i64, time_unit: TimeUnit) -> chrono::NaiveDateTime { +pub(crate) fn timestamp_to_naive_datetime( + timestamp: i64, + time_unit: TimeUnit, +) -> chrono::NaiveDateTime { match time_unit { TimeUnit::Second => timestamp_s_to_datetime(timestamp), TimeUnit::Millisecond => timestamp_ms_to_datetime(timestamp), @@ -369,8 +370,8 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> .ok() } -fn utf8_to_timestamp_impl( - array: &Utf8Array, +fn utf8view_to_timestamp_impl( + array: &Utf8ViewArray, fmt: &str, time_zone: String, tz: T, @@ -387,7 +388,7 @@ fn utf8_to_timestamp_impl( /// Parses `value` to a [`chrono_tz::Tz`] with the Arrow's definition of timestamp with a timezone. #[cfg(feature = "chrono-tz")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] -pub fn parse_offset_tz(timezone: &str) -> PolarsResult { +pub(crate) fn parse_offset_tz(timezone: &str) -> PolarsResult { timezone .parse::() .map_err(|_| polars_err!(InvalidOperation: "timezone \"{timezone}\" cannot be parsed")) @@ -395,19 +396,21 @@ pub fn parse_offset_tz(timezone: &str) -> PolarsResult { #[cfg(feature = "chrono-tz")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))] -fn chrono_tz_utf_to_timestamp( - array: &Utf8Array, +fn chrono_tz_utf_to_timestamp( + array: &Utf8ViewArray, fmt: &str, time_zone: String, time_unit: TimeUnit, ) -> PolarsResult> { let tz = parse_offset_tz(&time_zone)?; - Ok(utf8_to_timestamp_impl(array, fmt, time_zone, tz, time_unit)) + Ok(utf8view_to_timestamp_impl( + array, fmt, time_zone, tz, time_unit, + )) } #[cfg(not(feature = "chrono-tz"))] -fn chrono_tz_utf_to_timestamp( - _: &Utf8Array, +fn chrono_tz_utf_to_timestamp( + _: &Utf8ViewArray, _: &str, timezone: String, _: TimeUnit, @@ -423,8 +426,8 @@ fn chrono_tz_utf_to_timestamp( /// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. /// # Error /// This function errors iff `timezone` is not parsable to an offset. -pub fn utf8_to_timestamp( - array: &Utf8Array, +pub(crate) fn utf8view_to_timestamp( + array: &Utf8ViewArray, fmt: &str, time_zone: String, time_unit: TimeUnit, @@ -432,7 +435,9 @@ pub fn utf8_to_timestamp( let tz = parse_offset(time_zone.as_str()); if let Ok(tz) = tz { - Ok(utf8_to_timestamp_impl(array, fmt, time_zone, tz, time_unit)) + Ok(utf8view_to_timestamp_impl( + array, fmt, time_zone, tz, time_unit, + )) } else { chrono_tz_utf_to_timestamp(array, fmt, time_zone, time_unit) } @@ -442,8 +447,8 @@ pub fn utf8_to_timestamp( /// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. /// Timezones are ignored. /// Null elements remain null; non-parsable elements are set to null. -pub fn utf8_to_naive_timestamp( - array: &Utf8Array, +pub(crate) fn utf8view_to_naive_timestamp( + array: &Utf8ViewArray, fmt: &str, time_unit: TimeUnit, ) -> PrimitiveArray { @@ -453,75 +458,3 @@ pub fn utf8_to_naive_timestamp( PrimitiveArray::from_trusted_len_iter(iter).to(ArrowDataType::Timestamp(time_unit, None)) } - -fn add_month(year: i32, month: u32, months: i32) -> chrono::NaiveDate { - let new_year = (year * 12 + (month - 1) as i32 + months) / 12; - let new_month = (year * 12 + (month - 1) as i32 + months) % 12 + 1; - chrono::NaiveDate::from_ymd_opt(new_year, new_month as u32, 1) - .expect("invalid or out-of-range date") -} - -fn get_days_between_months(year: i32, month: u32, months: i32) -> i64 { - add_month(year, month, months) - .signed_duration_since( - chrono::NaiveDate::from_ymd_opt(year, month, 1).expect("invalid or out-of-range date"), - ) - .num_days() -} - -/// Adds an `interval` to a `timestamp` in `time_unit` units without timezone. -#[inline] -pub fn add_naive_interval(timestamp: i64, time_unit: TimeUnit, interval: months_days_ns) -> i64 { - // convert seconds to a DateTime of a given offset. - let datetime = match time_unit { - TimeUnit::Second => timestamp_s_to_datetime(timestamp), - TimeUnit::Millisecond => timestamp_ms_to_datetime(timestamp), - TimeUnit::Microsecond => timestamp_us_to_datetime(timestamp), - TimeUnit::Nanosecond => timestamp_ns_to_datetime(timestamp), - }; - - // compute the number of days in the interval, which depends on the particular year and month (leap days) - let delta_days = get_days_between_months(datetime.year(), datetime.month(), interval.months()) - + interval.days() as i64; - - // add; no leap hours are considered - let new_datetime_tz = datetime - + chrono::Duration::nanoseconds(delta_days * 24 * 60 * 60 * 1_000_000_000 + interval.ns()); - - // convert back to the target unit - match time_unit { - TimeUnit::Second => new_datetime_tz.timestamp_millis() / 1000, - TimeUnit::Millisecond => new_datetime_tz.timestamp_millis(), - TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos_opt().unwrap() / 1000, - TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos_opt().unwrap(), - } -} - -/// Adds an `interval` to a `timestamp` in `time_unit` units and timezone `timezone`. -#[inline] -pub fn add_interval( - timestamp: i64, - time_unit: TimeUnit, - interval: months_days_ns, - timezone: &T, -) -> i64 { - // convert seconds to a DateTime of a given offset. - let datetime_tz = timestamp_to_datetime(timestamp, time_unit, timezone); - - // compute the number of days in the interval, which depends on the particular year and month (leap days) - let delta_days = - get_days_between_months(datetime_tz.year(), datetime_tz.month(), interval.months()) - + interval.days() as i64; - - // add; tz will take care of leap hours - let new_datetime_tz = datetime_tz - + chrono::Duration::nanoseconds(delta_days * 24 * 60 * 60 * 1_000_000_000 + interval.ns()); - - // convert back to the target unit - match time_unit { - TimeUnit::Second => new_datetime_tz.timestamp_millis() / 1000, - TimeUnit::Millisecond => new_datetime_tz.timestamp_millis(), - TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos_opt().unwrap() / 1000, - TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos_opt().unwrap(), - } -} diff --git a/crates/polars-arrow/src/trusted_len.rs b/crates/polars-arrow/src/trusted_len.rs index a1c38bd51c71..4bdce32e4990 100644 --- a/crates/polars-arrow/src/trusted_len.rs +++ b/crates/polars-arrow/src/trusted_len.rs @@ -1,4 +1,5 @@ //! Declares [`TrustedLen`]. +use std::iter::Scan; use std::slice::Iter; /// An iterator of known, fixed size. @@ -13,8 +14,6 @@ pub unsafe trait TrustedLen: Iterator {} unsafe impl TrustedLen for Iter<'_, T> {} -unsafe impl B> TrustedLen for std::iter::Map {} - unsafe impl<'a, I, T: 'a> TrustedLen for std::iter::Copied where I: TrustedLen, @@ -55,3 +54,69 @@ unsafe impl TrustedLen for std::vec::IntoIter {} unsafe impl TrustedLen for std::iter::Repeat {} unsafe impl A> TrustedLen for std::iter::RepeatWith {} unsafe impl TrustedLen for std::iter::Take {} + +unsafe impl TrustedLen for &mut dyn TrustedLen {} +unsafe impl TrustedLen for Box + '_> {} + +unsafe impl B> TrustedLen for std::iter::Map {} + +unsafe impl TrustedLen for std::iter::Rev {} + +unsafe impl, J> TrustedLen for TrustMyLength {} +unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} +unsafe impl TrustedLen for std::ops::RangeInclusive where std::ops::RangeInclusive: Iterator +{} +unsafe impl TrustedLen for std::iter::StepBy {} + +unsafe impl TrustedLen for Scan +where + F: FnMut(&mut St, I::Item) -> Option, + I: TrustedLen + Iterator, +{ +} + +unsafe impl TrustedLen for hashbrown::hash_map::IntoIter {} + +#[derive(Clone)] +pub struct TrustMyLength, J> { + iter: I, + len: usize, +} + +impl TrustMyLength +where + I: Iterator, +{ + #[inline] + pub fn new(iter: I, len: usize) -> Self { + Self { iter, len } + } +} + +impl Iterator for TrustMyLength +where + I: Iterator, +{ + type Item = J; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + fn size_hint(&self) -> (usize, Option) { + (self.len, Some(self.len)) + } +} + +impl ExactSizeIterator for TrustMyLength where I: Iterator {} + +impl DoubleEndedIterator for TrustMyLength +where + I: Iterator + DoubleEndedIterator, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.iter.next_back() + } +} diff --git a/crates/polars-arrow/tests/it/io/ipc/mod.rs b/crates/polars-arrow/tests/it/io/ipc/mod.rs index fe490da7886a..202eaf0cdfb2 100644 --- a/crates/polars-arrow/tests/it/io/ipc/mod.rs +++ b/crates/polars-arrow/tests/it/io/ipc/mod.rs @@ -73,7 +73,7 @@ fn write_sliced_utf8() -> PolarsResult<()> { #[test] fn write_binview() -> PolarsResult<()> { - let array = Utf8ViewArray::from([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); + let array = Utf8ViewArray::from_slice([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); let schema = prep_schema(array.as_ref()); let columns = Chunk::try_new(vec![array])?; round_trip(columns, schema, None, Some(Compression::ZSTD)) diff --git a/crates/polars-compute/src/comparisons/array.rs b/crates/polars-compute/src/comparisons/array.rs index 257ed902298d..f643cb0a9043 100644 --- a/crates/polars-compute/src/comparisons/array.rs +++ b/crates/polars-compute/src/comparisons/array.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BinaryArray, FixedSizeListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryViewArray, FixedSizeListArray, PrimitiveArray, Utf8ViewArray}; use arrow::bitmap::utils::count_zeros; use arrow::bitmap::Bitmap; use arrow::datatypes::ArrowDataType; @@ -55,8 +55,8 @@ macro_rules! compare { match lhs_type.data_type().to_physical_type() { // Boolean => call_binary!(BooleanArray, lhs, rhs, $op), Boolean => todo!(), - LargeUtf8 => call_binary!(Utf8Array, lv, rv, $op), - LargeBinary => call_binary!(BinaryArray, lv, rv, $op), + BinaryView => call_binary!(BinaryViewArray, lv, rv, $op), + Utf8View => call_binary!(Utf8ViewArray, lv, rv, $op), Primitive(Int8) => call_binary!(PrimitiveArray, lv, rv, $op), Primitive(Int16) => call_binary!(PrimitiveArray, lv, rv, $op), Primitive(Int32) => call_binary!(PrimitiveArray, lv, rv, $op), @@ -68,10 +68,7 @@ macro_rules! compare { Primitive(UInt64) => call_binary!(PrimitiveArray, lv, rv, $op), Primitive(Float32) => call_binary!(PrimitiveArray, lv, rv, $op), Primitive(Float64) => call_binary!(PrimitiveArray, lv, rv, $op), - _ => todo!( - "Comparison between {:?} are not yet supported", - lhs.data_type().to_physical_type() - ), + dt => todo!("Comparison of Arrays with {:?} are not yet supported", dt), } }}; } diff --git a/crates/polars-compute/src/min_max/scalar.rs b/crates/polars-compute/src/min_max/scalar.rs index 6eb03d18db37..32e630c02803 100644 --- a/crates/polars-compute/src/min_max/scalar.rs +++ b/crates/polars-compute/src/min_max/scalar.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BinaryArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryViewArray, PrimitiveArray, Utf8ViewArray}; use arrow::types::NativeType; use polars_utils::min_max::MinMax; @@ -56,7 +56,7 @@ impl MinMaxKernel for [T] { } } -impl MinMaxKernel for BinaryArray { +impl MinMaxKernel for BinaryViewArray { type Scalar<'a> = &'a [u8]; fn min_ignore_nan_kernel(&self) -> Option> { @@ -86,12 +86,12 @@ impl MinMaxKernel for BinaryArray { } } -impl MinMaxKernel for Utf8Array { +impl MinMaxKernel for Utf8ViewArray { type Scalar<'a> = &'a str; #[inline(always)] fn min_ignore_nan_kernel(&self) -> Option> { - self.to_binary().min_ignore_nan_kernel().map(|s| unsafe { + self.to_binview().min_ignore_nan_kernel().map(|s| unsafe { // SAFETY: the lifetime is the same, and it is valid UTF-8. #[allow(clippy::transmute_bytes_to_str)] std::mem::transmute::<&[u8], &str>(s) @@ -100,7 +100,7 @@ impl MinMaxKernel for Utf8Array { #[inline(always)] fn max_ignore_nan_kernel(&self) -> Option> { - self.to_binary().max_ignore_nan_kernel().map(|s| unsafe { + self.to_binview().max_ignore_nan_kernel().map(|s| unsafe { // SAFETY: the lifetime is the same, and it is valid UTF-8. #[allow(clippy::transmute_bytes_to_str)] std::mem::transmute::<&[u8], &str>(s) diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index f727d6c9b8a0..306fa2e6ba3a 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -64,6 +64,7 @@ macro_rules! native_array_arithmetics { native_array_arithmetics!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64); +#[inline] fn concat_binary_arrs(l: &[u8], r: &[u8], buf: &mut Vec) { buf.clear(); @@ -95,20 +96,18 @@ impl Add<&str> for &StringChunked { } } -fn concat_binary(a: &BinaryArray, b: &BinaryArray) -> BinaryArray { +fn concat_binview(a: &BinaryViewArray, b: &BinaryViewArray) -> BinaryViewArray { let validity = combine_validities_and(a.validity(), b.validity()); - let mut values = Vec::with_capacity(a.get_values_size() + b.get_values_size()); - let mut offsets = Vec::with_capacity(a.len() + 1); - let mut offset_so_far = 0i64; - offsets.push(offset_so_far); + let mut mutable = MutableBinaryViewArray::with_capacity(a.len()); + + let mut scratch = vec![]; for (a, b) in a.values_iter().zip(b.values_iter()) { - values.extend_from_slice(a); - values.extend_from_slice(b); - offset_so_far = values.len() as i64; - offsets.push(offset_so_far) + concat_binary_arrs(a, b, &mut scratch); + mutable.push_value(&scratch) } - unsafe { BinaryArray::from_data_unchecked_default(offsets.into(), values.into(), validity) } + + mutable.freeze().with_validity(validity) } impl Add for &BinaryChunked { @@ -148,7 +147,7 @@ impl Add for &BinaryChunked { }; } - arity::binary(self, rhs, concat_binary) + arity::binary(self, rhs, concat_binview) } } @@ -164,7 +163,7 @@ impl Add<&[u8]> for &BinaryChunked { type Output = BinaryChunked; fn add(self, rhs: &[u8]) -> Self::Output { - let arr = BinaryArray::::from_slice([rhs]); + let arr = BinaryViewArray::from_slice_values([rhs]); let rhs: BinaryChunked = arr.into(); self.add(&rhs) } diff --git a/crates/polars-core/src/chunked_array/builder/binary.rs b/crates/polars-core/src/chunked_array/builder/binary.rs deleted file mode 100644 index bed05a434ba1..000000000000 --- a/crates/polars-core/src/chunked_array/builder/binary.rs +++ /dev/null @@ -1,93 +0,0 @@ -use polars_error::constants::LENGTH_LIMIT_MSG; - -use super::*; - -pub struct BinaryChunkedBuilder { - pub(crate) builder: MutableBinaryArray, - pub capacity: usize, - field: Field, -} - -impl BinaryChunkedBuilder { - /// Create a new UtfChunkedBuilder - /// - /// # Arguments - /// - /// * `capacity` - Number of string elements in the final array. - /// * `bytes_capacity` - Number of bytes needed to store the string values. - pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self { - BinaryChunkedBuilder { - builder: MutableBinaryArray::::with_capacities(capacity, bytes_capacity), - capacity, - field: Field::new(name, DataType::Binary), - } - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value>(&mut self, v: S) { - self.builder.push(Some(v.as_ref())); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.builder.push::<&[u8]>(None); - } - - #[inline] - pub fn append_option>(&mut self, opt: Option) { - self.builder.push(opt); - } - - pub fn finish(mut self) -> BinaryChunked { - let arr = self.builder.as_box(); - let length = IdxSize::try_from(arr.len()).expect(LENGTH_LIMIT_MSG); - let null_count = arr.null_count() as IdxSize; - - ChunkedArray { - field: Arc::new(self.field), - chunks: vec![arr], - phantom: PhantomData, - bit_settings: Default::default(), - length, - null_count, - } - } - - fn shrink_to_fit(&mut self) { - self.builder.shrink_to_fit() - } -} - -pub struct BinaryChunkedBuilderCow { - builder: BinaryChunkedBuilder, -} - -impl BinaryChunkedBuilderCow { - pub fn new(name: &str, capacity: usize) -> Self { - BinaryChunkedBuilderCow { - builder: BinaryChunkedBuilder::new(name, capacity, capacity), - } - } -} - -impl ChunkedBuilder, BinaryType> for BinaryChunkedBuilderCow { - #[inline] - fn append_value(&mut self, val: Cow<'_, [u8]>) { - self.builder.append_value(val.as_ref()) - } - - #[inline] - fn append_null(&mut self) { - self.builder.append_null() - } - - fn finish(self) -> ChunkedArray { - self.builder.finish() - } - - fn shrink_to_fit(&mut self) { - self.builder.shrink_to_fit() - } -} diff --git a/crates/polars-core/src/chunked_array/builder/list/anonymous.rs b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs index 787309555dbf..1fb5393db1df 100644 --- a/crates/polars-core/src/chunked_array/builder/list/anonymous.rs +++ b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs @@ -161,7 +161,7 @@ impl ListBuilderTrait for AnonymousOwnedListBuilder { let arr = slf.builder.finish(inner_dtype_physical.as_ref()).unwrap(); let list_dtype_logical = match inner_dtype { - None => DataType::from(arr.data_type()), + None => DataType::from_arrow(arr.data_type(), false), Some(dt) => DataType::List(Box::new(dt)), }; diff --git a/crates/polars-core/src/chunked_array/builder/list/binary.rs b/crates/polars-core/src/chunked_array/builder/list/binary.rs index 02af7e2fe153..9c7f9ee6c872 100644 --- a/crates/polars-core/src/chunked_array/builder/list/binary.rs +++ b/crates/polars-core/src/chunked_array/builder/list/binary.rs @@ -1,15 +1,15 @@ use super::*; pub struct ListStringChunkedBuilder { - builder: LargeListUtf8Builder, + builder: LargeListBinViewBuilder, field: Field, fast_explode: bool, } impl ListStringChunkedBuilder { pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { - let values = MutableUtf8Array::::with_capacity(values_capacity); - let builder = LargeListUtf8Builder::new_with_capacity(values, capacity); + let values = MutableBinaryViewArray::with_capacity(values_capacity); + let builder = LargeListBinViewBuilder::new_with_capacity(values, capacity); let field = Field::new(name, DataType::List(Box::new(DataType::String))); ListStringChunkedBuilder { @@ -24,25 +24,21 @@ impl ListStringChunkedBuilder { &mut self, iter: I, ) { - let values = self.builder.mut_values(); - if iter.size_hint().0 == 0 { self.fast_explode = false; } // Safety // trusted len, trust the type system - unsafe { values.extend_trusted_len_unchecked(iter) }; + self.builder.mut_values().extend_trusted_len(iter); self.builder.try_push_valid().unwrap(); } #[inline] pub fn append_values_iter<'a, I: Iterator>(&mut self, iter: I) { - let values = self.builder.mut_values(); - if iter.size_hint().0 == 0 { self.fast_explode = false; } - values.extend_values(iter); + self.builder.mut_values().extend_values(iter); self.builder.try_push_valid().unwrap(); } @@ -51,8 +47,15 @@ impl ListStringChunkedBuilder { if ca.is_empty() { self.fast_explode = false; } - let value_builder = self.builder.mut_values(); - value_builder.try_extend(ca).unwrap(); + for arr in ca.downcast_iter() { + if arr.null_count() == 0 { + self.builder + .mut_values() + .extend_values(arr.non_null_values_iter()); + } else { + self.builder.mut_values().extend_trusted_len(arr.iter()) + } + } self.builder.try_push_valid().unwrap(); } } @@ -88,15 +91,15 @@ impl ListBuilderTrait for ListStringChunkedBuilder { } pub struct ListBinaryChunkedBuilder { - builder: LargeListBinaryBuilder, + builder: LargeListBinViewBuilder<[u8]>, field: Field, fast_explode: bool, } impl ListBinaryChunkedBuilder { pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { - let values = MutableBinaryArray::::with_capacity(values_capacity); - let builder = LargeListBinaryBuilder::new_with_capacity(values, capacity); + let values = MutablePlBinary::with_capacity(values_capacity); + let builder = LargeListBinViewBuilder::new_with_capacity(values, capacity); let field = Field::new(name, DataType::List(Box::new(DataType::Binary))); ListBinaryChunkedBuilder { @@ -110,30 +113,36 @@ impl ListBinaryChunkedBuilder { &mut self, iter: I, ) { - let values = self.builder.mut_values(); - if iter.size_hint().0 == 0 { self.fast_explode = false; } // Safety // trusted len, trust the type system - unsafe { values.extend_trusted_len_unchecked(iter) }; + self.builder.mut_values().extend_trusted_len(iter); self.builder.try_push_valid().unwrap(); } pub fn append_values_iter<'a, I: Iterator>(&mut self, iter: I) { - let values = self.builder.mut_values(); - if iter.size_hint().0 == 0 { self.fast_explode = false; } - values.extend_values(iter); + self.builder.mut_values().extend_values(iter); self.builder.try_push_valid().unwrap(); } pub(crate) fn append(&mut self, ca: &BinaryChunked) { - let value_builder = self.builder.mut_values(); - value_builder.try_extend(ca).unwrap(); + if ca.is_empty() { + self.fast_explode = false; + } + for arr in ca.downcast_iter() { + if arr.null_count() == 0 { + self.builder + .mut_values() + .extend_values(arr.non_null_values_iter()); + } else { + self.builder.mut_values().extend_trusted_len(arr.iter()) + } + } self.builder.try_push_valid().unwrap(); } } diff --git a/crates/polars-core/src/chunked_array/builder/list/mod.rs b/crates/polars-core/src/chunked_array/builder/list/mod.rs index 596834ae93db..9abcbcaa158a 100644 --- a/crates/polars-core/src/chunked_array/builder/list/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/list/mod.rs @@ -83,8 +83,7 @@ where } type LargePrimitiveBuilder = MutableListArray>; -type LargeListUtf8Builder = MutableListArray>; -type LargeListBinaryBuilder = MutableListArray>; +type LargeListBinViewBuilder = MutableListArray>; type LargeListBooleanBuilder = MutableListArray; type LargeListNullBuilder = MutableListArray; diff --git a/crates/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs index 270db2a8bf45..e31fa2968b7c 100644 --- a/crates/polars-core/src/chunked_array/builder/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/mod.rs @@ -1,4 +1,3 @@ -mod binary; mod boolean; #[cfg(feature = "dtype-array")] pub mod fixed_size_list; @@ -7,14 +6,12 @@ mod null; mod primitive; mod string; -use std::borrow::Cow; use std::iter::FromIterator; use std::marker::PhantomData; use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; -pub use binary::*; pub use boolean::*; #[cfg(feature = "dtype-array")] pub(crate) use fixed_size_list::*; @@ -128,37 +125,24 @@ where S: AsRef, { fn from_slice(name: &str, v: &[S]) -> Self { - let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len()); - let mut builder = MutableUtf8Array::::with_capacities(v.len(), values_size); - builder.extend_trusted_len_values(v.iter().map(|s| s.as_ref())); - let imm: Utf8Array = builder.into(); - ChunkedArray::with_chunk(name, imm) + let arr = Utf8ViewArray::from_slice_values(v); + ChunkedArray::with_chunk(name, arr) } fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { - let values_size = opt_v.iter().fold(0, |acc, s| match s { - Some(s) => acc + s.as_ref().len(), - None => acc, - }); - let mut builder = MutableUtf8Array::::with_capacities(opt_v.len(), values_size); - builder.extend_trusted_len(opt_v.iter().map(|s| s.as_ref())); - let imm: Utf8Array = builder.into(); - ChunkedArray::with_chunk(name, imm) + let arr = Utf8ViewArray::from_slice(opt_v); + ChunkedArray::with_chunk(name, arr) } fn from_iter_options(name: &str, it: impl Iterator>) -> Self { - let cap = get_iter_capacity(&it); - let mut builder = StringChunkedBuilder::new(name, cap, cap * 5); - it.for_each(|opt| builder.append_option(opt)); - builder.finish() + let arr = MutableBinaryViewArray::from_iterator(it).freeze(); + ChunkedArray::with_chunk(name, arr) } /// Create a new ChunkedArray from an iterator. fn from_iter_values(name: &str, it: impl Iterator) -> Self { - let cap = get_iter_capacity(&it); - let mut builder = StringChunkedBuilder::new(name, cap, cap * 5); - it.for_each(|v| builder.append_value(v)); - builder.finish() + let arr = MutableBinaryViewArray::from_values_iter(it).freeze(); + ChunkedArray::with_chunk(name, arr) } } @@ -167,37 +151,24 @@ where B: AsRef<[u8]>, { fn from_slice(name: &str, v: &[B]) -> Self { - let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len()); - let mut builder = MutableBinaryArray::::with_capacities(v.len(), values_size); - builder.extend_trusted_len_values(v.iter().map(|s| s.as_ref())); - let imm: BinaryArray = builder.into(); - ChunkedArray::with_chunk(name, imm) + let arr = BinaryViewArray::from_slice_values(v); + ChunkedArray::with_chunk(name, arr) } fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { - let values_size = opt_v.iter().fold(0, |acc, s| match s { - Some(s) => acc + s.as_ref().len(), - None => acc, - }); - let mut builder = MutableBinaryArray::::with_capacities(opt_v.len(), values_size); - builder.extend_trusted_len(opt_v.iter().map(|s| s.as_ref())); - let imm: BinaryArray = builder.into(); - ChunkedArray::with_chunk(name, imm) + let arr = BinaryViewArray::from_slice(opt_v); + ChunkedArray::with_chunk(name, arr) } fn from_iter_options(name: &str, it: impl Iterator>) -> Self { - let cap = get_iter_capacity(&it); - let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5); - it.for_each(|opt| builder.append_option(opt)); - builder.finish() + let arr = MutableBinaryViewArray::from_iterator(it).freeze(); + ChunkedArray::with_chunk(name, arr) } /// Create a new ChunkedArray from an iterator. fn from_iter_values(name: &str, it: impl Iterator) -> Self { - let cap = get_iter_capacity(&it); - let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5); - it.for_each(|v| builder.append_value(v)); - builder.finish() + let arr = MutableBinaryViewArray::from_values_iter(it).freeze(); + ChunkedArray::with_chunk(name, arr) } } diff --git a/crates/polars-core/src/chunked_array/builder/string.rs b/crates/polars-core/src/chunked_array/builder/string.rs index d8ef4a092359..0a927d2afd3e 100644 --- a/crates/polars-core/src/chunked_array/builder/string.rs +++ b/crates/polars-core/src/chunked_array/builder/string.rs @@ -1,49 +1,60 @@ use super::*; -#[derive(Clone)] -pub struct StringChunkedBuilder { - pub(crate) builder: MutableUtf8Array, - pub capacity: usize, - pub(crate) field: Field, +pub struct BinViewChunkedBuilder { + pub(crate) chunk_builder: MutableBinaryViewArray, + pub(crate) field: FieldRef, } -impl StringChunkedBuilder { +impl Clone for BinViewChunkedBuilder { + fn clone(&self) -> Self { + Self { + chunk_builder: self.chunk_builder.clone(), + field: self.field.clone(), + } + } +} + +pub type StringChunkedBuilder = BinViewChunkedBuilder; +pub type BinaryChunkedBuilder = BinViewChunkedBuilder<[u8]>; + +impl BinViewChunkedBuilder { /// Create a new StringChunkedBuilder /// /// # Arguments /// /// * `capacity` - Number of string elements in the final array. /// * `bytes_capacity` - Number of bytes needed to store the string values. - pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self { - StringChunkedBuilder { - builder: MutableUtf8Array::::with_capacities(capacity, bytes_capacity), - capacity, - field: Field::new(name, DataType::String), + pub fn new(name: &str, capacity: usize) -> Self { + Self { + chunk_builder: MutableBinaryViewArray::with_capacity(capacity), + field: Arc::new(Field::new(name, DataType::from(&T::DATA_TYPE))), } } /// Appends a value of type `T` into the builder #[inline] - pub fn append_value>(&mut self, v: S) { - self.builder.push(Some(v.as_ref())); + pub fn append_value>(&mut self, v: S) { + self.chunk_builder.push_value(v.as_ref()); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.builder.push::<&str>(None); + self.chunk_builder.push_null() } #[inline] - pub fn append_option>(&mut self, opt: Option) { - self.builder.push(opt); + pub fn append_option>(&mut self, opt: Option) { + self.chunk_builder.push(opt); } +} +impl StringChunkedBuilder { pub fn finish(mut self) -> StringChunked { - let arr = self.builder.as_box(); + let arr = self.chunk_builder.as_box(); let mut ca = ChunkedArray { - field: Arc::new(self.field), + field: self.field, chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), @@ -53,40 +64,20 @@ impl StringChunkedBuilder { ca.compute_len(); ca } - - fn shrink_to_fit(&mut self) { - self.builder.shrink_to_fit() - } -} - -pub struct StringChunkedBuilderCow { - builder: StringChunkedBuilder, -} - -impl StringChunkedBuilderCow { - pub fn new(name: &str, capacity: usize) -> Self { - StringChunkedBuilderCow { - builder: StringChunkedBuilder::new(name, capacity, capacity), - } - } } +impl BinaryChunkedBuilder { + pub fn finish(mut self) -> BinaryChunked { + let arr = self.chunk_builder.as_box(); -impl ChunkedBuilder, StringType> for StringChunkedBuilderCow { - #[inline] - fn append_value(&mut self, val: Cow<'_, str>) { - self.builder.append_value(val.as_ref()) - } - - #[inline] - fn append_null(&mut self) { - self.builder.append_null() - } - - fn finish(self) -> ChunkedArray { - self.builder.finish() - } - - fn shrink_to_fit(&mut self) { - self.builder.shrink_to_fit() + let mut ca = ChunkedArray { + field: self.field, + chunks: vec![arr], + phantom: PhantomData, + bit_settings: Default::default(), + length: 0, + null_count: 0, + }; + ca.compute_len(); + ca } } diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs index ecc8a6ae6ea4..16ccf8cff19d 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -227,7 +227,11 @@ impl ChunkCast for StringChunked { DataType::Decimal(precision, scale) => match (precision, scale) { (precision, Some(scale)) => { let chunks = self.downcast_iter().map(|arr| { - arrow::legacy::compute::cast::cast_utf8_to_decimal(arr, *precision, *scale) + arrow::compute::cast::binview_to_decimal( + &arr.to_binview(), + *precision, + *scale, + ) }); Ok(Int128Chunked::from_chunk_iter(self.name(), chunks) .into_decimal_unchecked(*precision, *scale) @@ -274,24 +278,13 @@ impl ChunkCast for StringChunked { } } -unsafe fn binary_to_utf8_unchecked(from: &BinaryArray) -> Utf8Array { - let values = from.values().clone(); - let offsets = from.offsets().clone(); - Utf8Array::::new_unchecked( - ArrowDataType::LargeUtf8, - offsets, - values, - from.validity().cloned(), - ) -} - impl BinaryChunked { /// # Safety /// String is not validated pub unsafe fn to_string(&self) -> StringChunked { let chunks = self .downcast_iter() - .map(|arr| Box::new(binary_to_utf8_unchecked(arr)) as ArrayRef) + .map(|arr| arr.to_utf8view_unchecked().boxed()) .collect(); let field = Arc::new(Field::new(self.name(), DataType::String)); StringChunked::from_chunks_and_metadata(chunks, field, self.bit_settings, true, true) @@ -302,12 +295,7 @@ impl StringChunked { pub fn as_binary(&self) -> BinaryChunked { let chunks = self .downcast_iter() - .map(|arr| { - Box::new(arrow::compute::cast::utf8_to_binary( - arr, - ArrowDataType::LargeBinary, - )) as ArrayRef - }) + .map(|arr| arr.to_binview().boxed()) .collect(); let field = Arc::new(Field::new(self.name(), DataType::Binary)); unsafe { @@ -333,6 +321,20 @@ impl ChunkCast for BinaryChunked { } } +impl ChunkCast for BinaryOffsetChunked { + fn cast(&self, data_type: &DataType) -> PolarsResult { + match data_type { + #[cfg(feature = "dtype-struct")] + DataType::Struct(fields) => cast_single_to_struct(self.name(), &self.chunks, fields), + _ => cast_impl(self.name(), &self.chunks, data_type), + } + } + + unsafe fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult { + self.cast(data_type) + } +} + fn boolean_to_string(ca: &BooleanChunked) -> StringChunked { ca.into_iter() .map(|opt_b| match opt_b { @@ -501,7 +503,7 @@ fn cast_list(ca: &ListChunked, child_type: &DataType) -> PolarsResult<(ArrayRef, new_values, arr.validity().cloned(), ); - Ok((Box::new(new_arr), inner_dtype)) + Ok((new_arr.boxed(), inner_dtype)) } unsafe fn cast_list_unchecked(ca: &ListChunked, child_type: &DataType) -> PolarsResult { diff --git a/crates/polars-core/src/chunked_array/collect.rs b/crates/polars-core/src/chunked_array/collect.rs index 076293a38440..2d0226029236 100644 --- a/crates/polars-core/src/chunked_array/collect.rs +++ b/crates/polars-core/src/chunked_array/collect.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use arrow::datatypes::ArrowDataType; -use arrow::legacy::trusted_len::TrustedLen; +use arrow::trusted_len::TrustedLen; use crate::chunked_array::ChunkedArray; use crate::datatypes::{ diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 24c1501929b8..5d2a335921e4 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -815,7 +815,8 @@ where debug_assert!(self.dtype() == other.dtype()); let ca_other = &*(ca_other as *const ChunkedArray); // Should be get and not get_unchecked, because there could be nulls - self.get(idx_self).tot_eq(&ca_other.get(idx_other)) + self.get_unchecked(idx_self) + .tot_eq(&ca_other.get_unchecked(idx_other)) } } @@ -824,7 +825,7 @@ impl ChunkEqualElement for BooleanChunked { let ca_other = other.as_ref().as_ref(); debug_assert!(self.dtype() == other.dtype()); let ca_other = &*(ca_other as *const BooleanChunked); - self.get(idx_self) == ca_other.get(idx_other) + self.get_unchecked(idx_self) == ca_other.get_unchecked(idx_other) } } @@ -833,7 +834,7 @@ impl ChunkEqualElement for StringChunked { let ca_other = other.as_ref().as_ref(); debug_assert!(self.dtype() == other.dtype()); let ca_other = &*(ca_other as *const StringChunked); - self.get(idx_self) == ca_other.get(idx_other) + self.get_unchecked(idx_self) == ca_other.get_unchecked(idx_other) } } @@ -842,7 +843,16 @@ impl ChunkEqualElement for BinaryChunked { let ca_other = other.as_ref().as_ref(); debug_assert!(self.dtype() == other.dtype()); let ca_other = &*(ca_other as *const BinaryChunked); - self.get(idx_self) == ca_other.get(idx_other) + self.get_unchecked(idx_self) == ca_other.get_unchecked(idx_other) + } +} + +impl ChunkEqualElement for BinaryOffsetChunked { + unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { + let ca_other = other.as_ref().as_ref(); + debug_assert!(self.dtype() == other.dtype()); + let ca_other = &*(ca_other as *const BinaryOffsetChunked); + self.get_unchecked(idx_self) == ca_other.get_unchecked(idx_other) } } diff --git a/crates/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs index d89948b01a3e..d9cc32d40b0b 100644 --- a/crates/polars-core/src/chunked_array/iterator/mod.rs +++ b/crates/polars-core/src/chunked_array/iterator/mod.rs @@ -1,5 +1,3 @@ -use std::convert::TryFrom; - use arrow::array::*; use crate::prelude::*; @@ -7,9 +5,6 @@ use crate::prelude::*; use crate::series::iterator::SeriesIter; use crate::utils::CustomIterTools; -type LargeUtf8Array = Utf8Array; -type LargeBinaryArray = BinaryArray; -type LargeListArray = ListArray; pub mod par; /// A [`PolarsIterator`] is an iterator over a [`ChunkedArray`] which contains polars types. A [`PolarsIterator`] @@ -132,58 +127,6 @@ impl<'a> IntoIterator for &'a StringChunked { } } -pub struct Utf8IterNoNull<'a> { - array: &'a LargeUtf8Array, - current: usize, - current_end: usize, -} - -impl<'a> Utf8IterNoNull<'a> { - /// create a new iterator - pub fn new(array: &'a LargeUtf8Array) -> Self { - Utf8IterNoNull { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a> Iterator for Utf8IterNoNull<'a> { - type Item = &'a str; - - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else { - let old = self.current; - self.current += 1; - unsafe { Some(self.array.value_unchecked(old)) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl<'a> DoubleEndedIterator for Utf8IterNoNull<'a> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - unsafe { Some(self.array.value_unchecked(self.current_end)) } - } - } -} - -/// all arrays have known size. -impl<'a> ExactSizeIterator for Utf8IterNoNull<'a> {} - impl StringChunked { #[allow(clippy::wrong_self_convention)] #[doc(hidden)] @@ -194,7 +137,7 @@ impl StringChunked { // we know that we only iterate over length == self.len() unsafe { self.downcast_iter() - .flat_map(Utf8IterNoNull::new) + .flat_map(|arr| arr.values_iter()) .trust_my_length(self.len()) } } @@ -209,59 +152,32 @@ impl<'a> IntoIterator for &'a BinaryChunked { } } -pub struct BinaryIterNoNull<'a> { - array: &'a LargeBinaryArray, - current: usize, - current_end: usize, -} - -impl<'a> BinaryIterNoNull<'a> { - /// create a new iterator - pub fn new(array: &'a LargeBinaryArray) -> Self { - BinaryIterNoNull { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a> Iterator for BinaryIterNoNull<'a> { - type Item = &'a [u8]; - - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else { - let old = self.current; - self.current += 1; - unsafe { Some(self.array.value_unchecked(old)) } +impl BinaryChunked { + #[allow(clippy::wrong_self_convention)] + #[doc(hidden)] + pub fn into_no_null_iter( + &self, + ) -> impl '_ + Send + Sync + ExactSizeIterator + DoubleEndedIterator + TrustedLen + { + // we know that we only iterate over length == self.len() + unsafe { + self.downcast_iter() + .flat_map(|arr| arr.values_iter()) + .trust_my_length(self.len()) } } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } } -impl<'a> DoubleEndedIterator for BinaryIterNoNull<'a> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - unsafe { Some(self.array.value_unchecked(self.current_end)) } - } +impl<'a> IntoIterator for &'a BinaryOffsetChunked { + type Item = Option<&'a [u8]>; + type IntoIter = Box + 'a>; + fn into_iter(self) -> Self::IntoIter { + // we know that we only iterate over length == self.len() + unsafe { Box::new(self.downcast_iter().flatten().trust_my_length(self.len())) } } } -/// all arrays have known size. -impl<'a> ExactSizeIterator for BinaryIterNoNull<'a> {} - -impl BinaryChunked { +impl BinaryOffsetChunked { #[allow(clippy::wrong_self_convention)] #[doc(hidden)] pub fn into_no_null_iter( @@ -271,7 +187,7 @@ impl BinaryChunked { // we know that we only iterate over length == self.len() unsafe { self.downcast_iter() - .flat_map(BinaryIterNoNull::new) + .flat_map(|arr| arr.values_iter()) .trust_my_length(self.len()) } } @@ -317,68 +233,6 @@ impl<'a> IntoIterator for &'a ListChunked { } } -pub struct ListIterNoNull<'a> { - array: &'a LargeListArray, - inner_type: DataType, - current: usize, - current_end: usize, -} - -impl<'a> ListIterNoNull<'a> { - /// create a new iterator - pub fn new(array: &'a LargeListArray, inner_type: DataType) -> Self { - ListIterNoNull { - array, - inner_type, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a> Iterator for ListIterNoNull<'a> { - type Item = Series; - - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else { - let old = self.current; - self.current += 1; - unsafe { - Some(Series::from_chunks_and_dtype_unchecked( - "", - vec![self.array.value_unchecked(old)], - &self.inner_type, - )) - } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl<'a> DoubleEndedIterator for ListIterNoNull<'a> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - unsafe { - Some(Series::try_from(("", self.array.value_unchecked(self.current_end))).unwrap()) - } - } - } -} - -/// all arrays have known size. -impl<'a> ExactSizeIterator for ListIterNoNull<'a> {} - impl ListChunked { #[allow(clippy::wrong_self_convention)] #[doc(hidden)] @@ -386,11 +240,11 @@ impl ListChunked { &self, ) -> impl '_ + Send + Sync + ExactSizeIterator + DoubleEndedIterator + TrustedLen { - // we know that we only iterate over length == self.len() let inner_type = self.inner_dtype(); unsafe { self.downcast_iter() - .flat_map(move |arr| ListIterNoNull::new(arr, inner_type.clone())) + .flat_map(|arr| arr.values_iter()) + .map(move |arr| Series::from_chunks_and_dtype_unchecked("", vec![arr], &inner_type)) .trust_my_length(self.len()) } } diff --git a/crates/polars-core/src/chunked_array/iterator/par/string.rs b/crates/polars-core/src/chunked_array/iterator/par/string.rs index f6cd068063cc..8480b0d32339 100644 --- a/crates/polars-core/src/chunked_array/iterator/par/string.rs +++ b/crates/polars-core/src/chunked_array/iterator/par/string.rs @@ -2,7 +2,8 @@ use rayon::prelude::*; use crate::prelude::*; -unsafe fn idx_to_str(idx: usize, arr: &Utf8Array) -> Option<&str> { +#[inline] +unsafe fn idx_to_str(idx: usize, arr: &Utf8ViewArray) -> Option<&str> { if arr.is_valid(idx) { Some(arr.value_unchecked(idx)) } else { @@ -17,7 +18,7 @@ impl StringChunked { // Safety: // guarded by the type system - let arr = unsafe { &*(arr as *const dyn Array as *const Utf8Array) }; + let arr = unsafe { &*(arr as *const dyn Array as *const Utf8ViewArray) }; (0..arr.len()) .into_par_iter() .map(move |idx| unsafe { idx_to_str(idx, arr) }) @@ -28,7 +29,7 @@ impl StringChunked { // Safety: // guarded by the type system let arr = &**arr; - let arr = unsafe { &*(arr as *const dyn Array as *const Utf8Array) }; + let arr = unsafe { &*(arr as *const dyn Array as *const Utf8ViewArray) }; (0..arr.len()) .into_par_iter() .map(move |idx| unsafe { idx_to_str(idx, arr) }) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/from.rs b/crates/polars-core/src/chunked_array/logical/categorical/from.rs index d3226f5dc16b..3bc3394aa2e2 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/from.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/from.rs @@ -1,6 +1,6 @@ use arrow::array::DictionaryArray; +use arrow::compute::cast::{cast, CastOptions}; use arrow::datatypes::IntegerType; -use arrow::legacy::compute::cast::cast; use super::*; @@ -55,7 +55,7 @@ impl From<&CategoricalChunked> for DictionaryArray { RevMapping::Local(arr, _) | RevMapping::Enum(arr, _) => unsafe { DictionaryArray::try_new_unchecked( dtype, - cast(keys, &ArrowDataType::Int64) + cast(keys, &ArrowDataType::Int64, CastOptions::unchecked()) .unwrap() .as_any() .downcast_ref::>() diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 17a0c9354dc8..9e7a2605a8ad 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -306,8 +306,7 @@ impl LogicalType for CategoricalChunked { DataType::String => { let mapping = &**self.get_rev_map(); - let mut builder = - StringChunkedBuilder::new(self.physical.name(), self.len(), self.len() * 5); + let mut builder = StringChunkedBuilder::new(self.physical.name(), self.len()); let f = |idx: u32| mapping.get(idx); diff --git a/crates/polars-core/src/chunked_array/logical/struct_/mod.rs b/crates/polars-core/src/chunked_array/logical/struct_/mod.rs index c308eb4d198b..b96c5c031036 100644 --- a/crates/polars-core/src/chunked_array/logical/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/struct_/mod.rs @@ -293,11 +293,11 @@ impl StructChunked { self.into() } - pub(crate) fn to_arrow(&self, i: usize) -> ArrayRef { + pub(crate) fn to_arrow(&self, i: usize, pl_flavor: bool) -> ArrayRef { let values = self .fields .iter() - .map(|s| s.to_arrow(i, true)) + .map(|s| s.to_arrow(i, pl_flavor)) .collect::>(); // we determine fields from arrays as there might be object arrays diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 43be8d5fa6c4..98e8c5c96b10 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -649,7 +649,7 @@ impl ValueSize for StringChunked { } } -impl ValueSize for BinaryChunked { +impl ValueSize for BinaryOffsetChunked { fn get_values_size(&self) -> usize { self.chunks .iter() @@ -847,7 +847,7 @@ pub(crate) mod test { #[test] #[ignore] fn test_shrink_to_fit() { - let mut builder = StringChunkedBuilder::new("foo", 2048, 100 * 2048); + let mut builder = StringChunkedBuilder::new("foo", 2048); builder.append_value("foo"); let mut arr = builder.finish(); let before = arr diff --git a/crates/polars-core/src/chunked_array/object/iterator.rs b/crates/polars-core/src/chunked_array/object/iterator.rs index 6d2b3731e8e5..5433f048be46 100644 --- a/crates/polars-core/src/chunked_array/object/iterator.rs +++ b/crates/polars-core/src/chunked_array/object/iterator.rs @@ -1,5 +1,5 @@ use arrow::array::Array; -use arrow::legacy::trusted_len::TrustedLen; +use arrow::trusted_len::TrustedLen; use crate::chunked_array::object::{ObjectArray, PolarsObject}; diff --git a/crates/polars-core/src/chunked_array/ops/any_value.rs b/crates/polars-core/src/chunked_array/ops/any_value.rs index 878ef327f75b..92732bb3422f 100644 --- a/crates/polars-core/src/chunked_array/ops/any_value.rs +++ b/crates/polars-core/src/chunked_array/ops/any_value.rs @@ -32,8 +32,8 @@ pub(crate) unsafe fn arr_to_any_value<'a>( }}; } match dtype { - DataType::String => downcast_and_pack!(LargeStringArray, String), - DataType::Binary => downcast_and_pack!(LargeBinaryArray, Binary), + DataType::String => downcast_and_pack!(Utf8ViewArray, String), + DataType::Binary => downcast_and_pack!(BinaryViewArray, Binary), DataType::Boolean => downcast_and_pack!(BooleanArray, Boolean), DataType::UInt8 => downcast_and_pack!(UInt8Array, UInt8), DataType::UInt16 => downcast_and_pack!(UInt16Array, UInt16), @@ -119,6 +119,7 @@ pub(crate) unsafe fn arr_to_any_value<'a>( PolarsExtension::arr_to_av(arr, idx) }, DataType::Null => AnyValue::Null, + DataType::BinaryOffset => downcast_and_pack!(LargeBinaryArray, Binary), dt => panic!("not implemented for {dt:?}"), } } @@ -243,6 +244,17 @@ impl ChunkAnyValue for BinaryChunked { } } +impl ChunkAnyValue for BinaryOffsetChunked { + #[inline] + unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { + get_any_value_unchecked!(self, index) + } + + fn get_any_value(&self, index: usize) -> PolarsResult { + get_any_value!(self, index) + } +} + impl ChunkAnyValue for ListChunked { #[inline] unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index ac255b42a3f9..b70aa05c8970 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -386,11 +386,9 @@ impl StringChunked { where F: FnMut(&'a str) -> &'a str, { - use arrow::legacy::array::utf8::Utf8FromIter; let chunks = self.downcast_iter().map(|arr| { let iter = arr.values_iter().map(&mut f); - let value_size = (arr.get_values_size() as f64 * 1.3) as usize; - let new = Utf8Array::::from_values_iter(iter, arr.len(), value_size); + let new = Utf8ViewArray::arr_from_iter(iter); new.with_validity(arr.validity().cloned()) }); StringChunked::from_chunk_iter(self.name(), chunks) @@ -417,11 +415,9 @@ impl BinaryChunked { where F: FnMut(&'a [u8]) -> &'a [u8], { - use arrow::legacy::array::utf8::BinaryFromIter; let chunks = self.downcast_iter().map(|arr| { let iter = arr.values_iter().map(&mut f); - let value_size = (arr.get_values_size() as f64 * 1.3) as usize; - let new = BinaryArray::::from_values_iter(iter, arr.len(), value_size); + let new = BinaryViewArray::arr_from_iter(iter); new.with_validity(arr.validity().cloned()) }); BinaryChunked::from_chunk_iter(self.name(), chunks) @@ -548,12 +544,12 @@ where } } -impl ChunkApplyKernel for StringChunked { - fn apply_kernel(&self, f: &dyn Fn(&LargeStringArray) -> ArrayRef) -> Self { +impl ChunkApplyKernel for StringChunked { + fn apply_kernel(&self, f: &dyn Fn(&Utf8ViewArray) -> ArrayRef) -> Self { self.apply_kernel_cast(&f) } - fn apply_kernel_cast(&self, f: &dyn Fn(&LargeStringArray) -> ArrayRef) -> ChunkedArray + fn apply_kernel_cast(&self, f: &dyn Fn(&Utf8ViewArray) -> ArrayRef) -> ChunkedArray where S: PolarsDataType, { @@ -562,12 +558,12 @@ impl ChunkApplyKernel for StringChunked { } } -impl ChunkApplyKernel for BinaryChunked { - fn apply_kernel(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> Self { +impl ChunkApplyKernel for BinaryChunked { + fn apply_kernel(&self, f: &dyn Fn(&BinaryViewArray) -> ArrayRef) -> Self { self.apply_kernel_cast(&f) } - fn apply_kernel_cast(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> ChunkedArray + fn apply_kernel_cast(&self, f: &dyn Fn(&BinaryViewArray) -> ArrayRef) -> ChunkedArray where S: PolarsDataType, { diff --git a/crates/polars-core/src/chunked_array/ops/explode.rs b/crates/polars-core/src/chunked_array/ops/explode.rs index bab873cfa5af..5e54a5fda1ad 100644 --- a/crates/polars-core/src/chunked_array/ops/explode.rs +++ b/crates/polars-core/src/chunked_array/ops/explode.rs @@ -349,8 +349,7 @@ impl ExplodeByOffsets for BinaryChunked { let arr = self.downcast_iter().next().unwrap(); let cap = get_capacity(offsets); - let bytes_size = self.get_values_size(); - let mut builder = BinaryChunkedBuilder::new(self.name(), cap, bytes_size); + let mut builder = BinaryChunkedBuilder::new(self.name(), cap); let mut start = offsets[0] as usize; let mut last = start; @@ -361,10 +360,10 @@ impl ExplodeByOffsets for BinaryChunked { let vals = arr.slice_typed(start, last - start); if vals.null_count() == 0 { builder - .builder + .chunk_builder .extend_trusted_len_values(vals.values_iter()) } else { - builder.builder.extend_trusted_len(vals.into_iter()); + builder.chunk_builder.extend_trusted_len(vals.into_iter()); } } builder.append_null(); @@ -375,10 +374,10 @@ impl ExplodeByOffsets for BinaryChunked { let vals = arr.slice_typed(start, last - start); if vals.null_count() == 0 { builder - .builder + .chunk_builder .extend_trusted_len_values(vals.values_iter()) } else { - builder.builder.extend_trusted_len(vals.into_iter()); + builder.chunk_builder.extend_trusted_len(vals.into_iter()); } builder.finish().into() } diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index 2f6ed6705205..d9fb61926610 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,4 +1,6 @@ use arrow::bitmap::MutableBitmap; +use arrow::compute::cast::utf8view_to_utf8; +use polars_utils::vec::PushUnchecked; use super::*; @@ -80,21 +82,63 @@ impl ChunkExplode for ListChunked { } } -impl ChunkExplode for StringChunked { +#[cfg(feature = "dtype-array")] +impl ChunkExplode for ArrayChunked { fn offsets(&self) -> PolarsResult> { - let ca = self.rechunk(); - let array: &Utf8Array = ca.downcast_iter().next().unwrap(); - let offsets = array.offsets().clone(); + let width = self.width() as i64; + let offsets = (0..self.len() + 1) + .map(|i| { + let i = i as i64; + i * width + }) + .collect::>(); + // safety: monotonically increasing + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) }; Ok(offsets) } + fn explode(&self) -> PolarsResult { + let ca = self.rechunk(); + let arr = ca.downcast_iter().next().unwrap(); + Ok(Series::try_from((self.name(), arr.values().clone())).unwrap()) + } + + fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { + let s = self.explode().unwrap(); + + Ok((s, self.offsets()?)) + } +} + +impl ChunkExplode for StringChunked { + fn offsets(&self) -> PolarsResult> { + let mut offsets = Vec::with_capacity(self.len() + 1); + let mut length_so_far = 0; + offsets.push(length_so_far); + + for arr in self.downcast_iter() { + for len in arr.len_iter() { + // SAFETY: + // pre-allocated + unsafe { offsets.push_unchecked(length_so_far) }; + length_so_far += len as i64; + } + } + + // SAFETY: + // Monotonically increasing. + unsafe { Ok(OffsetsBuffer::new_unchecked(offsets.into())) } + } + fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { // A list array's memory layout is actually already 'exploded', so we can just take the values array // of the list. And we also return a slice of the offsets. This slice can be used to find the old // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation let ca = self.rechunk(); - let array: &Utf8Array = ca.downcast_iter().next().unwrap(); + let array = ca.downcast_iter().next().unwrap(); + // TODO! maybe optimize for new utf8view? + let array = utf8view_to_utf8(array); let values = array.values(); let old_offsets = array.offsets().clone(); @@ -198,32 +242,3 @@ impl ChunkExplode for StringChunked { Ok((s, old_offsets)) } } - -#[cfg(feature = "dtype-array")] -impl ChunkExplode for ArrayChunked { - fn offsets(&self) -> PolarsResult> { - let width = self.width() as i64; - let offsets = (0..self.len() + 1) - .map(|i| { - let i = i as i64; - i * width - }) - .collect::>(); - // safety: monotonically increasing - let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) }; - - Ok(offsets) - } - - fn explode(&self) -> PolarsResult { - let ca = self.rechunk(); - let arr = ca.downcast_iter().next().unwrap(); - Ok(Series::try_from((self.name(), arr.values().clone())).unwrap()) - } - - fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { - let s = self.explode().unwrap(); - - Ok((s, self.offsets()?)) - } -} diff --git a/crates/polars-core/src/chunked_array/ops/extend.rs b/crates/polars-core/src/chunked_array/ops/extend.rs index 423174cc9d43..5a2b509a0c06 100644 --- a/crates/polars-core/src/chunked_array/ops/extend.rs +++ b/crates/polars-core/src/chunked_array/ops/extend.rs @@ -90,77 +90,24 @@ where #[doc(hidden)] impl StringChunked { pub fn extend(&mut self, other: &Self) { - update_sorted_flag_before_append::(self, other); - if self.chunks.len() > 1 { - self.append(other); - *self = self.rechunk(); - return; - } - let arr = self.downcast_iter().next().unwrap(); - - // increments 1 - let arr = arr.clone(); - - // now we drop our owned ArrayRefs so that - // decrements 1 - { - self.chunks.clear(); - } - - use Either::*; - - match arr.into_mut() { - Left(immutable) => { - extend_immutable(&immutable, &mut self.chunks, &other.chunks); - }, - Right(mut mutable) => { - for arr in other.downcast_iter() { - mutable.extend_trusted_len(arr.into_iter()) - } - let arr: Utf8Array = mutable.into(); - self.chunks.push(Box::new(arr) as ArrayRef) - }, - } - self.compute_len(); self.set_sorted_flag(IsSorted::Not); + self.append(other) } } #[doc(hidden)] impl BinaryChunked { pub fn extend(&mut self, other: &Self) { - update_sorted_flag_before_append::(self, other); - if self.chunks.len() > 1 { - self.append(other); - *self = self.rechunk(); - return; - } - let arr = self.downcast_iter().next().unwrap(); - - // increments 1 - let arr = arr.clone(); - - // now we drop our owned ArrayRefs so that - // decrements 1 - { - self.chunks.clear(); - } - - use Either::*; + self.set_sorted_flag(IsSorted::Not); + self.append(other) + } +} - match arr.into_mut() { - Left(immutable) => { - extend_immutable(&immutable, &mut self.chunks, &other.chunks); - }, - Right(mut mutable) => { - for arr in other.downcast_iter() { - mutable.extend_trusted_len(arr.into_iter()) - } - let arr: BinaryArray = mutable.into(); - self.chunks.push(Box::new(arr) as ArrayRef) - }, - } - self.compute_len(); +#[doc(hidden)] +impl BinaryOffsetChunked { + pub fn extend(&mut self, other: &Self) { + self.set_sorted_flag(IsSorted::Not); + self.append(other) } } diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index 920d71f6b885..82b476fcb54b 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -92,6 +92,28 @@ impl ChunkFilter for BinaryChunked { } } +impl ChunkFilter for BinaryOffsetChunked { + fn filter(&self, filter: &BooleanChunked) -> PolarsResult { + // Broadcast. + if filter.len() == 1 { + return match filter.get(0) { + Some(true) => Ok(self.clone()), + _ => Ok(BinaryOffsetChunked::full_null(self.name(), 0)), + }; + } + check_filter_len!(self, filter); + Ok(unsafe { + arity::binary_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) + } +} + impl ChunkFilter for ListChunked { fn filter(&self, filter: &BooleanChunked) -> PolarsResult { // Broadcast. diff --git a/crates/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs index 3d15e9f85488..de1c9d734dc2 100644 --- a/crates/polars-core/src/chunked_array/ops/full.rs +++ b/crates/polars-core/src/chunked_array/ops/full.rs @@ -46,11 +46,8 @@ impl ChunkFullNull for BooleanChunked { impl<'a> ChunkFull<&'a str> for StringChunked { fn full(name: &str, value: &'a str, length: usize) -> Self { - let mut builder = StringChunkedBuilder::new(name, length, length * value.len()); - - for _ in 0..length { - builder.append_value(value); - } + let mut builder = StringChunkedBuilder::new(name, length); + builder.chunk_builder.extend_constant(length, Some(value)); let mut out = builder.finish(); out.set_sorted_flag(IsSorted::Ascending); out @@ -59,18 +56,15 @@ impl<'a> ChunkFull<&'a str> for StringChunked { impl ChunkFullNull for StringChunked { fn full_null(name: &str, length: usize) -> Self { - let arr = Utf8Array::new_null(DataType::String.to_arrow(true), length); + let arr = Utf8ViewArray::new_null(DataType::String.to_arrow(true), length); ChunkedArray::with_chunk(name, arr) } } impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { fn full(name: &str, value: &'a [u8], length: usize) -> Self { - let mut builder = BinaryChunkedBuilder::new(name, length, length * value.len()); - - for _ in 0..length { - builder.append_value(value); - } + let mut builder = BinaryChunkedBuilder::new(name, length); + builder.chunk_builder.extend_constant(length, Some(value)); let mut out = builder.finish(); out.set_sorted_flag(IsSorted::Ascending); out @@ -79,7 +73,25 @@ impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { impl ChunkFullNull for BinaryChunked { fn full_null(name: &str, length: usize) -> Self { - let arr = BinaryArray::new_null(DataType::Binary.to_arrow(true), length); + let arr = BinaryViewArray::new_null(DataType::Binary.to_arrow(true), length); + ChunkedArray::with_chunk(name, arr) + } +} + +impl<'a> ChunkFull<&'a [u8]> for BinaryOffsetChunked { + fn full(name: &str, value: &'a [u8], length: usize) -> Self { + let mut mutable = MutableBinaryArray::with_capacities(length, length * value.len()); + mutable.extend_values(std::iter::repeat(value).take(length)); + let arr: BinaryArray = mutable.into(); + let mut out = ChunkedArray::with_chunk(name, arr); + out.set_sorted_flag(IsSorted::Ascending); + out + } +} + +impl ChunkFullNull for BinaryOffsetChunked { + fn full_null(name: &str, length: usize) -> Self { + let arr = BinaryArray::::new_null(DataType::BinaryOffset.to_arrow(true), length); ChunkedArray::with_chunk(name, arr) } } diff --git a/crates/polars-core/src/chunked_array/ops/gather.rs b/crates/polars-core/src/chunked_array/ops/gather.rs index 8fa0222b6f31..d017f4c45abe 100644 --- a/crates/polars-core/src/chunked_array/ops/gather.rs +++ b/crates/polars-core/src/chunked_array/ops/gather.rs @@ -1,5 +1,6 @@ use arrow::array::Array; use arrow::bitmap::bitmask::BitMask; +use arrow::legacy::compute::take::take_unchecked; use polars_error::{polars_bail, polars_ensure, PolarsResult}; use polars_utils::index::check_bounds; @@ -163,7 +164,30 @@ impl + ?Sized> ChunkTakeUnchecked for } } -impl ChunkTakeUnchecked for ChunkedArray { +trait NotSpecialized {} +impl NotSpecialized for Int8Type {} +impl NotSpecialized for Int16Type {} +impl NotSpecialized for Int32Type {} +impl NotSpecialized for Int64Type {} +#[cfg(feature = "dtype-decimal")] +impl NotSpecialized for Int128Type {} +impl NotSpecialized for UInt8Type {} +impl NotSpecialized for UInt16Type {} +impl NotSpecialized for UInt32Type {} +impl NotSpecialized for UInt64Type {} +impl NotSpecialized for Float32Type {} +impl NotSpecialized for Float64Type {} +impl NotSpecialized for BooleanType {} +impl NotSpecialized for ListType {} +#[cfg(feature = "dtype-array")] +impl NotSpecialized for FixedSizeListType {} +impl NotSpecialized for BinaryOffsetType {} +#[cfg(feature = "dtype-decimal")] +impl NotSpecialized for DecimalType {} +#[cfg(feature = "object")] +impl NotSpecialized for ObjectType {} + +impl ChunkTakeUnchecked for ChunkedArray { /// Gather values from ChunkedArray by index. unsafe fn take_unchecked(&self, indices: &IdxCa) -> Self { let rechunked; @@ -223,3 +247,37 @@ impl ChunkTakeUnchecked for ChunkedArray { out } } + +impl ChunkTakeUnchecked for BinaryChunked { + /// Gather values from ChunkedArray by index. + unsafe fn take_unchecked(&self, indices: &IdxCa) -> Self { + let rechunked = self.rechunk(); + let indices = indices.rechunk(); + let indices_arr = indices.downcast_iter().next().unwrap(); + let chunks = rechunked + .chunks() + .iter() + .map(|arr| take_unchecked(arr.as_ref(), indices_arr)) + .collect::>(); + + let mut out = ChunkedArray::from_chunks(self.name(), chunks); + + use crate::series::IsSorted::*; + let sorted_flag = match (self.is_sorted_flag(), indices.is_sorted_flag()) { + (_, Not) => Not, + (Not, _) => Not, + (Ascending, Ascending) => Ascending, + (Ascending, Descending) => Descending, + (Descending, Ascending) => Descending, + (Descending, Descending) => Ascending, + }; + out.set_sorted_flag(sorted_flag); + out + } +} + +impl ChunkTakeUnchecked for StringChunked { + unsafe fn take_unchecked(&self, indices: &IdxCa) -> Self { + self.as_binary().take_unchecked(indices).to_string() + } +} diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index f2d6e6da1cba..8070f30b6958 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -527,6 +527,14 @@ impl ChunkExpandAtIndex for BinaryChunked { } } +impl ChunkExpandAtIndex for BinaryOffsetChunked { + fn new_from_index(&self, index: usize, length: usize) -> BinaryOffsetChunked { + let mut out = impl_chunk_expand!(self, length, index); + out.set_sorted_flag(IsSorted::Ascending); + out + } +} + impl ChunkExpandAtIndex for ListChunked { fn new_from_index(&self, index: usize, length: usize) -> ListChunked { let opt_val = self.get_as_series(index); diff --git a/crates/polars-core/src/chunked_array/ops/reverse.rs b/crates/polars-core/src/chunked_array/ops/reverse.rs index 8446dc77b7c9..8e238c3a2e4f 100644 --- a/crates/polars-core/src/chunked_array/ops/reverse.rs +++ b/crates/polars-core/src/chunked_array/ops/reverse.rs @@ -42,6 +42,7 @@ macro_rules! impl_reverse { impl_reverse!(BooleanType, BooleanChunked); impl_reverse!(StringType, StringChunked); impl_reverse!(BinaryType, BinaryChunked); +impl_reverse!(BinaryOffsetType, BinaryOffsetChunked); impl_reverse!(ListType, ListChunked); #[cfg(feature = "dtype-array")] diff --git a/crates/polars-core/src/chunked_array/ops/set.rs b/crates/polars-core/src/chunked_array/ops/set.rs index 94cebae40721..0c9cdbd0f4aa 100644 --- a/crates/polars-core/src/chunked_array/ops/set.rs +++ b/crates/polars-core/src/chunked_array/ops/set.rs @@ -1,4 +1,3 @@ -use arrow::array::ValueSize; use arrow::bitmap::MutableBitmap; use arrow::legacy::kernels::set::{scatter_single_non_null, set_with_mask}; use arrow::legacy::prelude::FromData; @@ -184,8 +183,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for StringChunked { { let idx_iter = idx.into_iter(); let mut ca_iter = self.into_iter().enumerate(); - let mut builder = - StringChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + let mut builder = StringChunkedBuilder::new(self.name(), self.len()); for current_idx in idx_iter.into_iter().map(|i| i as usize) { polars_ensure!(current_idx < self.len(), oob = current_idx, self.len()); @@ -216,8 +214,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for StringChunked { Self: Sized, F: Fn(Option<&'a str>) -> Option, { - let mut builder = - StringChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + let mut builder = StringChunkedBuilder::new(self.name(), self.len()); impl_scatter_with!(self, builder, idx, f) } @@ -249,8 +246,7 @@ impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { Self: Sized, { let mut ca_iter = self.into_iter().enumerate(); - let mut builder = - BinaryChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + let mut builder = BinaryChunkedBuilder::new(self.name(), self.len()); for current_idx in idx.into_iter().map(|i| i as usize) { polars_ensure!(current_idx < self.len(), oob = current_idx, self.len()); @@ -281,8 +277,7 @@ impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { Self: Sized, F: Fn(Option<&'a [u8]>) -> Option>, { - let mut builder = - BinaryChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + let mut builder = BinaryChunkedBuilder::new(self.name(), self.len()); impl_scatter_with!(self, builder, idx, f) } diff --git a/crates/polars-core/src/chunked_array/ops/shift.rs b/crates/polars-core/src/chunked_array/ops/shift.rs index 87b3533acbbc..50e793cef9c4 100644 --- a/crates/polars-core/src/chunked_array/ops/shift.rs +++ b/crates/polars-core/src/chunked_array/ops/shift.rs @@ -76,6 +76,12 @@ impl ChunkShiftFill> for BinaryChunked { } } +impl ChunkShiftFill> for BinaryOffsetChunked { + fn shift_and_fill(&self, periods: i64, fill_value: Option<&[u8]>) -> BinaryOffsetChunked { + impl_shift_fill!(self, periods, fill_value) + } +} + impl ChunkShift for StringChunked { fn shift(&self, periods: i64) -> Self { self.shift_and_fill(periods, None) @@ -88,6 +94,12 @@ impl ChunkShift for BinaryChunked { } } +impl ChunkShift for BinaryOffsetChunked { + fn shift(&self, periods: i64) -> Self { + self.shift_and_fill(periods, None) + } +} + impl ChunkShiftFill> for ListChunked { fn shift_and_fill(&self, periods: i64, fill_value: Option<&Series>) -> ListChunked { // This has its own implementation because a ListChunked cannot have a full-null without diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs index dc67b3d0c7fe..66d21b2e9440 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs @@ -83,7 +83,7 @@ pub fn _get_rows_encoded_compat_array(by: &Series) -> PolarsResult { } #[cfg(feature = "dtype-struct")] -pub(crate) fn encode_rows_vertical(by: &[Series]) -> PolarsResult { +pub(crate) fn encode_rows_vertical(by: &[Series]) -> PolarsResult { let n_threads = POOL.current_num_threads(); let len = by[0].len(); let splits = _split_offsets(len, n_threads); @@ -101,7 +101,7 @@ pub(crate) fn encode_rows_vertical(by: &[Series]) -> PolarsResult }) .collect(); - Ok(BinaryChunked::from_chunk_iter("", chunks?)) + Ok(BinaryOffsetChunked::from_chunk_iter("", chunks?)) } pub fn _get_rows_encoded( @@ -142,9 +142,9 @@ pub fn _get_rows_encoded_ca( by: &[Series], descending: &[bool], nulls_last: bool, -) -> PolarsResult { +) -> PolarsResult { _get_rows_encoded(by, descending, nulls_last) - .map(|rows| BinaryChunked::with_chunk(name, rows.into_array())) + .map(|rows| BinaryOffsetChunked::with_chunk(name, rows.into_array())) } pub(crate) fn argsort_multiple_row_fmt( diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index 02ea83057ccc..8c9f53ccf77b 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -5,7 +5,6 @@ pub mod arg_sort_multiple; mod categorical; use std::cmp::Ordering; -use std::iter::FromIterator; pub(crate) use arg_sort_multiple::argsort_multiple_row_fmt; use arrow::array::ValueSize; @@ -323,11 +322,89 @@ impl ChunkSort for StringChunked { impl ChunkSort for BinaryChunked { fn sort_with(&self, options: SortOptions) -> ChunkedArray { sort_with_fast_path!(self, options); - let mut v: Vec<&[u8]> = if self.null_count() > 0 { - Vec::from_iter(self.into_iter().flatten()) + + let mut v: Vec<&[u8]> = Vec::with_capacity(self.len()); + for arr in self.downcast_iter() { + v.extend(arr.non_null_values_iter()); + } + sort_unstable_by_branch( + v.as_mut_slice(), + options.descending, + Ord::cmp, + options.multithreaded, + ); + + let len = self.len(); + let null_count = self.null_count(); + let mut mutable = MutableBinaryViewArray::with_capacity(len); + + if options.nulls_last { + for row in v { + mutable.push_value_ignore_validity(row) + } + mutable.extend_null(null_count); + } else { + mutable.extend_null(null_count); + for row in v { + mutable.push_value(row) + } + } + let mut ca = ChunkedArray::with_chunk(self.name(), mutable.into()); + + let s = if options.descending { + IsSorted::Descending } else { - Vec::from_iter(self.into_no_null_iter()) + IsSorted::Ascending }; + ca.set_sorted_flag(s); + ca + } + + fn sort(&self, descending: bool) -> ChunkedArray { + self.sort_with(SortOptions { + descending, + nulls_last: false, + multithreaded: true, + maintain_order: false, + }) + } + + fn arg_sort(&self, options: SortOptions) -> IdxCa { + arg_sort::arg_sort( + self.name(), + self.downcast_iter().map(|arr| arr.iter()), + options, + self.null_count(), + self.len(), + ) + } + + fn arg_sort_multiple(&self, options: &SortMultipleOptions) -> PolarsResult { + args_validate(self, &options.other, &options.descending)?; + + let mut count: IdxSize = 0; + + let mut vals = Vec::with_capacity(self.len()); + for arr in self.downcast_iter() { + for v in arr { + let i = count; + count += 1; + vals.push((i, v)) + } + } + + arg_sort_multiple_impl(vals, options) + } +} + +impl ChunkSort for BinaryOffsetChunked { + fn sort_with(&self, options: SortOptions) -> BinaryOffsetChunked { + sort_with_fast_path!(self, options); + + let mut v: Vec<&[u8]> = Vec::with_capacity(self.len()); + for arr in self.downcast_iter() { + v.extend(arr.non_null_values_iter()); + } sort_unstable_by_branch( v.as_mut_slice(), @@ -410,7 +487,7 @@ impl ChunkSort for BinaryChunked { ca } - fn sort(&self, descending: bool) -> BinaryChunked { + fn sort(&self, descending: bool) -> BinaryOffsetChunked { self.sort_with(SortOptions { descending, nulls_last: false, @@ -440,14 +517,16 @@ impl ChunkSort for BinaryChunked { args_validate(self, &options.other, &options.descending)?; let mut count: IdxSize = 0; - let vals: Vec<_> = self - .into_iter() - .map(|v| { + + let mut vals = Vec::with_capacity(self.len()); + for arr in self.downcast_iter() { + for v in arr { let i = count; count += 1; - (i, v) - }) - .collect_trusted(); + vals.push((i, v)) + } + } + arg_sort_multiple_impl(vals, options) } } diff --git a/crates/polars-core/src/chunked_array/ops/take/take_chunked.rs b/crates/polars-core/src/chunked_array/ops/take/take_chunked.rs index 55dfa7367fd2..e977b7b5f1c0 100644 --- a/crates/polars-core/src/chunked_array/ops/take/take_chunked.rs +++ b/crates/polars-core/src/chunked_array/ops/take/take_chunked.rs @@ -72,6 +72,38 @@ impl TakeChunked for StringChunked { } } +impl TakeChunked for BinaryOffsetChunked { + unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { + let arrs = self.downcast_iter().collect::>(); + let mut ca: Self = by + .iter() + .map(|[chunk_idx, array_idx]| { + let arr = arrs.get_unchecked(*chunk_idx as usize); + arr.get_unchecked(*array_idx as usize) + }) + .collect_trusted(); + ca.rename(self.name()); + ca.set_sorted_flag(sorted); + ca + } + + unsafe fn take_opt_chunked_unchecked(&self, by: &[Option]) -> Self { + let arrs = self.downcast_iter().collect::>(); + let mut ca: Self = by + .iter() + .map(|opt_idx| { + opt_idx.and_then(|[chunk_idx, array_idx]| { + let arr = arrs.get_unchecked(chunk_idx as usize); + arr.get_unchecked(array_idx as usize) + }) + }) + .collect_trusted(); + + ca.rename(self.name()); + ca + } +} + impl TakeChunked for BinaryChunked { unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { let arrs = self.downcast_iter().collect::>(); diff --git a/crates/polars-core/src/chunked_array/temporal/date.rs b/crates/polars-core/src/chunked_array/temporal/date.rs index c737cf1a02ab..7f6146fa921b 100644 --- a/crates/polars-core/src/chunked_array/temporal/date.rs +++ b/crates/polars-core/src/chunked_array/temporal/date.rs @@ -33,13 +33,9 @@ impl DateChunked { /// Convert from Date into String with the given format. /// See [chrono strftime/strptime](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html). pub fn to_string(&self, format: &str) -> StringChunked { - let date = NaiveDate::from_ymd_opt(2001, 1, 1).unwrap(); - let fmted = format!("{}", date.format(format)); - let mut ca: StringChunked = self.apply_kernel_cast(&|arr| { let mut buf = String::new(); - let mut mutarr = - MutableUtf8Array::with_capacities(arr.len(), arr.len() * fmted.len() + 1); + let mut mutarr = MutablePlString::with_capacity(arr.len()); for opt in arr.into_iter() { match opt { @@ -48,13 +44,12 @@ impl DateChunked { buf.clear(); let datefmt = date32_to_date(*v).format(format); write!(buf, "{datefmt}").unwrap(); - mutarr.push(Some(&buf)) + mutarr.push_value(&buf) }, } } - let arr: Utf8Array = mutarr.into(); - Box::new(arr) + mutarr.freeze().boxed() }); ca.rename(self.name()); ca diff --git a/crates/polars-core/src/chunked_array/temporal/datetime.rs b/crates/polars-core/src/chunked_array/temporal/datetime.rs index 23083ac56484..3fcea28a66ce 100644 --- a/crates/polars-core/src/chunked_array/temporal/datetime.rs +++ b/crates/polars-core/src/chunked_array/temporal/datetime.rs @@ -19,12 +19,11 @@ use crate::prelude::*; fn apply_datefmt_f<'a>( arr: &PrimitiveArray, - fmted: &'a str, conversion_f: fn(i64) -> NaiveDateTime, datefmt_f: impl Fn(NaiveDateTime) -> DelayedFormat>, ) -> ArrayRef { let mut buf = String::new(); - let mut mutarr = MutableUtf8Array::with_capacities(arr.len(), arr.len() * fmted.len() + 1); + let mut mutarr = MutableBinaryViewArray::::with_capacity(arr.len()); for opt in arr.into_iter() { match opt { None => mutarr.push_null(), @@ -33,12 +32,11 @@ fn apply_datefmt_f<'a>( let converted = conversion_f(*v); let datefmt = datefmt_f(converted); write!(buf, "{datefmt}").unwrap(); - mutarr.push(Some(&buf)) + mutarr.push_value(&buf) }, } } - let arr: Utf8Array = mutarr.into(); - Box::new(arr) + mutarr.freeze().boxed() } #[cfg(feature = "timezones")] @@ -46,20 +44,18 @@ fn format_tz( tz: Tz, arr: &PrimitiveArray, fmt: &str, - fmted: &str, conversion_f: fn(i64) -> NaiveDateTime, ) -> ArrayRef { let datefmt_f = |ndt| tz.from_utc_datetime(&ndt).format(fmt); - apply_datefmt_f(arr, fmted, conversion_f, datefmt_f) + apply_datefmt_f(arr, conversion_f, datefmt_f) } fn format_naive( arr: &PrimitiveArray, fmt: &str, - fmted: &str, conversion_f: fn(i64) -> NaiveDateTime, ) -> ArrayRef { let datefmt_f = |ndt: NaiveDateTime| ndt.format(fmt); - apply_datefmt_f(arr, fmted, conversion_f, datefmt_f) + apply_datefmt_f(arr, conversion_f, datefmt_f) } impl DatetimeChunked { @@ -121,20 +117,13 @@ impl DatetimeChunked { |_| polars_err!(ComputeError: "cannot format NaiveDateTime with format '{}'", format), )?, }; - let fmted = fmted; // discard mut let mut ca: StringChunked = match self.time_zone() { #[cfg(feature = "timezones")] Some(time_zone) => self.apply_kernel_cast(&|arr| { - format_tz( - time_zone.parse::().unwrap(), - arr, - format, - &fmted, - conversion_f, - ) + format_tz(time_zone.parse::().unwrap(), arr, format, conversion_f) }), - _ => self.apply_kernel_cast(&|arr| format_naive(arr, format, &fmted, conversion_f)), + _ => self.apply_kernel_cast(&|arr| format_naive(arr, format, conversion_f)), }; ca.rename(self.name()); Ok(ca) diff --git a/crates/polars-core/src/chunked_array/temporal/time.rs b/crates/polars-core/src/chunked_array/temporal/time.rs index 97d6bd52f875..3627189052a5 100644 --- a/crates/polars-core/src/chunked_array/temporal/time.rs +++ b/crates/polars-core/src/chunked_array/temporal/time.rs @@ -21,13 +21,9 @@ impl TimeChunked { /// Convert from Time into String with the given format. /// See [chrono strftime/strptime](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html). pub fn to_string(&self, format: &str) -> StringChunked { - let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - let fmted = format!("{}", time.format(format)); - let mut ca: StringChunked = self.apply_kernel_cast(&|arr| { let mut buf = String::new(); - let mut mutarr = - MutableUtf8Array::with_capacities(arr.len(), arr.len() * fmted.len() + 1); + let mut mutarr = MutablePlString::with_capacity(arr.len()); for opt in arr.into_iter() { match opt { @@ -36,13 +32,12 @@ impl TimeChunked { buf.clear(); let timefmt = time64ns_to_time(*v).format(format); write!(buf, "{timefmt}").unwrap(); - mutarr.push(Some(&buf)) + mutarr.push_value(&buf) }, } } - let arr: Utf8Array = mutarr.into(); - Box::new(arr) + mutarr.freeze().boxed() }); ca.rename(self.name()); diff --git a/crates/polars-core/src/chunked_array/trusted_len.rs b/crates/polars-core/src/chunked_array/trusted_len.rs index 69180a9b2e6b..a241e0432569 100644 --- a/crates/polars-core/src/chunked_array/trusted_len.rs +++ b/crates/polars-core/src/chunked_array/trusted_len.rs @@ -161,6 +161,27 @@ where } } +impl FromTrustedLenIterator for BinaryOffsetChunked +where + Ptr: PolarsAsRef<[u8]>, +{ + fn from_iter_trusted_length>(iter: I) -> Self { + let arr = BinaryArray::from_iter_values(iter.into_iter()); + ChunkedArray::with_chunk("", arr) + } +} + +impl FromTrustedLenIterator> for BinaryOffsetChunked +where + Ptr: AsRef<[u8]>, +{ + fn from_iter_trusted_length>>(iter: I) -> Self { + let iter = iter.into_iter(); + let arr = BinaryArray::from_iter(iter); + ChunkedArray::with_chunk("", arr) + } +} + #[cfg(feature = "object")] impl FromTrustedLenIterator> for ObjectChunked { fn from_iter_trusted_length>>(iter: I) -> Self { diff --git a/crates/polars-core/src/chunked_array/upstream_traits.rs b/crates/polars-core/src/chunked_array/upstream_traits.rs index 499c48c2777d..69fc84b847a0 100644 --- a/crates/polars-core/src/chunked_array/upstream_traits.rs +++ b/crates/polars-core/src/chunked_array/upstream_traits.rs @@ -5,7 +5,7 @@ use std::iter::FromIterator; use std::marker::PhantomData; use std::sync::Arc; -use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; +use arrow::array::{BooleanArray, PrimitiveArray}; use arrow::bitmap::{Bitmap, MutableBitmap}; use polars_utils::sync::SyncPtr; use rayon::iter::{FromParallelIterator, IntoParallelIterator}; @@ -89,7 +89,8 @@ where Ptr: AsRef, { fn from_iter>>(iter: I) -> Self { - Utf8Array::::from_iter(iter).into() + let arr = MutableBinaryViewArray::from_iterator(iter.into_iter()).freeze(); + ChunkedArray::with_chunk("", arr) } } @@ -100,14 +101,21 @@ impl PolarsAsRef for String {} impl PolarsAsRef for &str {} // &["foo", "bar"] impl PolarsAsRef for &&str {} + impl<'a> PolarsAsRef for Cow<'a, str> {} +impl PolarsAsRef<[u8]> for Vec {} +impl PolarsAsRef<[u8]> for &[u8] {} +// TODO: remove! +impl PolarsAsRef<[u8]> for &&[u8] {} +impl<'a> PolarsAsRef<[u8]> for Cow<'a, [u8]> {} impl FromIterator for StringChunked where Ptr: PolarsAsRef, { fn from_iter>(iter: I) -> Self { - Utf8Array::::from_iter_values(iter.into_iter()).into() + let arr = MutableBinaryViewArray::from_values_iter(iter.into_iter()).freeze(); + ChunkedArray::with_chunk("", arr) } } @@ -117,25 +125,18 @@ where Ptr: AsRef<[u8]>, { fn from_iter>>(iter: I) -> Self { - BinaryArray::::from_iter(iter).into() + let arr = MutableBinaryViewArray::from_iter(iter).freeze(); + ChunkedArray::with_chunk("", arr) } } -impl PolarsAsRef<[u8]> for Vec {} - -impl PolarsAsRef<[u8]> for &[u8] {} - -// TODO: remove! -impl PolarsAsRef<[u8]> for &&[u8] {} - -impl<'a> PolarsAsRef<[u8]> for Cow<'a, [u8]> {} - impl FromIterator for BinaryChunked where Ptr: PolarsAsRef<[u8]>, { fn from_iter>(iter: I) -> Self { - BinaryArray::::from_iter_values(iter.into_iter()).into() + let arr = MutableBinaryViewArray::from_values_iter(iter.into_iter()).freeze(); + ChunkedArray::with_chunk("", arr) } } @@ -515,14 +516,34 @@ where fn from_par_iter>(iter: I) -> Self { let vectors = collect_into_linked_list(iter); let cap = get_capacity_from_par_results(&vectors); - let mut builder = MutableUtf8ValuesArray::with_capacities(cap, cap * 10); + + let mut builder = MutableBinaryViewArray::with_capacity(cap); + // TODO! we can do this in parallel ind just combine the buffers. for vec in vectors { for val in vec { - builder.push(val.as_ref()) + builder.push_value_ignore_validity(val.as_ref()) } } - let arr: LargeStringArray = builder.into(); - arr.into() + ChunkedArray::with_chunk("", builder.freeze()) + } +} + +impl FromParallelIterator for BinaryChunked +where + Ptr: PolarsAsRef<[u8]> + Send + Sync, +{ + fn from_par_iter>(iter: I) -> Self { + let vectors = collect_into_linked_list(iter); + let cap = get_capacity_from_par_results(&vectors); + + let mut builder = MutableBinaryViewArray::with_capacity(cap); + // TODO! we can do this in parallel ind just combine the buffers. + for vec in vectors { + for val in vec { + builder.push_value_ignore_validity(val.as_ref()) + } + } + ChunkedArray::with_chunk("", builder.freeze()) } } @@ -538,61 +559,53 @@ where .into_par_iter() .map(|vector| { let cap = vector.len(); - let mut builder = MutableUtf8Array::with_capacities(cap, cap * 10); + let mut mutable = MutableBinaryViewArray::with_capacity(cap); for opt_val in vector { - builder.push(opt_val) + mutable.push(opt_val) } - let arr: LargeStringArray = builder.into(); - arr + mutable.freeze() }) .collect::>(); - let mut len = 0; - let mut thread_offsets = Vec::with_capacity(arrays.len()); - let values = arrays + // TODO! + // do this in parallel. + let arrays = arrays .iter() - .map(|arr| { - thread_offsets.push(len); - len += arr.len(); - arr.values().as_slice() + .map(|arr| arr as &dyn Array) + .collect::>(); + let arr = arrow::compute::concatenate::concatenate(&arrays).unwrap(); + unsafe { StringChunked::from_chunks("", vec![arr]) } + } +} + +impl FromParallelIterator> for BinaryChunked +where + Ptr: AsRef<[u8]> + Send + Sync, +{ + fn from_par_iter>>(iter: I) -> Self { + let vectors = collect_into_linked_list(iter); + let vectors = vectors.into_iter().collect::>(); + + let arrays = vectors + .into_par_iter() + .map(|vector| { + let cap = vector.len(); + let mut mutable = MutableBinaryViewArray::with_capacity(cap); + for opt_val in vector { + mutable.push(opt_val) + } + mutable.freeze() }) .collect::>(); - let values = flatten_par(&values); - - let validity = finish_validities( - arrays - .iter() - .map(|arr| { - let local_len = arr.len(); - (arr.validity().cloned(), local_len) - }) - .collect(), - len, - ); - - // Concat the offsets. - // This is single threaded as the values depend on previous ones - // if this proves to slow we could try parallel reduce. - let mut offsets = Vec::with_capacity(len + 1); - let mut offsets_so_far = 0; - let mut first = true; - for array in &arrays { - let local_offsets = array.offsets().as_slice(); - if first { - offsets.extend_from_slice(local_offsets); - first = false; - } else { - // SAFETY: there is always a single offset. - let skip_first = unsafe { local_offsets.get_unchecked(1..) }; - offsets.extend(skip_first.iter().map(|v| *v + offsets_so_far)); - } - offsets_so_far = unsafe { *offsets.last().unwrap_unchecked() }; - } - let arr = unsafe { - Utf8Array::::from_data_unchecked_default(offsets.into(), values.into(), validity) - }; - arr.into() + // TODO! + // do this in parallel. + let arrays = arrays + .iter() + .map(|arr| arr as &dyn Array) + .collect::>(); + let arr = arrow::compute::concatenate::concatenate(&arrays).unwrap(); + unsafe { BinaryChunked::from_chunks("", vec![arr]) } } } diff --git a/crates/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs index 0b643a908273..8ac4de770d3f 100644 --- a/crates/polars-core/src/datatypes/dtype.rs +++ b/crates/polars-core/src/datatypes/dtype.rs @@ -24,6 +24,7 @@ pub enum DataType { /// String data String, Binary, + BinaryOffset, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). Date, @@ -190,6 +191,10 @@ impl DataType { matches!(self, DataType::Boolean) } + pub fn is_binary(&self) -> bool { + matches!(self, DataType::Binary) + } + /// Check if type is sortable pub fn is_ord(&self) -> bool { #[cfg(feature = "dtype-categorical")] @@ -265,7 +270,7 @@ impl DataType { } #[inline] - pub fn try_to_arrow(&self, _pl_flavor: bool) -> PolarsResult { + pub fn try_to_arrow(&self, pl_flavor: bool) -> PolarsResult { use DataType::*; match self { Boolean => Ok(ArrowDataType::Boolean), @@ -286,10 +291,21 @@ impl DataType { scale.unwrap_or(0), // and what else can we do here? )), String => { - // TODO! implement pl_flavor - Ok(ArrowDataType::LargeUtf8) + let dt = if pl_flavor { + ArrowDataType::Utf8View + } else { + ArrowDataType::LargeUtf8 + }; + Ok(dt) + }, + Binary => { + let dt = if pl_flavor { + ArrowDataType::BinaryView + } else { + ArrowDataType::LargeBinary + }; + Ok(dt) }, - Binary => Ok(ArrowDataType::LargeBinary), Date => Ok(ArrowDataType::Date32), Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(unit.to_arrow(), tz.clone())), Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())), @@ -298,13 +314,13 @@ impl DataType { Array(dt, size) => Ok(ArrowDataType::FixedSizeList( Box::new(arrow::datatypes::Field::new( "item", - dt.try_to_arrow(true)?, + dt.try_to_arrow(pl_flavor)?, true, )), *size, )), List(dt) => Ok(ArrowDataType::LargeList(Box::new( - arrow::datatypes::Field::new("item", dt.to_arrow(true), true), + arrow::datatypes::Field::new("item", dt.to_arrow(pl_flavor), true), ))), Null => Ok(ArrowDataType::Null), #[cfg(feature = "object")] @@ -319,9 +335,10 @@ impl DataType { )), #[cfg(feature = "dtype-struct")] Struct(fields) => { - let fields = fields.iter().map(|fld| fld.to_arrow(true)).collect(); + let fields = fields.iter().map(|fld| fld.to_arrow(pl_flavor)).collect(); Ok(ArrowDataType::Struct(fields)) }, + BinaryOffset => Ok(ArrowDataType::LargeBinary), Unknown => { polars_bail!(InvalidOperation: "cannot convert Unknown dtype data to Arrow") }, @@ -397,6 +414,7 @@ impl Display for DataType { #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()), DataType::Unknown => "unknown", + DataType::BinaryOffset => "binary[offset]", }; f.write_str(s) } diff --git a/crates/polars-core/src/datatypes/field.rs b/crates/polars-core/src/datatypes/field.rs index b6df989efb39..1a8e4574076b 100644 --- a/crates/polars-core/src/datatypes/field.rs +++ b/crates/polars-core/src/datatypes/field.rs @@ -13,6 +13,8 @@ pub struct Field { pub dtype: DataType, } +pub type FieldRef = Arc; + impl Field { /// Creates a new `Field`. /// @@ -112,8 +114,12 @@ impl Field { } } -impl From<&ArrowDataType> for DataType { - fn from(dt: &ArrowDataType) -> Self { +impl DataType { + pub fn boxed(self) -> Box { + Box::new(self) + } + + pub fn from_arrow(dt: &ArrowDataType, bin_to_view: bool) -> DataType { match dt { ArrowDataType::Null => DataType::Null, ArrowDataType::UInt8 => DataType::UInt8, @@ -128,14 +134,12 @@ impl From<&ArrowDataType> for DataType { ArrowDataType::Float32 => DataType::Float32, ArrowDataType::Float64 => DataType::Float64, #[cfg(feature = "dtype-array")] - ArrowDataType::FixedSizeList(f, size) => DataType::Array(Box::new(f.data_type().into()), *size), - ArrowDataType::LargeList(f) | ArrowDataType::List(f) => DataType::List(Box::new(f.data_type().into())), + ArrowDataType::FixedSizeList(f, size) => DataType::Array(DataType::from_arrow(f.data_type(), bin_to_view).boxed(), *size), + ArrowDataType::LargeList(f) | ArrowDataType::List(f) => DataType::List(DataType::from_arrow(f.data_type(), bin_to_view).boxed()), ArrowDataType::Date32 => DataType::Date, ArrowDataType::Timestamp(tu, tz) => DataType::Datetime(tu.into(), tz.clone()), ArrowDataType::Duration(tu) => DataType::Duration(tu.into()), ArrowDataType::Date64 => DataType::Datetime(TimeUnit::Milliseconds, None), - ArrowDataType::LargeUtf8 | ArrowDataType::Utf8 => DataType::String, - ArrowDataType::LargeBinary | ArrowDataType::Binary => DataType::Binary, ArrowDataType::Time64(_) | ArrowDataType::Time32(_) => DataType::Time, #[cfg(feature = "dtype-categorical")] ArrowDataType::Dictionary(_, _, _) => DataType::Categorical(None,Default::default()), @@ -155,11 +159,27 @@ impl From<&ArrowDataType> for DataType { } #[cfg(feature = "dtype-decimal")] ArrowDataType::Decimal(precision, scale) => DataType::Decimal(Some(*precision), Some(*scale)), + ArrowDataType::Utf8View |ArrowDataType::LargeUtf8 | ArrowDataType::Utf8 => DataType::String, + ArrowDataType::BinaryView => DataType::Binary, + ArrowDataType::LargeBinary | ArrowDataType::Binary => { + if bin_to_view { + DataType::Binary + } else { + + DataType::BinaryOffset + } + }, dt => panic!("Arrow datatype {dt:?} not supported by Polars. You probably need to activate that data-type feature."), } } } +impl From<&ArrowDataType> for DataType { + fn from(dt: &ArrowDataType) -> Self { + Self::from_arrow(dt, true) + } +} + impl From<&ArrowField> for Field { fn from(f: &ArrowField) -> Self { Field::new(&f.name, f.data_type().into()) diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index ac1c2ca5416c..5fc4b4427d2b 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -152,8 +152,9 @@ impl_polars_datatype!(DatetimeType, Unknown, PrimitiveArray, 'a, i64, i64); impl_polars_datatype!(DurationType, Unknown, PrimitiveArray, 'a, i64, i64); impl_polars_datatype!(CategoricalType, Unknown, PrimitiveArray, 'a, u32, u32); impl_polars_datatype!(TimeType, Time, PrimitiveArray, 'a, i64, i64); -impl_polars_datatype!(StringType, String, Utf8Array, 'a, &'a str, Option<&'a str>); -impl_polars_datatype!(BinaryType, Binary, BinaryArray, 'a, &'a [u8], Option<&'a [u8]>); +impl_polars_datatype!(StringType, String, Utf8ViewArray, 'a, &'a str, Option<&'a str>); +impl_polars_datatype!(BinaryType, Binary, BinaryViewArray, 'a, &'a [u8], Option<&'a [u8]>); +impl_polars_datatype!(BinaryOffsetType, BinaryOffset, BinaryArray, 'a, &'a [u8], Option<&'a [u8]>); impl_polars_datatype!(BooleanType, Boolean, BooleanArray, 'a, bool, bool); #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -236,6 +237,7 @@ pub type Float32Chunked = ChunkedArray; pub type Float64Chunked = ChunkedArray; pub type StringChunked = ChunkedArray; pub type BinaryChunked = ChunkedArray; +pub type BinaryOffsetChunked = ChunkedArray; #[cfg(feature = "object")] pub type ObjectChunked = ChunkedArray>; diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index f32bb3dcb8c4..2bd70c3659fd 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -363,6 +363,15 @@ impl Debug for Series { DataType::Binary => { format_array!(f, self.binary().unwrap(), "binary", self.name(), "Series") }, + DataType::BinaryOffset => { + format_array!( + f, + self.binary_offset().unwrap(), + "binary[offset]", + self.name(), + "Series" + ) + }, dt => panic!("{dt:?} not impl"), } } @@ -974,7 +983,14 @@ impl Display for AnyValue<'_> { AnyValue::Boolean(v) => write!(f, "{}", *v), AnyValue::String(v) => write!(f, "{}", format_args!("\"{v}\"")), AnyValue::StringOwned(v) => write!(f, "{}", format_args!("\"{v}\"")), - AnyValue::Binary(_) | AnyValue::BinaryOwned(_) => write!(f, "[binary data]"), + AnyValue::Binary(d) => { + let s = String::from_utf8_lossy(d); + write!(f, "{}", format_args!("b\"{s}\"")) + }, + AnyValue::BinaryOwned(d) => { + let s = String::from_utf8_lossy(d); + write!(f, "{}", format_args!("b\"{s}\"")) + }, #[cfg(feature = "dtype-date")] AnyValue::Date(v) => write!(f, "{}", date32_to_date(*v)), #[cfg(feature = "dtype-datetime")] diff --git a/crates/polars-core/src/frame/chunks.rs b/crates/polars-core/src/frame/chunks.rs index c75b33445bc2..4fb417ecab75 100644 --- a/crates/polars-core/src/frame/chunks.rs +++ b/crates/polars-core/src/frame/chunks.rs @@ -13,7 +13,7 @@ impl std::convert::TryFrom<(ArrowChunk, &[ArrowField])> for DataFrame { .columns() .iter() .zip(arg.1) - .map(|(arr, field)| Series::try_from((field.name.as_ref(), arr.clone()))) + .map(|(arr, field)| Series::try_from((field, arr.clone()))) .collect(); DataFrame::new(columns?) diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs index d5b0f05a31f6..b75e1efbe33a 100644 --- a/crates/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -14,7 +14,6 @@ use crate::POOL; fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer)> { match series.dtype() { DataType::List(_) => series.list().unwrap().explode_and_offsets(), - DataType::String => series.str().unwrap().explode_and_offsets(), #[cfg(feature = "dtype-array")] DataType::Array(_, _) => series.array().unwrap().explode_and_offsets(), _ => polars_bail!(opq = explode, series.dtype()), @@ -293,13 +292,9 @@ impl DataFrame { st = try_get_supertype(&st, dt?)?; } - let values_len = value_vars.iter().map(|name| name.len()).sum::(); - // The column name of the variable that is melted - let mut variable_col = MutableUtf8Array::::with_capacities( - len * value_vars.len() + 1, - len * values_len + 1, - ); + let mut variable_col = + MutableBinaryViewArray::::with_capacity(len * value_vars.len() + 1); // prepare ids let ids_ = self.select_with_schema_unchecked(id_vars, &schema)?; let mut ids = ids_.clone(); @@ -314,7 +309,7 @@ impl DataFrame { let mut values = Vec::with_capacity(value_vars.len()); for value_column_name in &value_vars { - variable_col.extend_trusted_len_values(std::iter::repeat(value_column_name).take(len)); + variable_col.extend_constant(len, Some(value_column_name.as_str())); // ensure we go via the schema so we are O(1) // self.column() is linear // together with this loop that would make it O^2 over value_vars @@ -370,16 +365,6 @@ mod test { exploded.column("foo").unwrap().i8().unwrap().get(8), Some(2) ); - - let str = Series::new("foo", &["abc", "de", "fg"]); - let df = DataFrame::new(vec![str, s0, s1]).unwrap(); - let exploded = df.explode(["foo"]).unwrap(); - assert_eq!(exploded.column("C").unwrap().i32().unwrap().get(6), Some(1)); - assert_eq!(exploded.column("B").unwrap().i32().unwrap().get(6), Some(3)); - assert_eq!( - exploded.column("foo").unwrap().str().unwrap().get(6), - Some("g") - ); } #[test] diff --git a/crates/polars-core/src/frame/group_by/aggregations/string.rs b/crates/polars-core/src/frame/group_by/aggregations/string.rs index f5a02989cae1..889217addd90 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/string.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/string.rs @@ -1,22 +1,22 @@ use super::*; -pub fn _agg_helper_idx_utf8<'a, F>(groups: &'a GroupsIdx, f: F) -> Series +pub fn _agg_helper_idx_bin<'a, F>(groups: &'a GroupsIdx, f: F) -> Series where - F: Fn((IdxSize, &'a IdxVec)) -> Option<&'a str> + Send + Sync, + F: Fn((IdxSize, &'a IdxVec)) -> Option<&'a [u8]> + Send + Sync, { - let ca: StringChunked = POOL.install(|| groups.into_par_iter().map(f).collect()); + let ca: BinaryChunked = POOL.install(|| groups.into_par_iter().map(f).collect()); ca.into_series() } -pub fn _agg_helper_slice_utf8<'a, F>(groups: &'a [[IdxSize; 2]], f: F) -> Series +pub fn _agg_helper_slice_bin<'a, F>(groups: &'a [[IdxSize; 2]], f: F) -> Series where - F: Fn([IdxSize; 2]) -> Option<&'a str> + Send + Sync, + F: Fn([IdxSize; 2]) -> Option<&'a [u8]> + Send + Sync, { - let ca: StringChunked = POOL.install(|| groups.par_iter().copied().map(f).collect()); + let ca: BinaryChunked = POOL.install(|| groups.par_iter().copied().map(f).collect()); ca.into_series() } -impl StringChunked { +impl BinaryChunked { #[allow(clippy::needless_lifetimes)] pub(crate) unsafe fn agg_min<'a>(&'a self, groups: &GroupsProxy) -> Series { // faster paths @@ -35,20 +35,20 @@ impl StringChunked { let ca_self = self.rechunk(); let arr = ca_self.downcast_iter().next().unwrap(); let no_nulls = arr.null_count() == 0; - _agg_helper_idx_utf8(groups, |(first, idx)| { + _agg_helper_idx_bin(groups, |(first, idx)| { debug_assert!(idx.len() <= ca_self.len()); if idx.is_empty() { None } else if idx.len() == 1 { arr.get_unchecked(first as usize) } else if no_nulls { - take_agg_utf8_iter_unchecked_no_null( + take_agg_bin_iter_unchecked_no_null( arr, indexes_to_usizes(idx), |acc, v| if acc < v { acc } else { v }, ) } else { - take_agg_utf8_iter_unchecked( + take_agg_bin_iter_unchecked( arr, indexes_to_usizes(idx), |acc, v| if acc < v { acc } else { v }, @@ -60,19 +60,19 @@ impl StringChunked { GroupsProxy::Slice { groups: groups_slice, .. - } => _agg_helper_slice_utf8(groups_slice, |[first, len]| { + } => _agg_helper_slice_bin(groups_slice, |[first, len]| { debug_assert!(len <= self.len() as IdxSize); match len { 0 => None, 1 => self.get(first as usize), _ => { let arr_group = _slice_from_offsets(self, first, len); - let borrowed = arr_group.min_str(); + let borrowed = arr_group.min_binary(); // Safety: // The borrowed has `arr_group`s lifetime, but it actually points to data // hold by self. Here we tell the compiler that. - unsafe { std::mem::transmute::, Option<&'a str>>(borrowed) } + unsafe { std::mem::transmute::, Option<&'a [u8]>>(borrowed) } }, } }), @@ -97,20 +97,20 @@ impl StringChunked { let ca_self = self.rechunk(); let arr = ca_self.downcast_iter().next().unwrap(); let no_nulls = arr.null_count() == 0; - _agg_helper_idx_utf8(groups, |(first, idx)| { + _agg_helper_idx_bin(groups, |(first, idx)| { debug_assert!(idx.len() <= self.len()); if idx.is_empty() { None } else if idx.len() == 1 { ca_self.get(first as usize) } else if no_nulls { - take_agg_utf8_iter_unchecked_no_null( + take_agg_bin_iter_unchecked_no_null( arr, indexes_to_usizes(idx), |acc, v| if acc > v { acc } else { v }, ) } else { - take_agg_utf8_iter_unchecked( + take_agg_bin_iter_unchecked( arr, indexes_to_usizes(idx), |acc, v| if acc > v { acc } else { v }, @@ -122,22 +122,36 @@ impl StringChunked { GroupsProxy::Slice { groups: groups_slice, .. - } => _agg_helper_slice_utf8(groups_slice, |[first, len]| { + } => _agg_helper_slice_bin(groups_slice, |[first, len]| { debug_assert!(len <= self.len() as IdxSize); match len { 0 => None, 1 => self.get(first as usize), _ => { let arr_group = _slice_from_offsets(self, first, len); - let borrowed = arr_group.max_str(); + let borrowed = arr_group.max_binary(); // Safety: // The borrowed has `arr_group`s lifetime, but it actually points to data // hold by self. Here we tell the compiler that. - unsafe { std::mem::transmute::, Option<&'a str>>(borrowed) } + unsafe { std::mem::transmute::, Option<&'a [u8]>>(borrowed) } }, } }), } } } + +impl StringChunked { + #[allow(clippy::needless_lifetimes)] + pub(crate) unsafe fn agg_min<'a>(&'a self, groups: &GroupsProxy) -> Series { + let out = self.as_binary().agg_min(groups); + out.binary().unwrap().to_string().into_series() + } + + #[allow(clippy::needless_lifetimes)] + pub(crate) unsafe fn agg_max<'a>(&'a self, groups: &GroupsProxy) -> Series { + let out = self.as_binary().agg_max(groups); + out.binary().unwrap().to_string().into_series() + } +} diff --git a/crates/polars-core/src/frame/group_by/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs index 5a31ee3e75b3..067844a1dafb 100644 --- a/crates/polars-core/src/frame/group_by/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -239,6 +239,20 @@ impl IntoGroupsProxy for StringChunked { } } +fn fill_bytes_hashes(ca: &BinaryChunked, null_h: u64, hb: RandomState) -> Vec { + let mut byte_hashes = Vec::with_capacity(ca.len()); + for arr in ca.downcast_iter() { + for opt_b in arr { + let hash = match opt_b { + Some(s) => hb.hash_one(s), + None => null_h, + }; + byte_hashes.push(BytesHash::new(opt_b, hash)) + } + } + byte_hashes +} + impl IntoGroupsProxy for BinaryChunked { #[allow(clippy::needless_lifetimes)] fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { @@ -255,37 +269,78 @@ impl IntoGroupsProxy for BinaryChunked { .into_par_iter() .map(|(offset, len)| { let ca = self.slice(offset as i64, len); - ca.into_iter() - .map(|opt_b| { - let hash = match opt_b { - Some(s) => hb.hash_one(s), - None => null_h, - }; - // Safety: - // the underlying data is tied to self - unsafe { - std::mem::transmute::, BytesHash<'a>>( - BytesHash::new(opt_b, hash), - ) - } - }) - .collect_trusted::>() + let byte_hashes = fill_bytes_hashes(&ca, null_h, hb.clone()); + + // Safety: + // the underlying data is tied to self + unsafe { + std::mem::transmute::>, Vec>>( + byte_hashes, + ) + } }) .collect::>() }); let byte_hashes = byte_hashes.iter().collect::>(); group_by_threaded_slice(byte_hashes, n_partitions, sorted) } else { - let byte_hashes = self - .into_iter() - .map(|opt_b| { - let hash = match opt_b { - Some(s) => hb.hash_one(s), - None => null_h, - }; - BytesHash::new(opt_b, hash) - }) - .collect_trusted::>(); + let byte_hashes = fill_bytes_hashes(self, null_h, hb.clone()); + group_by(byte_hashes.iter(), sorted) + }; + Ok(out) + } +} + +fn fill_bytes_offset_hashes( + ca: &BinaryOffsetChunked, + null_h: u64, + hb: RandomState, +) -> Vec { + let mut byte_hashes = Vec::with_capacity(ca.len()); + for arr in ca.downcast_iter() { + for opt_b in arr { + let hash = match opt_b { + Some(s) => hb.hash_one(s), + None => null_h, + }; + byte_hashes.push(BytesHash::new(opt_b, hash)) + } + } + byte_hashes +} + +impl IntoGroupsProxy for BinaryOffsetChunked { + #[allow(clippy::needless_lifetimes)] + fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { + let hb = RandomState::default(); + let null_h = get_null_hash_value(&hb); + + let out = if multithreaded { + let n_partitions = _set_partition_size(); + + let split = _split_offsets(self.len(), n_partitions); + + let byte_hashes = POOL.install(|| { + split + .into_par_iter() + .map(|(offset, len)| { + let ca = self.slice(offset as i64, len); + let byte_hashes = fill_bytes_offset_hashes(&ca, null_h, hb.clone()); + + // Safety: + // the underlying data is tied to self + unsafe { + std::mem::transmute::>, Vec>>( + byte_hashes, + ) + } + }) + .collect::>() + }); + let byte_hashes = byte_hashes.iter().collect::>(); + group_by_threaded_slice(byte_hashes, n_partitions, sorted) + } else { + let byte_hashes = fill_bytes_offset_hashes(self, null_h, hb.clone()); group_by(byte_hashes.iter(), sorted) }; Ok(out) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 10e84380399a..a8290a9c01c1 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1650,17 +1650,7 @@ impl DataFrame { if std::env::var("POLARS_VERT_PAR").is_ok() { return self.clone().filter_vertical(mask); } - let new_col = self.try_apply_columns_par(&|s| match s.dtype() { - DataType::String => { - let ca = s.str().unwrap(); - if ca.get_values_size() / 24 <= ca.len() { - s.filter(mask) - } else { - s.filter_threaded(mask, true) - } - }, - _ => s.filter(mask), - })?; + let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?; Ok(DataFrame::new_no_checks(new_col)) } @@ -1682,19 +1672,7 @@ impl DataFrame { /// } /// ``` pub fn take(&self, indices: &IdxCa) -> PolarsResult { - let new_col = POOL.install(|| { - self.try_apply_columns_par(&|s| match s.dtype() { - DataType::String => { - let ca = s.str().unwrap(); - if ca.get_values_size() / 24 <= ca.len() { - s.take(indices) - } else { - s.take_threaded(indices, true) - } - }, - _ => s.take(indices), - }) - })?; + let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?; Ok(DataFrame::new_no_checks(new_col)) } @@ -1707,12 +1685,7 @@ impl DataFrame { unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self { let cols = if allow_threads { - POOL.install(|| { - self.apply_columns_par(&|s| match s.dtype() { - DataType::String => s.take_unchecked_threaded(idx, true), - _ => s.take_unchecked(idx), - }) - }) + POOL.install(|| self.apply_columns_par(&|s| s.take_unchecked(idx))) } else { self.columns.iter().map(|s| s.take_unchecked(idx)).collect() }; @@ -1725,12 +1698,7 @@ impl DataFrame { unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self { let cols = if allow_threads { - POOL.install(|| { - self.apply_columns_par(&|s| match s.dtype() { - DataType::String => s.take_slice_unchecked_threaded(idx, true), - _ => s.take_slice_unchecked(idx), - }) - }) + POOL.install(|| self.apply_columns_par(&|s| s.take_slice_unchecked(idx))) } else { self.columns .iter() diff --git a/crates/polars-core/src/frame/row/av_buffer.rs b/crates/polars-core/src/frame/row/av_buffer.rs index 7cd3b6ac7db2..5c5d737db096 100644 --- a/crates/polars-core/src/frame/row/av_buffer.rs +++ b/crates/polars-core/src/frame/row/av_buffer.rs @@ -215,14 +215,7 @@ impl<'a> AnyValueBuffer<'a> { new.finish().into_series() }, String(b) => { - let avg_values_len = b - .builder - .values() - .len() - .saturating_div(b.builder.capacity() + 1) - + 1; - let mut new = - StringChunkedBuilder::new(b.field.name(), capacity, avg_values_len * capacity); + let mut new = StringChunkedBuilder::new(b.field.name(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, @@ -304,7 +297,7 @@ impl From<(&DataType, usize)> for AnyValueBuffer<'_> { Time => AnyValueBuffer::Time(PrimitiveChunkedBuilder::new("", len)), Float32 => AnyValueBuffer::Float32(PrimitiveChunkedBuilder::new("", len)), Float64 => AnyValueBuffer::Float64(PrimitiveChunkedBuilder::new("", len)), - String => AnyValueBuffer::String(StringChunkedBuilder::new("", len, len * 5)), + String => AnyValueBuffer::String(StringChunkedBuilder::new("", len)), Null => AnyValueBuffer::Null(NullChunkedBuilder::new("", 0)), // Struct and List can be recursive so use anyvalues for that dt => AnyValueBuffer::All(dt.clone(), Vec::with_capacity(len)), @@ -584,11 +577,7 @@ impl<'a> AnyValueBufferTrusted<'a> { new.finish().into_series() }, String(b) => { - let avg_values_len = - (b.builder.values().len() as f64) / ((b.builder.capacity() + 1) as f64) + 1.0; - // alloc some extra to reduce realloc prob. - let new_values_len = (avg_values_len * capacity as f64 * 1.3) as usize; - let mut new = StringChunkedBuilder::new(b.field.name(), capacity, new_values_len); + let mut new = StringChunkedBuilder::new(b.field.name(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, @@ -666,7 +655,7 @@ impl From<(&DataType, usize)> for AnyValueBufferTrusted<'_> { UInt16 => AnyValueBufferTrusted::UInt16(PrimitiveChunkedBuilder::new("", len)), Float32 => AnyValueBufferTrusted::Float32(PrimitiveChunkedBuilder::new("", len)), Float64 => AnyValueBufferTrusted::Float64(PrimitiveChunkedBuilder::new("", len)), - String => AnyValueBufferTrusted::String(StringChunkedBuilder::new("", len, len * 5)), + String => AnyValueBufferTrusted::String(StringChunkedBuilder::new("", len)), #[cfg(feature = "dtype-struct")] Struct(fields) => { let buffers = fields diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index 900422dc8070..db4a8fab1005 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -187,7 +187,65 @@ pub fn _hash_binary_array(arr: &BinaryArray, random_state: RandomState, buf } } +fn hash_binview_array(arr: &BinaryViewArray, random_state: RandomState, buf: &mut Vec) { + let null_h = get_null_hash_value(&random_state); + if arr.null_count() == 0 { + // use the null_hash as seed to get a hash determined by `random_state` that is passed + buf.extend(arr.values_iter().map(|v| xxh3_64_with_seed(v, null_h))) + } else { + buf.extend(arr.into_iter().map(|opt_v| match opt_v { + Some(v) => xxh3_64_with_seed(v, null_h), + None => null_h, + })) + } +} + impl VecHash for BinaryChunked { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + buf.clear(); + buf.reserve(self.len()); + self.downcast_iter() + .for_each(|arr| hash_binview_array(arr, random_state.clone(), buf)); + Ok(()) + } + + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + let null_h = get_null_hash_value(&random_state); + + let mut offset = 0; + self.downcast_iter().for_each(|arr| { + match arr.null_count() { + 0 => arr + .values_iter() + .zip(&mut hashes[offset..]) + .for_each(|(v, h)| { + let l = xxh3_64_with_seed(v, null_h); + *h = _boost_hash_combine(l, *h) + }), + _ => { + let validity = arr.validity().unwrap(); + let (slice, byte_offset, _) = validity.as_slice(); + (0..validity.len()) + .map(|i| unsafe { get_bit_unchecked(slice, i + byte_offset) }) + .zip(&mut hashes[offset..]) + .zip(arr.values_iter()) + .for_each(|((valid, h), l)| { + let l = if valid { + xxh3_64_with_seed(l, null_h) + } else { + null_h + }; + *h = _boost_hash_combine(l, *h) + }); + }, + } + offset += arr.len(); + }); + Ok(()) + } +} + +impl VecHash for BinaryOffsetChunked { fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { buf.clear(); buf.reserve(self.len()); diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index 416fca94a5cc..96677102c23c 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -7,7 +7,7 @@ pub use arrow::datatypes::{ArrowSchema, Field as ArrowField}; #[cfg(feature = "ewma")] pub use arrow::legacy::kernels::ewm::EWMOptions; pub use arrow::legacy::prelude::*; -pub(crate) use arrow::legacy::trusted_len::TrustedLen; +pub(crate) use arrow::trusted_len::TrustedLen; pub(crate) use polars_utils::total_ord::{TotalEq, TotalOrd}; pub use crate::chunked_array::builder::{ diff --git a/crates/polars-core/src/schema.rs b/crates/polars-core/src/schema.rs index 8ea8a4dea3ef..bfc413a77a95 100644 --- a/crates/polars-core/src/schema.rs +++ b/crates/polars-core/src/schema.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use std::fmt::{Debug, Formatter}; use arrow::datatypes::ArrowSchemaRef; @@ -347,7 +348,18 @@ impl Schema { let fields: Vec<_> = self .inner .iter() - .map(|(name, dtype)| ArrowField::new(name.as_str(), dtype.to_arrow(pl_flavor), true)) + .map(|(name, dtype)| { + let field = ArrowField::new(name.as_str(), dtype.to_arrow(pl_flavor), true); + match dtype { + DataType::BinaryOffset => field.with_metadata({ + let mut bs = BTreeMap::new(); + // Make sure that we keep the type we written. + bs.insert("pl".to_string(), "maintain_type".to_string()); + bs + }), + _ => field, + } + }) .collect(); ArrowSchema::from(fields) } diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 42197f9a6f59..fe784e7d15b5 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -9,7 +9,7 @@ fn any_values_to_primitive(avs: &[AnyValue]) -> ChunkedArr } fn any_values_to_string(avs: &[AnyValue], strict: bool) -> PolarsResult { - let mut builder = StringChunkedBuilder::new("", avs.len(), avs.len() * 10); + let mut builder = StringChunkedBuilder::new("", avs.len()); // amortize allocations let mut owned = String::new(); diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index d7fc5d033414..932abeec6c09 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -1,7 +1,7 @@ use std::convert::TryFrom; -use arrow::compute::cast::utf8_to_large_utf8; -use arrow::legacy::compute::cast::cast; +use arrow::compute::cast::cast_unchecked as cast; +use arrow::datatypes::Metadata; #[cfg(any(feature = "dtype-struct", feature = "dtype-categorical"))] use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked; #[cfg(any( @@ -100,6 +100,7 @@ impl Series { Boolean => BooleanChunked::from_chunks(name, chunks).into_series(), Float32 => Float32Chunked::from_chunks(name, chunks).into_series(), Float64 => Float64Chunked::from_chunks(name, chunks).into_series(), + BinaryOffset => BinaryOffsetChunked::from_chunks(name, chunks).into_series(), #[cfg(feature = "dtype-struct")] Struct(_) => { Series::_try_from_arrow_unchecked(name, chunks, &dtype.to_arrow(true)).unwrap() @@ -129,22 +130,40 @@ impl Series { } } + /// # Safety + /// The caller must ensure that the given `dtype` matches all the `ArrayRef` dtypes. + pub unsafe fn _try_from_arrow_unchecked( + name: &str, + chunks: Vec, + dtype: &ArrowDataType, + ) -> PolarsResult { + Self::_try_from_arrow_unchecked_with_md(name, chunks, dtype, None) + } + /// Create a new Series without checking if the inner dtype of the chunks is correct /// /// # Safety /// The caller must ensure that the given `dtype` matches all the `ArrayRef` dtypes. - pub unsafe fn _try_from_arrow_unchecked( + pub unsafe fn _try_from_arrow_unchecked_with_md( name: &str, chunks: Vec, dtype: &ArrowDataType, + md: Option<&Metadata>, ) -> PolarsResult { match dtype { - ArrowDataType::LargeUtf8 => Ok(StringChunked::from_chunks(name, chunks).into_series()), - ArrowDataType::Utf8 => { + ArrowDataType::Utf8View => Ok(StringChunked::from_chunks(name, chunks).into_series()), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { let chunks = cast_chunks(&chunks, &DataType::String, false).unwrap(); Ok(StringChunked::from_chunks(name, chunks).into_series()) }, + ArrowDataType::BinaryView => Ok(BinaryChunked::from_chunks(name, chunks).into_series()), ArrowDataType::LargeBinary => { + if let Some(md) = md { + if md.get("pl").map(|s| s.as_str()) == Some("maintain_type") { + return Ok(BinaryOffsetChunked::from_chunks(name, chunks).into_series()); + } + } + let chunks = cast_chunks(&chunks, &DataType::Binary, false).unwrap(); Ok(BinaryChunked::from_chunks(name, chunks).into_series()) }, ArrowDataType::Binary => { @@ -503,13 +522,14 @@ fn convert ArrayRef>(arr: &[ArrayRef], f: F) -> Vec) -> (Vec, DataType) { match arrays[0].data_type() { - ArrowDataType::Utf8 => ( - convert(&arrays, |arr| { - let arr = arr.as_any().downcast_ref::>().unwrap(); - Box::from(utf8_to_large_utf8(arr)) - }), - DataType::String, - ), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { + let chunks = cast_chunks(&arrays, &DataType::String, false).unwrap(); + (chunks, DataType::String) + }, + ArrowDataType::Binary | ArrowDataType::LargeBinary | ArrowDataType::FixedSizeBinary(_) => { + let chunks = cast_chunks(&arrays, &DataType::Binary, false).unwrap(); + (chunks, DataType::Binary) + }, #[allow(unused_variables)] dt @ ArrowDataType::Dictionary(_, _, _) => { feature_gated!("dtype-categorical", { @@ -559,12 +579,6 @@ unsafe fn to_physical_and_dtype(arrays: Vec) -> (Vec, DataTy (arrays, DataType::Array(Box::new(dtype), *size)) }) }, - ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Binary => { - let out = convert(&arrays, |arr| { - cast(arr, &ArrowDataType::LargeBinary).unwrap() - }); - to_physical_and_dtype(out) - }, ArrowDataType::LargeList(_) => { let values = arrays .iter() @@ -645,26 +659,31 @@ unsafe fn to_physical_and_dtype(arrays: Vec) -> (Vec, DataTy } } +fn check_types(chunks: &[ArrayRef]) -> PolarsResult { + let mut chunks_iter = chunks.iter(); + let data_type: ArrowDataType = chunks_iter + .next() + .ok_or_else(|| polars_err!(NoData: "expected at least one array-ref"))? + .data_type() + .clone(); + + for chunk in chunks_iter { + if chunk.data_type() != &data_type { + polars_bail!( + ComputeError: "cannot create series from multiple arrays with different types" + ); + } + } + Ok(data_type) +} + impl TryFrom<(&str, Vec)> for Series { type Error = PolarsError; fn try_from(name_arr: (&str, Vec)) -> PolarsResult { let (name, chunks) = name_arr; - let mut chunks_iter = chunks.iter(); - let data_type: ArrowDataType = chunks_iter - .next() - .ok_or_else(|| polars_err!(NoData: "expected at least one array-ref"))? - .data_type() - .clone(); - - for chunk in chunks_iter { - if chunk.data_type() != &data_type { - polars_bail!( - ComputeError: "cannot create series from multiple arrays with different types" - ); - } - } + let data_type = check_types(&chunks)?; // Safety: // dtype is checked unsafe { Series::_try_from_arrow_unchecked(name, chunks, &data_type) } @@ -680,6 +699,36 @@ impl TryFrom<(&str, ArrayRef)> for Series { } } +impl TryFrom<(&ArrowField, Vec)> for Series { + type Error = PolarsError; + + fn try_from(field_arr: (&ArrowField, Vec)) -> PolarsResult { + let (field, chunks) = field_arr; + + let data_type = check_types(&chunks)?; + + // Safety: + // dtype is checked + unsafe { + Series::_try_from_arrow_unchecked_with_md( + &field.name, + chunks, + &data_type, + Some(&field.metadata), + ) + } + } +} + +impl TryFrom<(&ArrowField, ArrayRef)> for Series { + type Error = PolarsError; + + fn try_from(field_arr: (&ArrowField, ArrayRef)) -> PolarsResult { + let (field, arr) = field_arr; + Series::try_from((field, vec![arr])) + } +} + /// Used to convert a [`ChunkedArray`], `&dyn SeriesTrait` and [`Series`] /// into a [`Series`]. /// # Safety diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index b6adaafe377c..ab50db3a86ad 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -64,6 +64,16 @@ impl private::PrivateSeries for SeriesWrap { self.0.agg_list(groups) } + #[cfg(feature = "algorithm_group_by")] + unsafe fn agg_min(&self, groups: &GroupsProxy) -> Series { + self.0.agg_min(groups) + } + + #[cfg(feature = "algorithm_group_by")] + unsafe fn agg_max(&self, groups: &GroupsProxy) -> Series { + self.0.agg_max(groups) + } + fn subtract(&self, rhs: &Series) -> PolarsResult { NumOpsDispatch::subtract(&self.0, rhs) } diff --git a/crates/polars-core/src/series/implementations/binary_offset.rs b/crates/polars-core/src/series/implementations/binary_offset.rs new file mode 100644 index 000000000000..62d7205cc686 --- /dev/null +++ b/crates/polars-core/src/series/implementations/binary_offset.rs @@ -0,0 +1,192 @@ +use std::borrow::Cow; + +use ahash::RandomState; + +use super::{private, IntoSeries, SeriesTrait, *}; +use crate::chunked_array::comparison::*; +use crate::chunked_array::ops::compare_inner::{ + IntoTotalEqInner, IntoTotalOrdInner, TotalEqInner, TotalOrdInner, +}; +#[cfg(feature = "algorithm_group_by")] +use crate::frame::group_by::*; +use crate::prelude::*; +use crate::series::implementations::SeriesWrap; + +impl private::PrivateSeries for SeriesWrap { + fn compute_len(&mut self) { + self.0.compute_len() + } + fn _field(&self) -> Cow { + Cow::Borrowed(self.0.ref_field()) + } + fn _dtype(&self) -> &DataType { + self.0.ref_field().data_type() + } + fn _get_flags(&self) -> Settings { + self.0.get_flags() + } + fn _set_flags(&mut self, flags: Settings) { + self.0.set_flags(flags) + } + + unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { + self.0.equal_element(idx_self, idx_other, other) + } + + fn into_total_eq_inner<'a>(&'a self) -> Box { + (&self.0).into_total_eq_inner() + } + fn into_total_ord_inner<'a>(&'a self) -> Box { + (&self.0).into_total_ord_inner() + } + + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.0.vec_hash(random_state, buf)?; + Ok(()) + } + + fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.0.vec_hash_combine(build_hasher, hashes)?; + Ok(()) + } + + #[cfg(feature = "algorithm_group_by")] + fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { + IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) + } + + fn arg_sort_multiple(&self, options: &SortMultipleOptions) -> PolarsResult { + self.0.arg_sort_multiple(options) + } +} + +impl SeriesTrait for SeriesWrap { + fn rename(&mut self, name: &str) { + self.0.rename(name); + } + + fn chunk_lengths(&self) -> ChunkIdIter { + self.0.chunk_id() + } + fn name(&self) -> &str { + self.0.name() + } + + fn chunks(&self) -> &Vec { + self.0.chunks() + } + unsafe fn chunks_mut(&mut self) -> &mut Vec { + self.0.chunks_mut() + } + fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit() + } + + fn slice(&self, offset: i64, length: usize) -> Series { + self.0.slice(offset, length).into_series() + } + + fn append(&mut self, other: &Series) -> PolarsResult<()> { + polars_ensure!(self.0.dtype() == other.dtype(), append); + // todo! add object + self.0.append(other.as_ref().as_ref()); + Ok(()) + } + + fn extend(&mut self, other: &Series) -> PolarsResult<()> { + polars_ensure!(self.0.dtype() == other.dtype(), extend); + self.0.extend(other.as_ref().as_ref()); + Ok(()) + } + + fn filter(&self, filter: &BooleanChunked) -> PolarsResult { + ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series()) + } + + #[cfg(feature = "chunked_ids")] + unsafe fn _take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Series { + self.0.take_chunked_unchecked(by, sorted).into_series() + } + + #[cfg(feature = "chunked_ids")] + unsafe fn _take_opt_chunked_unchecked(&self, by: &[Option]) -> Series { + self.0.take_opt_chunked_unchecked(by).into_series() + } + + fn take(&self, indices: &IdxCa) -> PolarsResult { + Ok(self.0.take(indices)?.into_series()) + } + + unsafe fn take_unchecked(&self, indices: &IdxCa) -> Series { + self.0.take_unchecked(indices).into_series() + } + + fn take_slice(&self, indices: &[IdxSize]) -> PolarsResult { + Ok(self.0.take(indices)?.into_series()) + } + + unsafe fn take_slice_unchecked(&self, indices: &[IdxSize]) -> Series { + self.0.take_unchecked(indices).into_series() + } + + fn len(&self) -> usize { + self.0.len() + } + + fn rechunk(&self) -> Series { + self.0.rechunk().into_series() + } + + fn new_from_index(&self, index: usize, length: usize) -> Series { + ChunkExpandAtIndex::new_from_index(&self.0, index, length).into_series() + } + + fn cast(&self, data_type: &DataType) -> PolarsResult { + self.0.cast(data_type) + } + + fn get(&self, index: usize) -> PolarsResult { + self.0.get_any_value(index) + } + + #[inline] + unsafe fn get_unchecked(&self, index: usize) -> AnyValue { + self.0.get_any_value_unchecked(index) + } + + fn sort_with(&self, options: SortOptions) -> Series { + ChunkSort::sort_with(&self.0, options).into_series() + } + + fn arg_sort(&self, options: SortOptions) -> IdxCa { + ChunkSort::arg_sort(&self.0, options) + } + + fn null_count(&self) -> usize { + self.0.null_count() + } + + fn has_validity(&self) -> bool { + self.0.has_validity() + } + + fn is_null(&self) -> BooleanChunked { + self.0.is_null() + } + + fn is_not_null(&self) -> BooleanChunked { + self.0.is_not_null() + } + + fn reverse(&self) -> Series { + ChunkReverse::reverse(&self.0).into_series() + } + + fn shift(&self, periods: i64) -> Series { + ChunkShift::shift(&self.0, periods).into_series() + } + + fn clone_inner(&self) -> Arc { + Arc::new(SeriesWrap(Clone::clone(&self.0))) + } +} diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 10f0b7fdfc19..61d780ca6ac4 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -1,6 +1,7 @@ #[cfg(feature = "dtype-array")] mod array; mod binary; +mod binary_offset; mod boolean; #[cfg(feature = "dtype-categorical")] mod categorical; @@ -464,6 +465,7 @@ impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap {} +impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap {} #[cfg(feature = "dtype-array")] impl private::PrivateSeriesNumeric for SeriesWrap {} diff --git a/crates/polars-core/src/series/into.rs b/crates/polars-core/src/series/into.rs index f79ddbfcd499..10eb3687ff43 100644 --- a/crates/polars-core/src/series/into.rs +++ b/crates/polars-core/src/series/into.rs @@ -4,7 +4,8 @@ feature = "dtype-duration", feature = "dtype-time" ))] -use arrow::legacy::compute::cast::cast; +use arrow::compute::cast::cast_default as cast; +use arrow::compute::cast::cast_unchecked; use crate::prelude::*; @@ -22,7 +23,7 @@ impl Series { match self.dtype() { // make sure that we recursively apply all logical types. #[cfg(feature = "dtype-struct")] - DataType::Struct(_) => self.struct_().unwrap().to_arrow(chunk_idx), + DataType::Struct(_) => self.struct_().unwrap().to_arrow(chunk_idx, pl_flavor), // special list branch to // make sure that we recursively apply all logical types. DataType::List(inner) => { @@ -44,10 +45,10 @@ impl Series { .unwrap() }; - s.to_arrow(0, true) + s.to_arrow(0, pl_flavor) }; - let data_type = ListArray::::default_datatype(inner.to_arrow(true)); + let data_type = ListArray::::default_datatype(inner.to_arrow(pl_flavor)); let arr = ListArray::::new( data_type, arr.offsets().clone(), @@ -76,21 +77,29 @@ impl Series { Box::new(arr) as ArrayRef }, #[cfg(feature = "dtype-date")] - DataType::Date => { - cast(&*self.chunks()[chunk_idx], &DataType::Date.to_arrow(true)).unwrap() - }, + DataType::Date => cast( + &*self.chunks()[chunk_idx], + &DataType::Date.to_arrow(pl_flavor), + ) + .unwrap(), #[cfg(feature = "dtype-datetime")] - DataType::Datetime(_, _) => { - cast(&*self.chunks()[chunk_idx], &self.dtype().to_arrow(true)).unwrap() - }, + DataType::Datetime(_, _) => cast( + &*self.chunks()[chunk_idx], + &self.dtype().to_arrow(pl_flavor), + ) + .unwrap(), #[cfg(feature = "dtype-duration")] - DataType::Duration(_) => { - cast(&*self.chunks()[chunk_idx], &self.dtype().to_arrow(true)).unwrap() - }, + DataType::Duration(_) => cast( + &*self.chunks()[chunk_idx], + &self.dtype().to_arrow(pl_flavor), + ) + .unwrap(), #[cfg(feature = "dtype-time")] - DataType::Time => { - cast(&*self.chunks()[chunk_idx], &DataType::Time.to_arrow(true)).unwrap() - }, + DataType::Time => cast( + &*self.chunks()[chunk_idx], + &DataType::Time.to_arrow(pl_flavor), + ) + .unwrap(), #[cfg(feature = "object")] DataType::Object(_, None) => { use crate::chunked_array::object::builder::object_series_to_arrow_array; @@ -107,13 +116,21 @@ impl Series { object_series_to_arrow_array(&s) } }, - DataType::String if pl_flavor => { - // TODO: implement Utf8ViewCast Here - self.array_ref(chunk_idx).clone() + DataType::String => { + if pl_flavor { + self.array_ref(chunk_idx).clone() + } else { + let arr = self.array_ref(chunk_idx); + cast_unchecked(arr.as_ref(), &ArrowDataType::LargeUtf8).unwrap() + } }, - DataType::Binary if pl_flavor => { - // TODO: implement BinViewCast Here - self.array_ref(chunk_idx).clone() + DataType::Binary => { + if pl_flavor { + self.array_ref(chunk_idx).clone() + } else { + let arr = self.array_ref(chunk_idx); + cast_unchecked(arr.as_ref(), &ArrowDataType::LargeBinary).unwrap() + } }, _ => self.array_ref(chunk_idx).clone(), } diff --git a/crates/polars-core/src/series/iterator.rs b/crates/polars-core/src/series/iterator.rs index 6d4b7bde5edf..b8d6385cdbe8 100644 --- a/crates/polars-core/src/series/iterator.rs +++ b/crates/polars-core/src/series/iterator.rs @@ -118,7 +118,7 @@ impl Series { } else { match dtype { DataType::String => { - let arr = arr.as_any().downcast_ref::>().unwrap(); + let arr = arr.as_any().downcast_ref::().unwrap(); if arr.null_count() == 0 { Box::new(arr.values_iter().map(AnyValue::String)) as Box> + '_> diff --git a/crates/polars-core/src/series/ops/downcast.rs b/crates/polars-core/src/series/ops/downcast.rs index 95182b610609..2a5ef2b0948b 100644 --- a/crates/polars-core/src/series/ops/downcast.rs +++ b/crates/polars-core/src/series/ops/downcast.rs @@ -94,6 +94,11 @@ impl Series { unpack_chunked!(self, DataType::Binary => BinaryChunked, "Binary") } + /// Unpack to [`ChunkedArray`] of dtype `[DataType::Binary]` + pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { + unpack_chunked!(self, DataType::BinaryOffset => BinaryOffsetChunked, "BinaryOffset") + } + /// Unpack to [`ChunkedArray`] of dtype `[DataType::Time]` #[cfg(feature = "dtype-time")] pub fn time(&self) -> PolarsResult<&TimeChunked> { diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index 9c2b569c27d6..9455ac2d7411 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -6,7 +6,8 @@ use std::ops::{Deref, DerefMut}; use arrow::bitmap::bitmask::BitMask; use arrow::bitmap::Bitmap; -pub use arrow::legacy::utils::{TrustMyLength, *}; +pub use arrow::legacy::utils::*; +pub use arrow::trusted_len::TrustMyLength; use flatten::*; use num_traits::{One, Zero}; use rayon::prelude::*; diff --git a/crates/polars-io/src/csv/buffer.rs b/crates/polars-io/src/csv/buffer.rs index 94d86cda83bd..42768924927b 100644 --- a/crates/polars-io/src/csv/buffer.rs +++ b/crates/polars-io/src/csv/buffer.rs @@ -1,6 +1,4 @@ -use arrow::array::Utf8Array; -use arrow::bitmap::MutableBitmap; -use arrow::legacy::prelude::FromDataUtf8; +use arrow::array::MutableBinaryViewArray; use polars_core::prelude::*; use polars_error::to_compute_err; #[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))] @@ -11,7 +9,6 @@ use polars_time::prelude::string::infer::{ }; use crate::csv::parser::{is_whitespace, skip_whitespace}; -use crate::csv::read_impl::RunningSize; use crate::csv::utils::escape_field; use crate::csv::CsvEncoding; @@ -122,46 +119,24 @@ where pub(crate) struct Utf8Field { name: String, - // buffer that holds the string data - data: Vec, - // offsets in the string data buffer - offsets: Vec, - validity: MutableBitmap, + mutable: MutableBinaryViewArray, + scratch: Vec, quote_char: u8, encoding: CsvEncoding, - ignore_errors: bool, } impl Utf8Field { - fn new( - name: &str, - capacity: usize, - str_capacity: usize, - quote_char: Option, - encoding: CsvEncoding, - ignore_errors: bool, - ) -> Self { - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(0); + fn new(name: &str, capacity: usize, quote_char: Option, encoding: CsvEncoding) -> Self { Self { name: name.to_string(), - data: Vec::with_capacity(str_capacity), - offsets, - validity: MutableBitmap::with_capacity(capacity), + mutable: MutableBinaryViewArray::with_capacity(capacity), + scratch: vec![], quote_char: quote_char.unwrap_or(b'"'), encoding, - ignore_errors, } } } -/// We delay validation if we expect utf8 and no errors -/// In case of `ignore-error` -#[inline] -fn delay_utf8_validation(encoding: CsvEncoding, ignore_errors: bool) -> bool { - !(matches!(encoding, CsvEncoding::LossyUtf8) || ignore_errors) -} - #[inline] fn validate_utf8(bytes: &[u8]) -> bool { simdutf8::basic::from_utf8(bytes).is_ok() @@ -178,70 +153,46 @@ impl ParsedBuffer for Utf8Field { _time_unit: Option, ) -> PolarsResult<()> { if bytes.is_empty() { - // append null - self.offsets.push(self.data.len() as i64); - self.validity.push(!missing_is_null); + if missing_is_null { + self.mutable.push_null() + } else { + self.mutable.push(Some("")) + } return Ok(()); } - // Only for lossy utf8 we check utf8 now. Otherwise we check all utf8 at the end. - let parse_result = if delay_utf8_validation(self.encoding, ignore_errors) { - true - } else { - validate_utf8(bytes) - }; - let data_len = self.data.len(); - - // check if field fits in the str data buffer - let remaining_capacity = self.data.capacity() - data_len; - if remaining_capacity < bytes.len() { - // exponential growth strategy - self.data - .reserve(std::cmp::max(self.data.capacity(), bytes.len())) - } + let parse_result = validate_utf8(bytes); // note that one branch writes without updating the length, so we must do that later. - let n_written = if needs_escaping { + let bytes = if needs_escaping { + self.scratch.clear(); + self.scratch.reserve(bytes.len()); polars_ensure!(bytes.len() > 1, ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?); + // Safety: // we just allocated enough capacity and data_len is correct. - unsafe { escape_field(bytes, self.quote_char, self.data.spare_capacity_mut()) } + unsafe { + let n_written = + escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut()); + self.scratch.set_len(n_written); + } + self.scratch.as_slice() } else { - self.data.extend_from_slice(bytes); - bytes.len() + bytes }; match parse_result { true => { - // Soundness - // the n_written from csv-core are now valid bytes so we can update the length. - unsafe { self.data.set_len(data_len + n_written) } - self.offsets.push(self.data.len() as i64); - self.validity.push(true); + let value = unsafe { std::str::from_utf8_unchecked(bytes) }; + self.mutable.push_value(value) }, false => { if matches!(self.encoding, CsvEncoding::LossyUtf8) { - // Safety: - // we extended to data_len + n_written - // so the bytes are initialized - debug_assert!(self.data.capacity() >= data_len + n_written); - let slice = unsafe { - self.data - .as_slice() - .get_unchecked(data_len..data_len + n_written) - }; - let s = String::from_utf8_lossy(slice).into_owned(); - let b = s.as_bytes(); - // Make sure that we extend at the proper location, - // otherwise we append valid bytes to invalid utf8 bytes. - unsafe { self.data.set_len(data_len) } - self.data.extend_from_slice(b); - self.offsets.push(self.data.len() as i64); - self.validity.push(true); + // TODO! do this without allocating + let s = String::from_utf8_lossy(bytes); + self.mutable.push_value(s.as_ref()) } else if ignore_errors { - // append null - self.offsets.push(self.data.len() as i64); - self.validity.push(false); + self.mutable.push_null() } else { polars_bail!(ComputeError: "invalid utf-8 sequence"); } @@ -482,26 +433,13 @@ pub(crate) fn init_buffers( projection: &[usize], capacity: usize, schema: &Schema, - // The running statistic of the amount of bytes we must allocate per str column - str_capacities: &[RunningSize], quote_char: Option, encoding: CsvEncoding, - ignore_errors: bool, ) -> PolarsResult> { - // we keep track of the string columns we have seen so that we can increment the index - let mut str_index = 0; - projection .iter() .map(|&i| { let (name, dtype) = schema.get_at_index(i).unwrap(); - let mut str_capacity = 0; - // determine the needed capacity for this column - if dtype == &DataType::String { - str_capacity = str_capacities[str_index].size_hint(); - str_index += 1; - } - let builder = match dtype { &DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)), &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)), @@ -513,10 +451,8 @@ pub(crate) fn init_buffers( &DataType::String => Buffer::Utf8(Utf8Field::new( name, capacity, - str_capacity, quote_char, encoding, - ignore_errors, )), #[cfg(feature = "dtype-datetime")] DataType::Datetime(time_unit, time_zone) => Buffer::Datetime { @@ -595,42 +531,8 @@ impl Buffer { .cast(&DataType::Date) .unwrap(), - Buffer::Utf8(mut v) => { - v.offsets.shrink_to_fit(); - v.data.shrink_to_fit(); - - let mut valid_utf8 = true; - if delay_utf8_validation(v.encoding, v.ignore_errors) { - // Check if the whole buffer is utf8. This alone is not enough, - // we must also check byte starts, see: https://github.com/jorgecarleitao/arrow2/pull/823 - simdutf8::basic::from_utf8(&v.data) - .map_err(|_| polars_err!(ComputeError: "invalid utf-8 sequence in csv"))?; - - for i in (0..v.offsets.len() - 1).step_by(2) { - // SAFETY: we iterate over offsets.len(). - let start = unsafe { *v.offsets.get_unchecked(i) as usize }; - let first = v.data.get(start); - - // A valid code-point iff it does not start with 0b10xxxxxx - // Bit-magic taken from `std::str::is_char_boundary` - if let Some(&b) = first { - if (b as i8) < -0x40 { - valid_utf8 = false; - break; - } - } - } - polars_ensure!(valid_utf8, ComputeError: "invalid utf-8 sequence in CSV"); - } - - // SAFETY: we already checked utf8 validity during parsing, or just now. - let arr = unsafe { - Utf8Array::::from_data_unchecked_default( - v.offsets.into(), - v.data.into(), - Some(v.validity.into()), - ) - }; + Buffer::Utf8(v) => { + let arr = v.mutable.freeze(); StringChunked::with_chunk(v.name.as_str(), arr).into_series() }, #[allow(unused_variables)] @@ -658,8 +560,11 @@ impl Buffer { Buffer::Float32(v) => v.append_null(), Buffer::Float64(v) => v.append_null(), Buffer::Utf8(v) => { - v.offsets.push(v.data.len() as i64); - v.validity.push(valid); + if valid { + v.mutable.push_value("") + } else { + v.mutable.push_null() + } }, #[cfg(feature = "dtype-datetime")] Buffer::Datetime { buf, .. } => buf.builder.append_null(), diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index a5e5a461df55..4730adca4156 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -129,9 +129,7 @@ impl<'a> CoreReader<'a> { eol_char: self.eol_char, }; - let projection = self.get_projection(); - - let str_columns = self.get_string_columns(&projection)?; + let projection = self.get_projection()?; // RAII structure that will ensure we maintain a global stringcache #[cfg(feature = "dtype-categorical")] @@ -149,8 +147,6 @@ impl<'a> CoreReader<'a> { chunk_size: self.chunk_size, file_chunks_iter: file_chunks, file_chunks: vec![], - str_capacities: self.init_string_size_stats(&str_columns, self.chunk_size), - str_columns, projection, starting_point_offset, row_index: self.row_index, @@ -177,8 +173,6 @@ pub struct BatchedCsvReaderMmap<'a> { chunk_size: usize, file_chunks_iter: ChunkOffsetIter<'a>, file_chunks: Vec<(usize, usize)>, - str_capacities: Vec, - str_columns: StringColumns, projection: Vec, starting_point_offset: Option, row_index: Option, @@ -242,7 +236,6 @@ impl<'a> BatchedCsvReaderMmap<'a> { self.eol_char, self.comment_prefix.as_ref(), self.chunk_size, - &self.str_capacities, self.encoding, self.null_values.as_ref(), self.missing_is_null, @@ -254,7 +247,6 @@ impl<'a> BatchedCsvReaderMmap<'a> { cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; - update_string_stats(&self.str_capacities, &self.str_columns, &df)?; if let Some(rc) = &self.row_index { df.with_row_index_mut(&rc.name, Some(rc.offset)); } diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index b62fd9b90ff1..7c7f8ea56c1c 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -212,9 +212,7 @@ impl<'a> CoreReader<'a> { 4096, ); - let projection = self.get_projection(); - - let str_columns = self.get_string_columns(&projection)?; + let projection = self.get_projection()?; // RAII structure that will ensure we maintain a global stringcache #[cfg(feature = "dtype-categorical")] @@ -232,8 +230,6 @@ impl<'a> CoreReader<'a> { finished: false, file_chunk_reader: chunk_iter, file_chunks: vec![], - str_capacities: self.init_string_size_stats(&str_columns, self.chunk_size), - str_columns, projection, starting_point_offset, row_index: self.row_index, @@ -260,8 +256,6 @@ pub struct BatchedCsvReaderRead<'a> { finished: bool, file_chunk_reader: ChunkReader<'a>, file_chunks: Vec<(usize, usize)>, - str_capacities: Vec, - str_columns: StringColumns, projection: Vec, starting_point_offset: Option, row_index: Option, @@ -339,7 +333,6 @@ impl<'a> BatchedCsvReaderRead<'a> { self.eol_char, self.comment_prefix.as_ref(), self.chunk_size, - &self.str_capacities, self.encoding, self.null_values.as_ref(), self.missing_is_null, @@ -351,7 +344,6 @@ impl<'a> BatchedCsvReaderRead<'a> { cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; - update_string_stats(&self.str_capacities, &self.str_columns, &df)?; if let Some(rc) = &self.row_index { df.with_row_index_mut(&rc.name, Some(rc.offset)); } diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index 285f0c5b79de..db268f92147c 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -3,10 +3,8 @@ mod batched_read; use std::fmt; use std::ops::Deref; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use arrow::array::ValueSize; pub use batched_mmap::*; pub use batched_read::*; use polars_core::config::verbose; @@ -130,54 +128,6 @@ impl<'a> fmt::Debug for CoreReader<'a> { } } -pub(crate) struct RunningSize { - max: AtomicUsize, - sum: AtomicUsize, - count: AtomicUsize, - last: AtomicUsize, -} - -fn compute_size_hint(max: usize, sum: usize, count: usize, last: usize) -> usize { - let avg = (sum as f32 / count as f32) as usize; - let size = std::cmp::max(last, avg) as f32; - if (max as f32) < (size * 1.5) { - max - } else { - size as usize - } -} -impl RunningSize { - fn new(size: usize) -> Self { - Self { - max: AtomicUsize::new(size), - sum: AtomicUsize::new(size), - count: AtomicUsize::new(1), - last: AtomicUsize::new(size), - } - } - - pub(crate) fn update(&self, size: usize) -> (usize, usize, usize, usize) { - let max = self.max.fetch_max(size, Ordering::Release); - let sum = self.sum.fetch_add(size, Ordering::Release); - let count = self.count.fetch_add(1, Ordering::Release); - let last = self.last.fetch_add(size, Ordering::Release); - ( - max, - sum / count, - last, - compute_size_hint(max, sum, count, last), - ) - } - - pub(crate) fn size_hint(&self) -> usize { - let max = self.max.load(Ordering::Acquire); - let sum = self.sum.load(Ordering::Acquire); - let count = self.count.load(Ordering::Acquire); - let last = self.last.load(Ordering::Acquire); - compute_size_hint(max, sum, count, last) - } -} - impl<'a> CoreReader<'a> { #[allow(clippy::too_many_arguments)] pub(crate) fn new( @@ -486,55 +436,19 @@ impl<'a> CoreReader<'a> { remaining_bytes, )) } - fn get_projection(&mut self) -> Vec { + fn get_projection(&mut self) -> PolarsResult> { // we also need to sort the projection to have predictable output. // the `parse_lines` function expects this. self.projection .take() .map(|mut v| { v.sort_unstable(); - v + if let Some(idx) = v.last() { + polars_ensure!(*idx < self.schema.len(), OutOfBounds: "projection index: {} is out of bounds for csv schema with length: {}", idx, self.schema.len()) + } + Ok(v) }) - .unwrap_or_else(|| (0..self.schema.len()).collect()) - } - - fn get_string_columns(&self, projection: &[usize]) -> PolarsResult { - // keep track of the maximum capacity that needs to be allocated for the utf8-builder - // Per string column we keep a statistic of the maximum length of string bytes per chunk - // We must the names, not the indexes, (the indexes are incorrect due to projection - // pushdown) - - let mut new_projection = Vec::with_capacity(projection.len()); - - for i in projection { - let (_, dtype) = self.schema.get_at_index(*i).ok_or_else(|| { - polars_err!( - OutOfBounds: - "projection index {} is out of bounds for CSV schema with {} columns", - i, self.schema.len(), - ) - })?; - - if dtype == &DataType::String { - new_projection.push(*i) - } - } - - Ok(StringColumns::new(self.schema.clone(), new_projection)) - } - - fn init_string_size_stats( - &self, - str_columns: &StringColumns, - capacity: usize, - ) -> Vec { - // assume 10 chars per str - // this is not updated in low memory mode - let init_str_bytes = capacity * 10; - str_columns - .iter() - .map(|_| RunningSize::new(init_str_bytes)) - .collect() + .unwrap_or_else(|| Ok((0..self.schema.len()).collect())) } fn parse_csv( @@ -546,8 +460,7 @@ impl<'a> CoreReader<'a> { let logging = verbose(); let (file_chunks, chunk_size, total_rows, starting_point_offset, bytes, remaining_bytes) = self.determine_file_chunks_and_statistics(&mut n_threads, bytes, logging)?; - let projection = self.get_projection(); - let str_columns = self.get_string_columns(&projection)?; + let projection = self.get_projection()?; // An empty file with a schema should return an empty DataFrame with that schema if bytes.is_empty() { @@ -562,7 +475,6 @@ impl<'a> CoreReader<'a> { // Structure: // the inner vec has got buffers from all the columns. if let Some(predicate) = predicate { - let str_capacities = self.init_string_size_stats(&str_columns, chunk_size); let dfs = POOL.install(|| { file_chunks .into_par_iter() @@ -583,10 +495,8 @@ impl<'a> CoreReader<'a> { projection, chunk_size, schema, - &str_capacities, self.quote_char, self.encoding, - self.ignore_errors, )?; let local_bytes = &bytes[read..stop_at_nbytes]; @@ -627,10 +537,6 @@ impl<'a> CoreReader<'a> { let mask = s.bool()?; local_df = local_df.filter(mask)?; - // update the running str bytes statistics - if !self.low_memory { - update_string_stats(&str_capacities, &str_columns, &local_df)?; - } dfs.push((local_df, current_row_count)); } Ok(dfs) @@ -654,8 +560,6 @@ impl<'a> CoreReader<'a> { std::cmp::min(rows_per_thread, max_proxy) }; - let str_capacities = self.init_string_size_stats(&str_columns, capacity); - let mut dfs = POOL.install(|| { file_chunks .into_par_iter() @@ -671,7 +575,6 @@ impl<'a> CoreReader<'a> { self.eol_char, self.comment_prefix.as_ref(), capacity, - &str_capacities, self.encoding, self.null_values.as_ref(), self.missing_is_null, @@ -681,11 +584,6 @@ impl<'a> CoreReader<'a> { starting_point_offset, )?; - // update the running str bytes statistics - if !self.low_memory { - update_string_stats(&str_capacities, &str_columns, &df)?; - } - cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_index { df.with_row_index_mut(&rc.name, Some(rc.offset)); @@ -705,10 +603,8 @@ impl<'a> CoreReader<'a> { &projection, remaining_rows, self.schema.as_ref(), - &str_capacities, self.quote_char, self.encoding, - self.ignore_errors, )?; parse_lines( @@ -773,22 +669,6 @@ impl<'a> CoreReader<'a> { } } -fn update_string_stats( - str_capacities: &[RunningSize], - str_columns: &StringColumns, - local_df: &DataFrame, -) -> PolarsResult<()> { - // update the running str bytes statistics - for (str_index, name) in str_columns.iter().enumerate() { - let ca = local_df.column(name)?.str()?; - let str_bytes_len = ca.get_values_size(); - - let _ = str_capacities[str_index].update(str_bytes_len); - } - - Ok(()) -} - #[allow(clippy::too_many_arguments)] fn read_chunk( bytes: &[u8], @@ -801,7 +681,6 @@ fn read_chunk( eol_char: u8, comment_prefix: Option<&CommentPrefix>, capacity: usize, - str_capacities: &[RunningSize], encoding: CsvEncoding, null_values: Option<&NullValuesCompiled>, missing_is_null: bool, @@ -811,15 +690,7 @@ fn read_chunk( starting_point_offset: Option, ) -> PolarsResult { let mut read = bytes_offset_thread; - let mut buffers = init_buffers( - projection, - capacity, - schema, - str_capacities, - quote_char, - encoding, - ignore_errors, - )?; + let mut buffers = init_buffers(projection, capacity, schema, quote_char, encoding)?; let mut last_read = usize::MAX; loop { @@ -856,27 +727,3 @@ fn read_chunk( .collect::>()?, )) } - -/// List of strings, which are stored inside of a [Schema]. -/// -/// Conceptually it is `Vec<&str>` with `&str` tied to the lifetime of -/// the [Schema]. -struct StringColumns { - schema: SchemaRef, - fields: Vec, -} - -impl StringColumns { - /// New [StringColumns], where the list `fields` has indices - /// of fields in the `schema`. - fn new(schema: SchemaRef, fields: Vec) -> Self { - Self { schema, fields } - } - - fn iter(&self) -> impl Iterator { - self.fields.iter().map(|schema_i| { - let (name, _) = self.schema.get_at_index(*schema_i).unwrap(); - name.as_str() - }) - } -} diff --git a/crates/polars-io/src/ipc/write.rs b/crates/polars-io/src/ipc/write.rs index bd4afa798b55..0b96f289fbdf 100644 --- a/crates/polars-io/src/ipc/write.rs +++ b/crates/polars-io/src/ipc/write.rs @@ -247,35 +247,38 @@ mod test { .unwrap(); df_read.equals(&expected); - let mut buf: Cursor> = Cursor::new(Vec::new()); - let mut df = df![ - "letters" => ["x", "y", "z"], - "ints" => [123, 456, 789], - "floats" => [4.5, 10.0, 10.0], - "other" => ["misc", "other", "value"], - ] - .unwrap(); - IpcWriter::new(&mut buf) - .finish(&mut df) - .expect("ipc writer"); - buf.set_position(0); - let expected = df![ - "letters" => ["x", "y", "z"], - "floats" => [4.5, 10.0, 10.0], - "other" => ["misc", "other", "value"], - "ints" => [123, 456, 789], - ] - .unwrap(); - let df_read = IpcReader::new(&mut buf) - .with_columns(Some(vec![ - "letters".to_string(), - "floats".to_string(), - "other".to_string(), - "ints".to_string(), - ])) - .finish() + for pl_flavor in [false, true] { + let mut buf: Cursor> = Cursor::new(Vec::new()); + let mut df = df![ + "letters" => ["x", "y", "z"], + "ints" => [123, 456, 789], + "floats" => [4.5, 10.0, 10.0], + "other" => ["misc", "other", "value"], + ] .unwrap(); - assert!(df_read.equals(&expected)); + IpcWriter::new(&mut buf) + .with_pl_flavor(pl_flavor) + .finish(&mut df) + .expect("ipc writer"); + buf.set_position(0); + let expected = df![ + "letters" => ["x", "y", "z"], + "floats" => [4.5, 10.0, 10.0], + "other" => ["misc", "other", "value"], + "ints" => [123, 456, 789], + ] + .unwrap(); + let df_read = IpcReader::new(&mut buf) + .with_columns(Some(vec![ + "letters".to_string(), + "floats".to_string(), + "other".to_string(), + "ints".to_string(), + ])) + .finish() + .unwrap(); + assert!(df_read.equals(&expected)); + } } #[test] diff --git a/crates/polars-json/src/json/write/utf8.rs b/crates/polars-json/src/json/write/utf8.rs index f571518fe170..f967853bc1e1 100644 --- a/crates/polars-json/src/json/write/utf8.rs +++ b/crates/polars-json/src/json/write/utf8.rs @@ -1,7 +1,7 @@ // Adapted from https://github.com/serde-rs/json/blob/f901012df66811354cb1d490ad59480d8fdf77b5/src/ser.rs use std::io; -use arrow::array::{Array, MutableUtf8ValuesArray, Utf8Array}; +use arrow::array::{Array, MutableBinaryViewArray, Utf8ViewArray}; use crate::json::write::new_serializer; @@ -141,12 +141,12 @@ where writer.write_all(s) } -pub fn serialize_to_utf8(array: &dyn Array) -> Utf8Array { - let mut values = MutableUtf8ValuesArray::::with_capacity(array.len()); +pub fn serialize_to_utf8(array: &dyn Array) -> Utf8ViewArray { + let mut values = MutableBinaryViewArray::with_capacity(array.len()); let mut serializer = new_serializer(array, 0, usize::MAX); while let Some(v) = serializer.next() { - unsafe { values.push(std::str::from_utf8_unchecked(v)) } + unsafe { values.push_value(std::str::from_utf8_unchecked(v)) } } values.into() } diff --git a/crates/polars-ops/src/chunked_array/array/join.rs b/crates/polars-ops/src/chunked_array/array/join.rs index 2c681a911e08..15afdb2d90d6 100644 --- a/crates/polars-ops/src/chunked_array/array/join.rs +++ b/crates/polars-ops/src/chunked_array/array/join.rs @@ -1,21 +1,16 @@ use std::fmt::Write; -use arrow::array::ValueSize; use polars_core::prelude::ArrayChunked; use super::*; fn join_literal(ca: &ArrayChunked, separator: &str) -> PolarsResult { - let DataType::Array(_, width) = ca.dtype() else { + let DataType::Array(_, _) = ca.dtype() else { unreachable!() }; let mut buf = String::with_capacity(128); - let mut builder = StringChunkedBuilder::new( - ca.name(), - ca.len(), - ca.get_values_size() + separator.len() * (*width - 1) * ca.len(), - ); + let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); ca.for_each_amortized(|opt_s| { let opt_val = opt_s.map(|s| { @@ -39,9 +34,8 @@ fn join_literal(ca: &ArrayChunked, separator: &str) -> PolarsResult PolarsResult { - let mut buf = String::with_capacity(128); - let mut builder = - StringChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size() + ca.len()); + let mut buf = String::new(); + let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); ca.amortized_iter() .zip(separator) diff --git a/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs b/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs index 45d71265405a..2dfe4208da1d 100644 --- a/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs +++ b/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs @@ -69,7 +69,7 @@ pub trait ChunkGatherSkipNulls: Sized { impl ChunkGatherSkipNulls<[IdxSize]> for ChunkedArray where - ChunkedArray: ChunkFilter, + ChunkedArray: ChunkFilter + ChunkTake<[IdxSize]>, { fn gather_skip_nulls(&self, indices: &[IdxSize]) -> PolarsResult { if self.null_count() == 0 { @@ -101,7 +101,7 @@ where impl ChunkGatherSkipNulls for ChunkedArray where - ChunkedArray: ChunkFilter, + ChunkedArray: ChunkFilter + ChunkTake, { fn gather_skip_nulls(&self, indices: &IdxCa) -> PolarsResult { if self.null_count() == 0 { diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index cd31dcc3e945..c16c38a1ae84 100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -120,7 +120,7 @@ where if include_category { // Use AnyValue for formatting. let mut lower = AnyValue::Float64(f64::NEG_INFINITY); - let mut categories = StringChunkedBuilder::new("category", breaks.len(), breaks.len() * 20); + let mut categories = StringChunkedBuilder::new("category", breaks.len()); let mut buf = String::new(); for br in &breaks { diff --git a/crates/polars-ops/src/chunked_array/list/namespace.rs b/crates/polars-ops/src/chunked_array/list/namespace.rs index 84773625f91f..e53fb830361d 100644 --- a/crates/polars-ops/src/chunked_array/list/namespace.rs +++ b/crates/polars-ops/src/chunked_array/list/namespace.rs @@ -96,11 +96,7 @@ pub trait ListNameSpaceImpl: AsList { let ca = self.as_list(); // used to amortize heap allocs let mut buf = String::with_capacity(128); - let mut builder = StringChunkedBuilder::new( - ca.name(), - ca.len(), - ca.get_values_size() + separator.len() * ca.len(), - ); + let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); ca.for_each_amortized(|opt_s| { let opt_val = opt_s.map(|s| { @@ -126,8 +122,7 @@ pub trait ListNameSpaceImpl: AsList { let ca = self.as_list(); // used to amortize heap allocs let mut buf = String::with_capacity(128); - let mut builder = - StringChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size() + ca.len()); + let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); // SAFETY: unstable series never lives longer than the iterator. unsafe { ca.amortized_iter() diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index 8473ee4d3d65..d3f32b050a54 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -2,8 +2,8 @@ use std::fmt::{Display, Formatter}; use std::hash::Hash; use arrow::array::{ - BinaryArray, ListArray, MutableArray, MutableBinaryArray, MutablePrimitiveArray, - PrimitiveArray, Utf8Array, + Array, BinaryViewArray, ListArray, MutableArray, MutablePlBinary, MutablePrimitiveArray, + PrimitiveArray, Utf8ViewArray, }; use arrow::bitmap::Bitmap; use arrow::compute::utils::combine_validities_and; @@ -29,7 +29,7 @@ where } } -impl<'a> MaterializeValues> for MutableBinaryArray { +impl<'a> MaterializeValues> for MutablePlBinary { fn extend_buf>>(&mut self, values: I) -> usize { self.extend(values); self.len() @@ -231,8 +231,8 @@ where } fn binary( - a: &BinaryArray, - b: &BinaryArray, + a: &BinaryViewArray, + b: &BinaryViewArray, offsets_a: &[i64], offsets_b: &[i64], set_op: SetOperation, @@ -244,7 +244,7 @@ fn binary( let mut set = Default::default(); let mut set2: PlIndexSet> = Default::default(); - let mut values_out = MutableBinaryArray::with_capacity(std::cmp::max( + let mut values_out = MutablePlBinary::with_capacity(std::cmp::max( *offsets_a.last().unwrap(), *offsets_b.last().unwrap(), ) as usize); @@ -315,17 +315,10 @@ fn binary( offsets.push(offset as i64); } let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) }; - let values: BinaryArray = values_out.into(); + let values = values_out.freeze(); if as_utf8 { - let values = unsafe { - Utf8Array::::new_unchecked( - ArrowDataType::LargeUtf8, - values.offsets().clone(), - values.values().clone(), - values.validity().cloned(), - ) - }; + let values = unsafe { values.to_utf8view_unchecked() }; let dtype = ListArray::::default_datatype(values.data_type().clone()); Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } else { @@ -334,15 +327,6 @@ fn binary( } } -fn utf8_to_binary(arr: &Utf8Array) -> BinaryArray { - BinaryArray::::new( - ArrowDataType::LargeBinary, - arr.offsets().clone(), - arr.values().clone(), - arr.validity().cloned(), - ) -} - fn array_set_operation( a: &ListArray, b: &ListArray, @@ -359,23 +343,23 @@ fn array_set_operation( let validity = combine_validities_and(a.validity(), b.validity()); match dtype { - ArrowDataType::LargeUtf8 => { - let a = values_a.as_any().downcast_ref::>().unwrap(); - let b = values_b.as_any().downcast_ref::>().unwrap(); - - let a = utf8_to_binary(a); - let b = utf8_to_binary(b); - binary(&a, &b, offsets_a, offsets_b, set_op, validity, true) - }, - ArrowDataType::LargeBinary => { + ArrowDataType::Utf8View => { let a = values_a .as_any() - .downcast_ref::>() - .unwrap(); + .downcast_ref::() + .unwrap() + .to_binview(); let b = values_b .as_any() - .downcast_ref::>() - .unwrap(); + .downcast_ref::() + .unwrap() + .to_binview(); + + binary(&a, &b, offsets_a, offsets_b, set_op, validity, true) + }, + ArrowDataType::LargeBinary => { + let a = values_a.as_any().downcast_ref::().unwrap(); + let b = values_b.as_any().downcast_ref::().unwrap(); binary(a, b, offsets_a, offsets_b, set_op, validity, false) }, ArrowDataType::Boolean => { diff --git a/crates/polars-ops/src/chunked_array/scatter.rs b/crates/polars-ops/src/chunked_array/scatter.rs index 50f5047d80ff..26ea76cd66ce 100644 --- a/crates/polars-ops/src/chunked_array/scatter.rs +++ b/crates/polars-ops/src/chunked_array/scatter.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, PrimitiveArray, ValueSize}; +use arrow::array::{Array, PrimitiveArray}; use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::arrow::bitmap::MutableBitmap; @@ -142,8 +142,7 @@ impl<'a> ChunkedSet<&'a str> for &'a StringChunked { check_bounds(idx, self.len() as IdxSize)?; check_sorted(idx)?; let mut ca_iter = self.into_iter().enumerate(); - let mut builder = - StringChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + let mut builder = StringChunkedBuilder::new(self.name(), self.len()); for (current_idx, current_value) in idx.iter().zip(values) { for (cnt_idx, opt_val_self) in &mut ca_iter { diff --git a/crates/polars-ops/src/chunked_array/strings/concat.rs b/crates/polars-ops/src/chunked_array/strings/concat.rs index 551a14293309..86f94a8a42b9 100644 --- a/crates/polars-ops/src/chunked_array/strings/concat.rs +++ b/crates/polars-ops/src/chunked_array/strings/concat.rs @@ -1,4 +1,5 @@ use arrow::array::{Utf8Array, ValueSize}; +use arrow::compute::cast::utf8_to_utf8view; use arrow::legacy::array::default_arrays::FromDataUtf8; use polars_core::prelude::*; @@ -38,8 +39,11 @@ pub fn str_concat(ca: &StringChunked, delimiter: &str, ignore_nulls: bool) -> St }); let buf = buf.into_bytes(); + assert!(capacity >= buf.len()); let offsets = vec![0, buf.len() as i64]; let arr = unsafe { Utf8Array::from_data_unchecked_default(offsets.into(), buf.into(), None) }; + // conversion is cheap with one value. + let arr = utf8_to_utf8view(&arr); StringChunked::with_chunk(ca.name(), arr) } @@ -71,20 +75,7 @@ pub fn hor_str_concat(cas: &[&StringChunked], delimiter: &str) -> PolarsResult = cas diff --git a/crates/polars-ops/src/chunked_array/strings/extract.rs b/crates/polars-ops/src/chunked_array/strings/extract.rs index ba6dc6911e63..b56e1251c840 100644 --- a/crates/polars-ops/src/chunked_array/strings/extract.rs +++ b/crates/polars-ops/src/chunked_array/strings/extract.rs @@ -2,7 +2,7 @@ use std::iter::zip; #[cfg(feature = "extract_groups")] use arrow::array::{Array, StructArray}; -use arrow::array::{MutableArray, MutableUtf8Array, Utf8Array}; +use arrow::array::{MutableBinaryViewArray, Utf8ViewArray}; use polars_core::export::regex::Regex; use polars_core::prelude::arity::{try_binary_mut_with_options, try_unary_mut_with_options}; @@ -10,13 +10,13 @@ use super::*; #[cfg(feature = "extract_groups")] fn extract_groups_array( - arr: &Utf8Array, + arr: &Utf8ViewArray, reg: &Regex, names: &[&str], data_type: ArrowDataType, ) -> PolarsResult { let mut builders = (0..names.len()) - .map(|_| MutableUtf8Array::::with_capacity(arr.len())) + .map(|_| MutableBinaryViewArray::::with_capacity(arr.len())) .collect::>(); let mut locs = reg.capture_locations(); @@ -35,13 +35,7 @@ fn extract_groups_array( builders.iter_mut().for_each(|arr| arr.push_null()); } - let values = builders - .into_iter() - .map(|a| { - let immutable_a: Utf8Array = a.into(); - immutable_a.to_boxed() - }) - .collect(); + let values = builders.into_iter().map(|a| a.freeze().boxed()).collect(); Ok(StructArray::new(data_type.clone(), values, arr.validity().cloned()).boxed()) } @@ -76,11 +70,11 @@ pub(super) fn extract_groups( } fn extract_group_reg_lit( - arr: &Utf8Array, + arr: &Utf8ViewArray, reg: &Regex, group_index: usize, -) -> PolarsResult> { - let mut builder = MutableUtf8Array::::with_capacity(arr.len()); +) -> PolarsResult { + let mut builder = MutableBinaryViewArray::::with_capacity(arr.len()); let mut locs = reg.capture_locations(); for opt_v in arr { @@ -100,10 +94,10 @@ fn extract_group_reg_lit( fn extract_group_array_lit( s: &str, - pat: &Utf8Array, + pat: &Utf8ViewArray, group_index: usize, -) -> PolarsResult> { - let mut builder = MutableUtf8Array::::with_capacity(pat.len()); +) -> PolarsResult { + let mut builder = MutableBinaryViewArray::::with_capacity(pat.len()); for opt_pat in pat { if let Some(pat) = opt_pat { @@ -123,11 +117,11 @@ fn extract_group_array_lit( } fn extract_group_binary( - arr: &Utf8Array, - pat: &Utf8Array, + arr: &Utf8ViewArray, + pat: &Utf8ViewArray, group_index: usize, -) -> PolarsResult> { - let mut builder = MutableUtf8Array::::with_capacity(arr.len()); +) -> PolarsResult { + let mut builder = MutableBinaryViewArray::::with_capacity(arr.len()); for (opt_s, opt_pat) in zip(arr, pat) { match (opt_s, opt_pat) { diff --git a/crates/polars-ops/src/chunked_array/strings/mod.rs b/crates/polars-ops/src/chunked_array/strings/mod.rs index a800fc646aa0..b9149983307b 100644 --- a/crates/polars-ops/src/chunked_array/strings/mod.rs +++ b/crates/polars-ops/src/chunked_array/strings/mod.rs @@ -12,8 +12,6 @@ mod json_path; mod namespace; #[cfg(feature = "string_pad")] mod pad; -#[cfg(feature = "strings")] -mod replace; #[cfg(feature = "string_reverse")] mod reverse; #[cfg(feature = "strings")] diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index c1309658e138..f5b0e9bd6757 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -200,7 +200,7 @@ pub trait StringNameSpaceImpl: AsString { /// Get the length of the string values as number of bytes. fn str_len_bytes(&self) -> UInt32Chunked { let ca = self.as_string(); - ca.apply_kernel_cast(&string_len_bytes) + ca.apply_kernel_cast(&utf8view_len_bytes) } /// Pad the start of the string until it reaches the given length. @@ -299,20 +299,6 @@ pub trait StringNameSpaceImpl: AsString { return Ok(ca.clone()); } - // for single bytes we can replace on the whole values buffer - if pat.len() == 1 && val.len() == 1 { - let pat = pat.as_bytes()[0]; - let val = val.as_bytes()[0]; - return Ok( - ca.apply_kernel(&|arr| Box::new(replace::replace_lit_n_char(arr, n, pat, val))) - ); - } - if pat.len() == val.len() { - return Ok( - ca.apply_kernel(&|arr| Box::new(replace::replace_lit_n_str(arr, n, pat, val))) - ); - } - // amortize allocation let mut buf = String::new(); @@ -355,19 +341,6 @@ pub trait StringNameSpaceImpl: AsString { if ca.is_empty() { return Ok(ca.clone()); } - // for single bytes we can replace on the whole values buffer - if pat.len() == 1 && val.len() == 1 { - let pat = pat.as_bytes()[0]; - let val = val.as_bytes()[0]; - return Ok( - ca.apply_kernel(&|arr| Box::new(replace::replace_lit_single_char(arr, pat, val))) - ); - } - if pat.len() == val.len() { - return Ok(ca.apply_kernel(&|arr| { - Box::new(replace::replace_lit_n_str(arr, usize::MAX, pat, val)) - })); - } // Amortize allocation. let mut buf = String::new(); diff --git a/crates/polars-ops/src/chunked_array/strings/replace.rs b/crates/polars-ops/src/chunked_array/strings/replace.rs deleted file mode 100644 index dd098a1fd6a5..000000000000 --- a/crates/polars-ops/src/chunked_array/strings/replace.rs +++ /dev/null @@ -1,132 +0,0 @@ -use arrow::array::Utf8Array; -use arrow::offset::OffsetsBuffer; - -// ensure the offsets are corrected in case of sliced arrays -fn correct_offsets(offsets: OffsetsBuffer, start: i64) -> OffsetsBuffer { - if start != 0 { - let offsets_buf: Vec = offsets.iter().map(|o| *o - start).collect(); - return unsafe { OffsetsBuffer::new_unchecked(offsets_buf.into()) }; - } - offsets -} - -pub(super) fn replace_lit_single_char(arr: &Utf8Array, pat: u8, val: u8) -> Utf8Array { - let values = arr.values(); - let offsets = arr.offsets().clone(); - let validity = arr.validity().cloned(); - let start = offsets[0] as usize; - let end = (offsets[offsets.len() - 1]) as usize; - - let mut values = values.as_slice()[start..end].to_vec(); - for byte in values.iter_mut() { - if *byte == pat { - *byte = val; - } - } - // ensure the offsets are corrected in case of sliced arrays - let offsets = correct_offsets(offsets, start as i64); - unsafe { Utf8Array::new_unchecked(arr.data_type().clone(), offsets, values.into(), validity) } -} - -pub(super) fn replace_lit_n_char( - arr: &Utf8Array, - n: usize, - pat: u8, - val: u8, -) -> Utf8Array { - let values = arr.values(); - let offsets = arr.offsets().clone(); - let validity = arr.validity().cloned(); - let start = offsets[0] as usize; - let end = (offsets[offsets.len() - 1]) as usize; - - let mut values = values.as_slice()[start..end].to_vec(); - // ensure the offsets are corrected in case of sliced arrays - let offsets = correct_offsets(offsets, start as i64); - - let mut offsets_iter = offsets.iter(); - // ignore the first - let _ = *offsets_iter.next().unwrap(); - let mut end = None; - // must loop to skip all null/empty values, as they all have the same offsets. - for next in offsets_iter.by_ref() { - // we correct offsets before, it's guaranteed to start at 0. - if *next != 0 { - end = Some(*next as usize - 1); - break; - } - } - - let Some(mut end) = end else { - return arr.clone(); - }; - - let mut count = 0; - for (i, byte) in values.iter_mut().enumerate() { - if *byte == pat && count < n { - *byte = val; - count += 1; - }; - if i == end { - // reset the count as we entered a new string region - count = 0; - - // set the end of this string region - // safety: invariant of Utf8Array tells us that there is a next offset. - - // must loop to skip null/empty values, as they have the same offsets - for next in offsets_iter.by_ref() { - let new_end = *next as usize - 1; - if new_end != end { - end = new_end; - break; - } - } - } - } - unsafe { Utf8Array::new_unchecked(arr.data_type().clone(), offsets, values.into(), validity) } -} - -pub(super) fn replace_lit_n_str( - arr: &Utf8Array, - n: usize, - pat: &str, - val: &str, -) -> Utf8Array { - assert_eq!(pat.len(), val.len()); - let values = arr.values(); - let offsets = arr.offsets().clone(); - let validity = arr.validity().cloned(); - let start = offsets[0] as usize; - let end = (offsets[offsets.len() - 1]) as usize; - - let mut values = values.as_slice()[start..end].to_vec(); - // // ensure the offsets are corrected in case of sliced arrays - let offsets = correct_offsets(offsets, start as i64); - let mut offsets_iter = offsets.iter(); - - // overwrite previous every iter - let mut previous = *offsets_iter.next().unwrap(); - - let values_str = unsafe { std::str::from_utf8_unchecked_mut(&mut values) }; - for &end in offsets_iter { - let substr = unsafe { values_str.get_unchecked_mut(previous as usize..end as usize) }; - - for (start, part) in substr.match_indices(pat).take(n) { - let len = part.len(); - // safety: - // this violates the aliasing rules - // if this become a problem we must implement our own `match_indices` - // that works on pointers instead of references. - unsafe { - let bytes = std::slice::from_raw_parts_mut( - substr.as_bytes().as_ptr().add(start) as *mut u8, - len, - ); - bytes.copy_from_slice(val.as_bytes()); - } - } - previous = end; - } - unsafe { Utf8Array::new_unchecked(arr.data_type().clone(), offsets, values.into(), validity) } -} diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 9838733b4079..90b6e9ef9370 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -16,7 +16,7 @@ use std::hash::Hash; use ahash::RandomState; pub use args::*; -use arrow::legacy::trusted_len::TrustedLen; +use arrow::trusted_len::TrustedLen; #[cfg(feature = "asof_join")] pub use asof::{AsOfOptions, AsofJoin, AsofJoinBy, AsofStrategy}; #[cfg(feature = "dtype-categorical")] diff --git a/crates/polars-ops/src/series/ops/search_sorted.rs b/crates/polars-ops/src/series/ops/search_sorted.rs index 1235c46137be..09f083548124 100644 --- a/crates/polars-ops/src/series/ops/search_sorted.rs +++ b/crates/polars-ops/src/series/ops/search_sorted.rs @@ -166,6 +166,36 @@ where out } +fn search_sorted_bin_array_with_binary_offset( + ca: &BinaryChunked, + search_values: &BinaryOffsetChunked, + side: SearchSortedSide, + descending: bool, +) -> Vec { + let ca = ca.rechunk(); + let arr = ca.downcast_iter().next().unwrap(); + + let mut out = Vec::with_capacity(search_values.len()); + + for search_arr in search_values.downcast_iter() { + if search_arr.null_count() == 0 { + for search_value in search_arr.values_iter() { + binary_search_array(side, &mut out, arr, ca.len(), search_value, descending) + } + } else { + for opt_v in search_arr.into_iter() { + match opt_v { + None => out.push(0), + Some(search_value) => { + binary_search_array(side, &mut out, arr, ca.len(), search_value, descending) + }, + } + } + } + } + out +} + fn search_sorted_bin_array( ca: &BinaryChunked, search_values: &BinaryChunked, @@ -218,8 +248,18 @@ pub fn search_sorted( }, DataType::Binary => { let ca = s.binary().unwrap(); - let search_values = search_values.binary().unwrap(); - let idx = search_sorted_bin_array(ca, search_values, side, descending); + + let idx = match search_values.dtype() { + DataType::BinaryOffset => { + let search_values = search_values.binary_offset().unwrap(); + search_sorted_bin_array_with_binary_offset(ca, search_values, side, descending) + }, + DataType::Binary => { + let search_values = search_values.binary().unwrap(); + search_sorted_bin_array(ca, search_values, side, descending) + }, + _ => unreachable!(), + }; Ok(IdxCa::new_vec(s.name(), idx)) }, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs index 6b0cd7f1ba92..ce0fda8fe3e3 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs @@ -144,6 +144,8 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { ); }, BinaryState::FilteredRequiredDictionary(page) => { + // TODO! directly set the dict as buffers and only insert the proper views. + // This will save a lot of memory. // Already done on the dict. validate_utf8 = false; let page_dict = &page.dict; @@ -159,6 +161,8 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { BinaryState::FilteredOptionalDictionary(page_validity, page_values) => { // Already done on the dict. validate_utf8 = false; + // TODO! directly set the dict as buffers and only insert the proper views. + // This will save a lot of memory. let page_dict = &page_values.dict; extend_from_decoder( validity, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs index e195daabe18a..0d55700cfade 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs @@ -157,7 +157,7 @@ pub fn n_columns(data_type: &ArrowDataType) -> usize { use arrow::datatypes::PhysicalType::*; match data_type.to_physical_type() { Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8 - | Dictionary(_) | LargeUtf8 => 1, + | Dictionary(_) | LargeUtf8 | BinaryView | Utf8View => 1, List | FixedSizeList | LargeList => { let a = data_type.to_logical_type(); if let ArrowDataType::List(inner) = a { diff --git a/crates/polars-parquet/src/arrow/read/schema/metadata.rs b/crates/polars-parquet/src/arrow/read/schema/metadata.rs index aa974c23757c..0dbcd7829753 100644 --- a/crates/polars-parquet/src/arrow/read/schema/metadata.rs +++ b/crates/polars-parquet/src/arrow/read/schema/metadata.rs @@ -20,13 +20,13 @@ pub fn read_schema_from_metadata(metadata: &mut Metadata) -> PolarsResult