Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(rust): Add split_at method to arrow Array #16620

Merged
merged 1 commit into from
Jun 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion crates/polars-arrow/src/array/binary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use either::Either;

use super::specification::try_check_offsets_bounds;
use super::{Array, GenericBinaryArray};
use super::{Array, GenericBinaryArray, Splitable};
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};
use crate::bitmap::Bitmap;
Expand Down Expand Up @@ -450,3 +450,30 @@ unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
self.offsets().buffer()
}
}

/// Splitting support for [`BinaryArray`]: the two halves get their own offsets
/// and validity, while both keep a clone of the same `values`.
impl<O: Offset> Splitable for BinaryArray<O> {
    /// Returns `true` when `offset` does not exceed the array length
    /// (an offset equal to `len()` is accepted).
    #[inline(always)]
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: forwarded to the caller; this function is itself `unsafe`
        // and performs no validation of `offset`.
        let (head_offsets, tail_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        // Both halves differ only in offsets and validity; everything else is cloned.
        let assemble = |offsets, validity| Self {
            data_type: self.data_type.clone(),
            offsets,
            values: self.values.clone(),
            validity,
        };

        let head = assemble(head_offsets, head_validity);
        let tail = assemble(tail_offsets, tail_validity);
        (head, tail)
    }
}
48 changes: 48 additions & 0 deletions crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;
pub use view::{View, INLINE_VIEW_SIZE};

use super::Splitable;

pub type MutablePlString = MutableBinaryViewArray<str>;
pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;

Expand Down Expand Up @@ -476,6 +478,16 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
self.validity.as_ref()
}

/// Splits the array at `offset` through [`Splitable::split_at`] and returns
/// both halves as boxed trait objects.
fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
    let (head, tail) = Splitable::split_at(self, offset);
    let head: Box<dyn Array> = Box::new(head);
    let tail: Box<dyn Array> = Box::new(tail);
    (head, tail)
}

/// Splits the array at `offset` through [`Splitable::split_at_unchecked`] and
/// returns both halves as boxed trait objects.
///
/// # Safety
/// `offset` is passed on unvalidated. Presumably it must not exceed
/// `self.len()` — confirm against the `Splitable` trait definition.
unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
    // SAFETY: forwarded to the caller; this function is itself `unsafe`.
    let (head, tail) = unsafe { Splitable::split_at_unchecked(self, offset) };
    let head: Box<dyn Array> = Box::new(head);
    let tail: Box<dyn Array> = Box::new(tail);
    (head, tail)
}

fn slice(&mut self, offset: usize, length: usize) {
assert!(
offset + length <= self.len(),
Expand Down Expand Up @@ -505,3 +517,39 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
Box::new(self.clone())
}
}

/// Splitting support for [`BinaryViewArrayGeneric`]: views and validity are
/// split, while both halves keep a clone of the same `buffers`.
impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {
    /// Returns `true` when `offset` does not exceed the array length
    /// (an offset equal to `len()` is accepted).
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        let (head_views, tail_views) = unsafe { self.views.split_at_unchecked(offset) };
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        // An empty half gets 0 for the fifth `new_unchecked` argument;
        // otherwise `UNKNOWN_LEN` is passed.
        // SAFETY: the views and validity come from splitting this array at the
        // same `offset`, and the buffers are the ones this array already uses.
        let head = unsafe {
            Self::new_unchecked(
                self.data_type.clone(),
                head_views,
                self.buffers.clone(),
                head_validity,
                if offset == 0 { 0 } else { UNKNOWN_LEN as _ },
                self.total_buffer_len(),
            )
        };

        // SAFETY: same reasoning as for the head half.
        let tail = unsafe {
            Self::new_unchecked(
                self.data_type.clone(),
                tail_views,
                self.buffers.clone(),
                tail_validity,
                if offset == self.len() {
                    0
                } else {
                    UNKNOWN_LEN as _
                },
                self.total_buffer_len(),
            )
        };

        (head, tail)
    }
}
26 changes: 25 additions & 1 deletion crates/polars-arrow/src/array/boolean/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use either::Either;

use super::Array;
use super::{Array, Splitable};
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};
use crate::bitmap::{Bitmap, MutableBitmap};
Expand Down Expand Up @@ -390,6 +390,30 @@ impl Array for BooleanArray {
}
}

/// Splitting support for [`BooleanArray`]: both the values and the validity
/// are split at the same element offset.
impl Splitable for BooleanArray {
    /// Returns `true` when `offset` does not exceed the array length
    /// (an offset equal to `len()` is accepted).
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        let (head_values, tail_values) = unsafe { self.values.split_at_unchecked(offset) };
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        let head = Self {
            data_type: self.data_type.clone(),
            values: head_values,
            validity: head_validity,
        };
        let tail = Self {
            data_type: self.data_type.clone(),
            values: tail_values,
            validity: tail_validity,
        };

        (head, tail)
    }
}

impl From<Bitmap> for BooleanArray {
fn from(values: Bitmap) -> Self {
Self {
Expand Down
25 changes: 24 additions & 1 deletion crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use polars_error::{polars_bail, PolarsResult};

use super::primitive::PrimitiveArray;
use super::specification::check_indexes;
use super::{new_empty_array, new_null_array, Array};
use super::{new_empty_array, new_null_array, Array, Splitable};
use crate::array::dictionary::typed_iterator::{
DictValue, DictionaryIterTyped, DictionaryValuesIterTyped,
};
Expand Down Expand Up @@ -398,3 +398,26 @@ impl<K: DictionaryKey> Array for DictionaryArray<K> {
Box::new(self.clone().with_validity(validity))
}
}

/// Splitting support for [`DictionaryArray`]: only the keys are split; both
/// halves keep a clone of the same dictionary `values`.
impl<K: DictionaryKey> Splitable for DictionaryArray<K> {
    /// Returns `true` when `offset` does not exceed the array length.
    ///
    /// The bound is inclusive so that splitting at `len()` (yielding an empty
    /// right half) is accepted, matching the other array implementations. A
    /// strict `<` would also reject every split of an empty array.
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        // NOTE(review): assumes the keys array accepts `offset == len()` —
        // confirm against its `Splitable` implementation.
        let (lhs_keys, rhs_keys) = unsafe { Splitable::split_at_unchecked(&self.keys, offset) };

        (
            Self {
                data_type: self.data_type.clone(),
                keys: lhs_keys,
                values: self.values.clone(),
            },
            Self {
                data_type: self.data_type.clone(),
                keys: rhs_keys,
                values: self.values.clone(),
            },
        )
    }
}
30 changes: 29 additions & 1 deletion crates/polars-arrow/src/array/fixed_size_binary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::Array;
use super::{Array, Splitable};
use crate::bitmap::Bitmap;
use crate::buffer::Buffer;
use crate::datatypes::ArrowDataType;
Expand Down Expand Up @@ -235,6 +235,34 @@ impl Array for FixedSizeBinaryArray {
}
}

/// Splitting support for [`FixedSizeBinaryArray`].
impl Splitable for FixedSizeBinaryArray {
    /// Returns `true` when `offset` does not exceed the array length.
    ///
    /// The bound is inclusive so that splitting at `len()` (yielding an empty
    /// right half) is accepted, matching the other array implementations.
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`; `offset` is counted in elements, not bytes.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        let size = self.size;

        // `values` is split at `offset * size` because each element is taken to
        // occupy `size` bytes of a flat byte buffer.
        // NOTE(review): assumes `values` is a byte buffer with
        // `len() == values.len() / size` — confirm against the struct definition.
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        let (lhs_values, rhs_values) = unsafe { self.values.split_at_unchecked(offset * size) };
        // Validity is split at the element offset (assumes one bit per element).
        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        (
            Self {
                data_type: self.data_type.clone(),
                values: lhs_values,
                validity: lhs_validity,
                size,
            },
            Self {
                data_type: self.data_type.clone(),
                values: rhs_values,
                validity: rhs_validity,
                size,
            },
        )
    }
}

impl FixedSizeBinaryArray {
/// Creates a [`FixedSizeBinaryArray`] from a fallible iterator of optional `[u8]`.
pub fn try_from_iter<P: AsRef<[u8]>, I: IntoIterator<Item = Option<P>>>(
Expand Down
32 changes: 31 additions & 1 deletion crates/polars-arrow/src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::{new_empty_array, new_null_array, Array};
use super::{new_empty_array, new_null_array, Array, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};

Expand Down Expand Up @@ -215,3 +215,33 @@ impl Array for FixedSizeListArray {
Box::new(self.clone().with_validity(validity))
}
}

/// Splitting support for [`FixedSizeListArray`].
impl Splitable for FixedSizeListArray {
    /// Returns `true` when `offset` does not exceed the array length
    /// (an offset equal to `len()` is accepted).
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`; `offset` is counted in lists, not child values.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        let size = self.size;

        // The child array holds `size` values per list, so its split point is
        // scaled by `size`.
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        let (lhs_values, rhs_values) =
            unsafe { self.values.split_at_boxed_unchecked(offset * size) };
        // Validity is split at the unscaled list offset: scaling it by `size`
        // would split at the wrong position and could exceed the bitmap length.
        // NOTE(review): assumes validity has one bit per list — confirm
        // against the struct invariants.
        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        (
            Self {
                data_type: self.data_type.clone(),
                values: lhs_values,
                validity: lhs_validity,
                size,
            },
            Self {
                data_type: self.data_type.clone(),
                values: rhs_values,
                validity: rhs_validity,
                size,
            },
        )
    }
}
28 changes: 27 additions & 1 deletion crates/polars-arrow/src/array/list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::specification::try_check_offsets_bounds;
use super::{new_empty_array, Array};
use super::{new_empty_array, Array, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};
use crate::offset::{Offset, Offsets, OffsetsBuffer};
Expand Down Expand Up @@ -237,3 +237,29 @@ impl<O: Offset> Array for ListArray<O> {
Box::new(self.clone().with_validity(validity))
}
}

/// Splitting support for [`ListArray`]: offsets and validity are split, while
/// both halves keep a clone of the same child `values`.
impl<O: Offset> Splitable for ListArray<O> {
    /// Returns `true` when `offset` does not exceed the array length
    /// (an offset equal to `len()` is accepted).
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        let (head_offsets, tail_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        // Both halves differ only in offsets and validity; everything else is cloned.
        let assemble = |offsets, validity| Self {
            data_type: self.data_type.clone(),
            offsets,
            validity,
            values: self.values.clone(),
        };

        let head = assemble(head_offsets, head_validity);
        let tail = assemble(tail_offsets, tail_validity);
        (head, tail)
    }
}
28 changes: 27 additions & 1 deletion crates/polars-arrow/src/array/map/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::specification::try_check_offsets_bounds;
use super::{new_empty_array, Array};
use super::{new_empty_array, Array, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};
use crate::offset::OffsetsBuffer;
Expand Down Expand Up @@ -195,3 +195,29 @@ impl Array for MapArray {
Box::new(self.clone().with_validity(validity))
}
}

/// Splitting support for [`MapArray`]: offsets and validity are split, while
/// both halves keep a clone of the same `field`.
impl Splitable for MapArray {
    /// Returns `true` when `offset` does not exceed the array length
    /// (an offset equal to `len()` is accepted).
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into `[0, offset)` and `[offset, len)` without
    /// checking `offset`.
    ///
    /// # Safety
    /// No bounds check happens here. Presumably the caller must guarantee
    /// `check_bound(offset)` — confirm against the `Splitable` trait definition.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: forwarded to the caller; this function is itself `unsafe`.
        let (head_offsets, tail_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        let head = Self {
            data_type: self.data_type.clone(),
            offsets: head_offsets,
            field: self.field.clone(),
            validity: head_validity,
        };
        let tail = Self {
            data_type: self.data_type.clone(),
            offsets: tail_offsets,
            field: self.field.clone(),
            validity: tail_validity,
        };

        (head, tail)
    }
}
Loading