Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: new implementation for String/Binary type. #13748

Merged
merged 49 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
5ac4953
impl MutableArray
ritchie46 Jan 15, 2024
9ecc7dd
try impl static array [skip ci]
ritchie46 Jan 15, 2024
ae54ebd
impl static_array for binview [skip ci]
ritchie46 Jan 15, 2024
b5df9b9
change builders
ritchie46 Jan 15, 2024
5b2d53b
merge trustedlen traits [skip ci]
ritchie46 Jan 15, 2024
d1df244
more [skip ci]
ritchie46 Jan 15, 2024
e6a7f87
extend/full [skip ci]
ritchie46 Jan 15, 2024
c29f84a
sort and add BinaryOffset type [skip ci]
ritchie46 Jan 16, 2024
4f5ab8f
arithmetic module [skip ci]
ritchie46 Jan 16, 2024
ca371b0
iterators [skip ci]
ritchie46 Jan 16, 2024
c5c714b
group by [skip ci]
ritchie46 Jan 16, 2024
952ddbf
polars-core compiles [skip ci]
ritchie46 Jan 16, 2024
a375dda
fix some tests
ritchie46 Jan 16, 2024
3a5f66a
impl list builder [skip ci]
ritchie46 Jan 16, 2024
58b46be
fix validity sizes
ritchie46 Jan 16, 2024
6a23f27
more casts and fix melt [skip ci]
ritchie46 Jan 17, 2024
d82a5d2
improve string filter [skip ci]
ritchie46 Jan 17, 2024
3d93da3
remove unused take kernels [skip ci]
ritchie46 Jan 17, 2024
12703ad
perf: specialed binview take [skip ci]
ritchie46 Jan 17, 2024
9c0665d
compile core all features [skip ci]
ritchie46 Jan 17, 2024
052eb79
all polars-core tests run
ritchie46 Jan 17, 2024
831fd14
allow binaryoffset series
ritchie46 Jan 17, 2024
2bda6cb
take chunked
ritchie46 Jan 17, 2024
dbeb578
polars-ops compiles [skip ci]
ritchie46 Jan 17, 2024
eb1b276
fix ipc tests
ritchie46 Jan 17, 2024
02bd615
full check passes [skip ci]
ritchie46 Jan 17, 2024
8258591
run most lazy tests
ritchie46 Jan 17, 2024
5d09721
fmt [skip ci]
ritchie46 Jan 17, 2024
265181c
python compiles [skip ci]
ritchie46 Jan 17, 2024
5c651b2
fix pl flavored ipc
ritchie46 Jan 18, 2024
2d29f1e
more tests [skip ci]
ritchie46 Jan 18, 2024
8988adf
more tests [skip ci]
ritchie46 Jan 18, 2024
0283fb3
fix segfault [skip ci]
ritchie46 Jan 18, 2024
028350f
4 tests remaining .... [skip ci]
ritchie46 Jan 18, 2024
4cb19c1
explode for strings
ritchie46 Jan 18, 2024
e701b44
fix all tests
ritchie46 Jan 18, 2024
87f09ba
checks
ritchie46 Jan 18, 2024
2d18bdc
checks pass
ritchie46 Jan 18, 2024
2a62b24
lint
ritchie46 Jan 18, 2024
b072146
merge main
ritchie46 Jan 18, 2024
99e2ce1
lint
ritchie46 Jan 18, 2024
0c1c93b
fmt
ritchie46 Jan 18, 2024
32eecad
lint
ritchie46 Jan 18, 2024
574fbe1
lint
ritchie46 Jan 18, 2024
4c0c708
lint
ritchie46 Jan 18, 2024
71ce6d4
rust tests
ritchie46 Jan 18, 2024
301729c
fix parquet
ritchie46 Jan 19, 2024
c08ea2a
clippy
ritchie46 Jan 19, 2024
deec1eb
fix ooc sort
ritchie46 Jan 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 67 additions & 9 deletions crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,22 @@ mod private {
impl Sealed for str {}
impl Sealed for [u8] {}
}
pub use iterator::BinaryViewValueIter;
pub use mutable::MutableBinaryViewArray;
use private::Sealed;

use crate::array::binview::iterator::BinaryViewValueIter;
use crate::array::binview::view::{
validate_binary_view, validate_utf8_only_view, validate_utf8_view,
};
use crate::array::binview::view::{validate_binary_view, validate_utf8_only, validate_utf8_view};
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};

/// View array over raw bytes: [`BinaryViewArrayGeneric`] specialised to `[u8]` values.
pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
/// View array over UTF-8 text: [`BinaryViewArrayGeneric`] specialised to `str` values.
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;

/// Mutable view array holding `str` values.
pub type MutablePlString = MutableBinaryViewArray<str>;
/// Mutable view array holding `[u8]` values.
pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;

// Static data-type instances, so that `ViewType::dtype` can hand out `&'static` references.
static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;
static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;

pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
const IS_UTF8: bool;
const DATA_TYPE: ArrowDataType;
Expand All @@ -49,6 +52,8 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {

/// Converts the borrowed value into its owned counterpart (`Self::Owned`).
// The name follows the `into_*` convention while taking `&self`, hence the clippy allow below.
#[allow(clippy::wrong_self_convention)]
fn into_owned(&self) -> Self::Owned;

/// Returns a `'static` reference to the [`ArrowDataType`] associated with this view type.
fn dtype() -> &'static ArrowDataType;
}

impl ViewType for str {
Expand All @@ -69,6 +74,9 @@ impl ViewType for str {
/// Copies the string slice into a newly allocated owned value via `to_string`.
fn into_owned(&self) -> Self::Owned {
self.to_string()
}
/// Returns the shared static `ArrowDataType::Utf8View` instance.
fn dtype() -> &'static ArrowDataType {
&UTF8_VIEW_TYPE
}
}

impl ViewType for [u8] {
Expand All @@ -89,6 +97,10 @@ impl ViewType for [u8] {
/// Copies the byte slice into a newly allocated owned value via `to_vec`.
fn into_owned(&self) -> Self::Owned {
self.to_vec()
}

/// Returns the shared static `ArrowDataType::BinaryView` instance.
fn dtype() -> &'static ArrowDataType {
&BIN_VIEW_TYPE
}
}

pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
Expand All @@ -105,6 +117,12 @@ pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
total_buffer_len: usize,
}

impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {
    /// Element-wise equality of two view arrays.
    ///
    /// Two arrays are equal when they have the same number of elements and every pair of
    /// elements yielded by their iterators compares equal.
    fn eq(&self, other: &Self) -> bool {
        // `zip` stops as soon as the shorter iterator is exhausted, so without the explicit
        // length check an array would compare equal to any longer array it is a prefix of
        // (and an empty array would compare equal to everything).
        self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)
    }
}

impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
fn clone(&self) -> Self {
Self {
Expand Down Expand Up @@ -262,7 +280,7 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
// data: 12 bytes

let bytes = if len <= 12 {
let ptr = self.views.storage_ptr() as *const u8;
let ptr = self.views.as_ptr() as *const u8;
std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize)
} else {
let buffer_idx = (v >> 64) as u32;
Expand All @@ -285,6 +303,10 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
BinaryViewValueIter::new(self)
}

/// Returns an iterator over the length of every view, in order.
///
/// Each length is obtained by truncating the view to its low 32 bits (`view as u32`).
pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {
    self.views.iter().map(|&view| view as u32)
}

/// Returns an iterator of the non-null values.
pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {
NonNullValuesIter::new(self, self.validity())
Expand All @@ -299,13 +321,19 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
impl_mut_validity!();
impl_into_array!();

pub fn from<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
let mutable = MutableBinaryViewArray::from_iterator(
slice.as_ref().iter().map(|opt_v| opt_v.as_ref()),
);
mutable.into()
}

/// Builds an array from a slice of values.
///
/// Every element of `slice` is borrowed as `&T` and pushed through
/// `MutableBinaryViewArray::from_values_iter`; the mutable array is then converted into `Self`.
pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {
    let values = slice.as_ref().iter().map(|value| value.as_ref());
    MutableBinaryViewArray::from_values_iter(values).into()
}

/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.
pub fn total_bytes_len(&self) -> usize {
self.total_bytes_len
Expand All @@ -320,12 +348,40 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
pub fn len(&self) -> usize {
self.views.len()
}

/// Garbage collect.
///
/// Rebuilds the array by pushing every view, together with the current raw buffers, into a
/// fresh [`MutableBinaryViewArray`], then freezes it and re-attaches the original validity.
/// An array without any data buffers is returned unchanged.
pub fn gc(self) -> Self {
    // No data buffers at all: there is nothing to rebuild.
    if self.buffers.is_empty() {
        return self;
    }

    let raw_buffers = self.raw_buffers.as_ref();
    let mut compacted = MutableBinaryViewArray::with_capacity(self.len());

    for &view in self.views.as_ref() {
        // SAFETY (NOTE(review): confirm against `push_view`'s contract): every view is taken
        // from this array and is resolved against this array's own raw buffers.
        unsafe { compacted.push_view(view, raw_buffers) }
    }

    compacted.freeze().with_validity(self.validity)
}

/// Runs [`Self::gc`] only when the buffers look mostly unused.
///
/// The array is returned unchanged when the total buffer length is zero. Otherwise a lower
/// bound on the bytes that must live in the buffers is estimated and compared with the total
/// buffer length; if that fraction is below 25% the array is garbage collected.
pub fn maybe_gc(self) -> Self {
    if self.total_buffer_len == 0 {
        return self;
    }

    // Lower bound on the bytes stored in buffers: assume every value that can be inlined
    // (up to 12 bytes per view) is inlined, and subtract that maximum from the total.
    let inline_capacity = self.len() * 12;
    let min_in_buffer = self.total_bytes_len.saturating_sub(inline_capacity);

    // `+ 1` keeps the denominator strictly positive.
    let buffer_len = (self.total_buffer_len() + 1) as f64;
    let frac = (min_in_buffer as f64) / buffer_len;

    if frac < 0.25 {
        self.gc()
    } else {
        self
    }
}
}

impl BinaryViewArray {
/// Validate the underlying bytes on UTF-8.
pub fn validate_utf8(&self) -> PolarsResult<()> {
validate_utf8_only_view(&self.views, &self.buffers)
validate_utf8_only(&self.views, &self.buffers)
}

/// Convert [`BinaryViewArray`] to [`Utf8ViewArray`].
Expand Down Expand Up @@ -381,7 +437,7 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
}

fn data_type(&self) -> &ArrowDataType {
&self.data_type
T::dtype()
}

fn validity(&self) -> Option<&Bitmap> {
Expand All @@ -397,12 +453,14 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
}

/// Slices this array in place to `length` elements starting at `offset`.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee `offset + length <= self.len()`;
/// here it is only checked by a `debug_assert!`.
unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
debug_assert!(offset + length <= self.len());
// Slice the validity bitmap to the same window and keep it only if it still has unset bits.
self.validity = self
.validity
.take()
.map(|bitmap| bitmap.sliced_unchecked(offset, length))
.filter(|bitmap| bitmap.unset_bits() > 0);
self.views.slice_unchecked(offset, length);
// Recompute the cached byte total from the remaining views; this must run after `views`
// has been sliced, because `len_iter` reads `self.views`.
self.total_bytes_len = self.len_iter().map(|v| v as usize).sum::<usize>();
}

fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Expand Down
Loading
Loading