Skip to content

Commit

Permalink
change builders
Browse files (browse the repository at this point in the history)
  • Loading branch information
ritchie46 committed Jan 15, 2024
1 parent 12a2c80 commit 3db3608
Show file tree
Hide file tree
Showing 12 changed files with 83 additions and 188 deletions.
15 changes: 15 additions & 0 deletions crates/polars-arrow/src/array/binview/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,21 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
}
}

/// Appends `additional` copies of `value` to the array.
///
/// * `Some(v)` appends `additional` valid slots that all share one view of `v`.
/// * `None` appends `additional` null slots (all-zero views, flagged as null in the
///   validity bitmap).
///
/// Calling this with `additional == 0` is a no-op.
pub fn extend_constant<V: AsRef<T>>(&mut self, additional: usize, value: Option<V>) {
    // Nothing to append: avoid writing `value` into the buffers (or allocating a
    // validity bitmap) when no view would end up referencing it.
    if additional == 0 {
        return;
    }

    // Nulls can only be expressed through the validity bitmap. If no bitmap exists yet,
    // create one that marks every slot pushed so far as valid; otherwise the null slots
    // appended below would silently read back as valid, empty values.
    if value.is_none() && self.validity.is_none() {
        let existing = self.views.len();
        self.validity
            .get_or_insert_with(Default::default)
            .extend_constant(existing, true);
    }

    if let Some(validity) = &mut self.validity {
        validity.extend_constant(additional, value.is_some());
    }

    // Push and pop to get the properly encoded value.
    // For long string this leads to a dictionary encoding,
    // as we push the string only once in the buffers
    let view_value = value
        .map(|v| {
            self.push_value_ignore_validity(v);
            // The push above is relied upon to have appended exactly one view.
            self.views.pop().unwrap()
        })
        // Null slots carry an all-zero view.
        .unwrap_or(0);
    self.views
        .extend(std::iter::repeat(view_value).take(additional));
}

impl_mutable_array_mut_validity!();

#[inline]
Expand Down
93 changes: 0 additions & 93 deletions crates/polars-core/src/chunked_array/builder/binary.rs

This file was deleted.

15 changes: 6 additions & 9 deletions crates/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
mod binary;
mod boolean;
#[cfg(feature = "dtype-array")]
pub mod fixed_size_list;
Expand All @@ -7,14 +6,12 @@ mod null;
mod primitive;
mod string;

use std::borrow::Cow;
use std::iter::FromIterator;
use std::marker::PhantomData;
use std::sync::Arc;

use arrow::array::*;
use arrow::bitmap::Bitmap;
pub use binary::*;
pub use boolean::*;
#[cfg(feature = "dtype-array")]
pub(crate) use fixed_size_list::*;
Expand Down Expand Up @@ -148,15 +145,15 @@ where

fn from_iter_options(name: &str, it: impl Iterator<Item = Option<S>>) -> Self {
let cap = get_iter_capacity(&it);
let mut builder = StringChunkedBuilder::new(name, cap, cap * 5);
let mut builder = StringChunkedBuilder::new(name, cap);
it.for_each(|opt| builder.append_option(opt));
builder.finish()
}

/// Create a new ChunkedArray from an iterator.
fn from_iter_values(name: &str, it: impl Iterator<Item = S>) -> Self {
let cap = get_iter_capacity(&it);
let mut builder = StringChunkedBuilder::new(name, cap, cap * 5);
let mut builder = StringChunkedBuilder::new(name, cap);
it.for_each(|v| builder.append_value(v));
builder.finish()
}
Expand Down Expand Up @@ -187,16 +184,16 @@ where

fn from_iter_options(name: &str, it: impl Iterator<Item = Option<B>>) -> Self {
let cap = get_iter_capacity(&it);
let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5);
it.for_each(|opt| builder.append_option(opt));
let mut builder = BinaryChunkedBuilder::new(name, cap);
builder.chunk_builder.extend(it);
builder.finish()
}

/// Create a new ChunkedArray from an iterator.
fn from_iter_values(name: &str, it: impl Iterator<Item = B>) -> Self {
let cap = get_iter_capacity(&it);
let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5);
it.for_each(|v| builder.append_value(v));
let mut builder = BinaryChunkedBuilder::new(name, cap);
builder.chunk_builder.extend_values(it);
builder.finish()
}
}
Expand Down
87 changes: 37 additions & 50 deletions crates/polars-core/src/chunked_array/builder/string.rs
Original file line number Diff line number Diff line change
@@ -1,49 +1,56 @@
use super::*;

#[derive(Clone)]
pub struct StringChunkedBuilder {
pub(crate) builder: MutableUtf8Array<i64>,
pub capacity: usize,
pub(crate) field: Field,
pub struct BinViewChunkedBuilder<T: ViewType + ?Sized> {
pub(crate) chunk_builder: MutableBinaryViewArray<T>,
pub(crate) field: FieldRef
}

impl StringChunkedBuilder {
pub type StringChunkedBuilder = BinViewChunkedBuilder<str>;
pub type BinaryChunkedBuilder = BinViewChunkedBuilder<[u8]>;

impl<T: ViewType + ?Sized> BinViewChunkedBuilder<T> {
/// Create a new `BinViewChunkedBuilder` (a `StringChunkedBuilder` or `BinaryChunkedBuilder`, depending on `T`)
///
/// # Arguments
///
/// * `capacity` - Number of string elements in the final array.
/// * `bytes_capacity` - Number of bytes needed to store the string values.
pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self {
StringChunkedBuilder {
builder: MutableUtf8Array::<i64>::with_capacities(capacity, bytes_capacity),
capacity,
field: Field::new(name, DataType::String),
pub fn new(name: &str, capacity: usize) -> Self {
Self {
chunk_builder: MutableBinaryViewArray::with_capacity(capacity),
field: Arc::new(Field::new(name, DataType::from(&T::DATA_TYPE)))
}
}

/// Appends a value of type `T` into the builder
#[inline]
pub fn append_value<S: AsRef<str>>(&mut self, v: S) {
self.builder.push(Some(v.as_ref()));
pub fn append_value<S: AsRef<T>>(&mut self, v: S) {
self.chunk_builder.push_value(v.as_ref());
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
self.builder.push::<&str>(None);
self.chunk_builder.push_null()
}

#[inline]
pub fn append_option<S: AsRef<str>>(&mut self, opt: Option<S>) {
self.builder.push(opt);
pub fn append_option<S: AsRef<T>>(&mut self, opt: Option<S>) {
self.chunk_builder.push(opt);
}

fn shrink_to_fit(&mut self) {
self.chunk_builder.shrink_to_fit()
}
}

impl StringChunkedBuilder {
pub fn finish(mut self) -> StringChunked {
let arr = self.builder.as_box();
let arr = self.chunk_builder.as_box();

let mut ca = ChunkedArray {
field: Arc::new(self.field),
field: self.field,
chunks: vec![arr],
phantom: PhantomData,
bit_settings: Default::default(),
Expand All @@ -53,40 +60,20 @@ impl StringChunkedBuilder {
ca.compute_len();
ca
}

fn shrink_to_fit(&mut self) {
self.builder.shrink_to_fit()
}
}

pub struct StringChunkedBuilderCow {
builder: StringChunkedBuilder,
}

impl StringChunkedBuilderCow {
pub fn new(name: &str, capacity: usize) -> Self {
StringChunkedBuilderCow {
builder: StringChunkedBuilder::new(name, capacity, capacity),
}
}
}
impl BinaryChunkedBuilder {
pub fn finish(mut self) -> BinaryChunked {
let arr = self.chunk_builder.as_box();

impl ChunkedBuilder<Cow<'_, str>, StringType> for StringChunkedBuilderCow {
#[inline]
fn append_value(&mut self, val: Cow<'_, str>) {
self.builder.append_value(val.as_ref())
}

#[inline]
fn append_null(&mut self) {
self.builder.append_null()
}

fn finish(self) -> ChunkedArray<StringType> {
self.builder.finish()
}

fn shrink_to_fit(&mut self) {
self.builder.shrink_to_fit()
let mut ca = ChunkedArray {
field: self.field,
chunks: vec![arr],
phantom: PhantomData,
bit_settings: Default::default(),
length: 0,
null_count: 0,
};
ca.compute_len();
ca
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ impl LogicalType for CategoricalChunked {
let mapping = &**self.get_rev_map();

let mut builder =
StringChunkedBuilder::new(self.physical.name(), self.len(), self.len() * 5);
StringChunkedBuilder::new(self.physical.name(), self.len());

let f = |idx: u32| mapping.get(idx);

Expand Down
11 changes: 5 additions & 6 deletions crates/polars-core/src/chunked_array/ops/explode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -349,8 +349,7 @@ impl ExplodeByOffsets for BinaryChunked {
let arr = self.downcast_iter().next().unwrap();

let cap = get_capacity(offsets);
let bytes_size = self.get_values_size();
let mut builder = BinaryChunkedBuilder::new(self.name(), cap, bytes_size);
let mut builder = BinaryChunkedBuilder::new(self.name(), cap);

let mut start = offsets[0] as usize;
let mut last = start;
Expand All @@ -361,10 +360,10 @@ impl ExplodeByOffsets for BinaryChunked {
let vals = arr.slice_typed(start, last - start);
if vals.null_count() == 0 {
builder
.builder
.chunk_builder
.extend_trusted_len_values(vals.values_iter())
} else {
builder.builder.extend_trusted_len(vals.into_iter());
builder.chunk_builder.extend_trusted_len(vals.into_iter());
}
}
builder.append_null();
Expand All @@ -375,10 +374,10 @@ impl ExplodeByOffsets for BinaryChunked {
let vals = arr.slice_typed(start, last - start);
if vals.null_count() == 0 {
builder
.builder
.chunk_builder
.extend_trusted_len_values(vals.values_iter())
} else {
builder.builder.extend_trusted_len(vals.into_iter());
builder.chunk_builder.extend_trusted_len(vals.into_iter());
}
builder.finish().into()
}
Expand Down
13 changes: 4 additions & 9 deletions crates/polars-core/src/chunked_array/ops/full.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,8 @@ impl ChunkFullNull for BooleanChunked {

impl<'a> ChunkFull<&'a str> for StringChunked {
fn full(name: &str, value: &'a str, length: usize) -> Self {
let mut builder = StringChunkedBuilder::new(name, length, length * value.len());

for _ in 0..length {
builder.append_value(value);
}
let mut builder = StringChunkedBuilder::new(name, length);
builder.chunk_builder.extend_constant(length, Some(value));
let mut out = builder.finish();
out.set_sorted_flag(IsSorted::Ascending);
out
Expand All @@ -66,11 +63,9 @@ impl ChunkFullNull for StringChunked {

impl<'a> ChunkFull<&'a [u8]> for BinaryChunked {
fn full(name: &str, value: &'a [u8], length: usize) -> Self {
let mut builder = BinaryChunkedBuilder::new(name, length, length * value.len());

for _ in 0..length {
builder.append_value(value);
}
let mut builder = BinaryChunkedBuilder::new(name, length);
builder.chunk_builder.extend_constant(length, Some(value));
let mut out = builder.finish();
out.set_sorted_flag(IsSorted::Ascending);
out
Expand Down
Loading

0 comments on commit 3db3608

Please sign in to comment.