From 365a20a9576d973b951e4c5502c3b6ef0738b670 Mon Sep 17 00:00:00 2001 From: ritchie Date: Sat, 2 Mar 2024 10:33:37 +0100 Subject: [PATCH 1/2] feat: ensure binview types are rle-encoded in parquet writ --- .../polars-arrow/src/array/binview/mutable.rs | 26 ++++++++++++++++- crates/polars-arrow/src/array/indexable.rs | 25 +++++++++++++++-- .../src/compute/cast/binview_to.rs | 22 +++++++++++++++ crates/polars-arrow/src/compute/cast/mod.rs | 28 +++++++++++-------- crates/polars-io/src/parquet/write.rs | 8 ++++-- py-polars/tests/unit/io/test_parquet.py | 19 +++++++++++++ 6 files changed, 110 insertions(+), 18 deletions(-) diff --git a/crates/polars-arrow/src/array/binview/mutable.rs b/crates/polars-arrow/src/array/binview/mutable.rs index 26a3a2faa930..467a6a8785d3 100644 --- a/crates/polars-arrow/src/array/binview/mutable.rs +++ b/crates/polars-arrow/src/array/binview/mutable.rs @@ -8,7 +8,7 @@ use polars_utils::slice::GetSaferUnchecked; use crate::array::binview::iterator::MutableBinaryViewValueIter; use crate::array::binview::view::validate_utf8_only; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; -use crate::array::{Array, MutableArray, View}; +use crate::array::{Array, MutableArray, TryExtend, TryPush, View}; use crate::bitmap::MutableBitmap; use crate::buffer::Buffer; use crate::datatypes::ArrowDataType; @@ -328,6 +328,12 @@ impl MutableBinaryViewArray { self.into() } + #[inline] + pub fn value(&self, i: usize) -> &T { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } + } + /// Returns the element at index `i` /// /// # Safety @@ -444,3 +450,21 @@ impl MutableArray for MutableBinaryViewArray { self.views.shrink_to_fit() } } + +impl> TryExtend> for MutableBinaryViewArray { + /// This is infallible and is implemented for consistency with all other types + #[inline] + fn try_extend>>(&mut self, iter: I) -> PolarsResult<()> { + self.extend(iter.into_iter()); + Ok(()) + } +} + +impl> TryPush> for MutableBinaryViewArray { + /// This is infallible and is implemented for consistency with all other types + #[inline(always)] + fn try_push(&mut self, item: Option

) -> PolarsResult<()> { + self.push(item.as_ref().map(|p| p.as_ref())); + Ok(()) + } +} diff --git a/crates/polars-arrow/src/array/indexable.rs b/crates/polars-arrow/src/array/indexable.rs index b4f455ab00c4..dbf6b75c4bcf 100644 --- a/crates/polars-arrow/src/array/indexable.rs +++ b/crates/polars-arrow/src/array/indexable.rs @@ -1,8 +1,9 @@ use std::borrow::Borrow; use crate::array::{ - MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, - MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, MutableUtf8ValuesArray, + MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBinaryViewArray, + MutableBooleanArray, MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, + MutableUtf8ValuesArray, ViewType, }; use crate::offset::Offset; use crate::types::NativeType; @@ -125,6 +126,26 @@ impl AsIndexed for &[u8] { } } +impl Indexable for MutableBinaryViewArray { + type Value<'a> = &'a T; + type Type = T; + + fn value_at(&self, index: usize) -> Self::Value<'_> { + self.value(index) + } + + unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { + self.value_unchecked(index) + } +} + +impl AsIndexed> for &T { + #[inline] + fn as_indexed(&self) -> &T { + self + } +} + // TODO: should NativeType derive from Hash? impl Indexable for MutablePrimitiveArray { type Value<'a> = T; diff --git a/crates/polars-arrow/src/compute/cast/binview_to.rs b/crates/polars-arrow/src/compute/cast/binview_to.rs index f3c0a7de2b7c..daab95aac2e1 100644 --- a/crates/polars-arrow/src/compute/cast/binview_to.rs +++ b/crates/polars-arrow/src/compute/cast/binview_to.rs @@ -13,6 +13,28 @@ use crate::types::NativeType; pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; +/// Cast [`BinaryViewArray`] to [`DictionaryArray`], also known as packing. +/// # Errors +/// This function errors if the maximum key is smaller than the number of distinct elements +/// in the array. +pub(super) fn binview_to_dictionary( + from: &BinaryViewArray, +) -> PolarsResult> { + let mut array = MutableDictionaryArray::>::new(); + array.try_extend(from.iter())?; + + Ok(array.into()) +} + +pub(super) fn utf8view_to_dictionary( + from: &Utf8ViewArray, +) -> PolarsResult> { + let mut array = MutableDictionaryArray::>::new(); + array.try_extend(from.iter())?; + + Ok(array.into()) +} + pub(super) fn view_to_binary(array: &BinaryViewArray) -> BinaryArray { let len: usize = Array::len(array); let mut mutable = MutableBinaryValuesArray::::with_capacities(len, array.total_bytes_len()); diff --git a/crates/polars-arrow/src/compute/cast/mod.rs b/crates/polars-arrow/src/compute/cast/mod.rs index 015eac0606ea..fd215d798060 100644 --- a/crates/polars-arrow/src/compute/cast/mod.rs +++ b/crates/polars-arrow/src/compute/cast/mod.rs @@ -22,7 +22,8 @@ pub use utf8_to::*; use crate::array::*; use crate::compute::cast::binview_to::{ - utf8view_to_date32_dyn, utf8view_to_naive_timestamp_dyn, view_to_binary, + binview_to_dictionary, utf8view_to_date32_dyn, utf8view_to_dictionary, + utf8view_to_naive_timestamp_dyn, view_to_binary, }; use crate::datatypes::*; use crate::legacy::index::IdxSize; @@ -293,6 +294,12 @@ pub fn cast( (Struct(_), _) | (_, Struct(_)) => polars_bail!(InvalidOperation: "Cannot cast from struct to other types" ), + (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { + dictionary_cast_dyn::<$T>(array, to_type, options) + }), + (_, Dictionary(index_type, value_type, _)) => match_integer_type!(index_type, |$T| { + cast_to_dictionary::<$T>(array, value_type, options) + }), // not supported by polars // (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( // array.as_any().downcast_ref().unwrap(), @@ -320,11 +327,6 @@ pub fn cast( options, ) .map(|x| x.boxed()), - // not supported by polars - // (List(_), List(_)) => { - // cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) - // .map(|x| x.boxed()) - // }, (BinaryView, _) => match to_type { Utf8View => array .as_any() @@ -430,12 +432,6 @@ pub fn cast( } }, - (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { - dictionary_cast_dyn::<$T>(array, to_type, options) - }), - (_, Dictionary(index_type, value_type, _)) => match_integer_type!(index_type, |$T| { - cast_to_dictionary::<$T>(array, value_type, options) - }), (_, Boolean) => match from_type { UInt8 => primitive_to_boolean_dyn::(array, to_type.clone()), UInt16 => primitive_to_boolean_dyn::(array, to_type.clone()), @@ -774,6 +770,14 @@ fn cast_to_dictionary( ArrowDataType::UInt16 => primitive_to_dictionary_dyn::(array), ArrowDataType::UInt32 => primitive_to_dictionary_dyn::(array), ArrowDataType::UInt64 => primitive_to_dictionary_dyn::(array), + ArrowDataType::BinaryView => { + binview_to_dictionary::(array.as_any().downcast_ref().unwrap()) + .map(|arr| arr.boxed()) + }, + ArrowDataType::Utf8View => { + utf8view_to_dictionary::(array.as_any().downcast_ref().unwrap()) + .map(|arr| arr.boxed()) + }, ArrowDataType::LargeUtf8 => utf8_to_dictionary_dyn::(array), ArrowDataType::LargeBinary => binary_to_dictionary_dyn::(array), ArrowDataType::Time64(_) => primitive_to_dictionary_dyn::(array), diff --git a/crates/polars-io/src/parquet/write.rs b/crates/polars-io/src/parquet/write.rs index 776effe36f00..e18ed380167d 100644 --- a/crates/polars-io/src/parquet/write.rs +++ b/crates/polars-io/src/parquet/write.rs @@ -247,9 +247,11 @@ fn get_encodings(schema: &ArrowSchema) -> Vec> { /// Declare encodings fn encoding_map(data_type: &ArrowDataType) -> Encoding { match data_type.to_physical_type() { - PhysicalType::Dictionary(_) | PhysicalType::LargeBinary | PhysicalType::LargeUtf8 => { - Encoding::RleDictionary - }, + PhysicalType::Dictionary(_) + | PhysicalType::LargeBinary + | PhysicalType::LargeUtf8 + | PhysicalType::Utf8View + | PhysicalType::BinaryView => Encoding::RleDictionary, PhysicalType::Primitive(dt) => { use arrow::types::PrimitiveType::*; match dt { diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index f30045996793..17f962552caa 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -730,3 +730,22 @@ def test_parquet_rle_null_binary_read_14638() -> None: assert "RLE_DICTIONARY" in pq.read_metadata(f).row_group(0).column(0).encodings f.seek(0) assert_frame_equal(df, pl.read_parquet(f)) + + +def test_parquet_string_rle_encoding() -> None: + n = 3 + data = { + "id": ["abcdefgh"] * n, + } + + df = pl.DataFrame(data) + f = io.BytesIO() + df.write_parquet(f, use_pyarrow=False) + f.seek(0) + + assert ( + "RLE_DICTIONARY" + in pq.ParquetFile(f).metadata.to_dict()["row_groups"][0]["columns"][0][ + "encodings" + ] + ) From f87f8b5795f3cb5540f8d4f00419980b6964123f Mon Sep 17 00:00:00 2001 From: ritchie Date: Sat, 2 Mar 2024 11:40:24 +0100 Subject: [PATCH 2/2] keep null statistics of dict pags --- .../src/arrow/write/dictionary.rs | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/crates/polars-parquet/src/arrow/write/dictionary.rs b/crates/polars-parquet/src/arrow/write/dictionary.rs index cfc5ad888a84..1922c4363856 100644 --- a/crates/polars-parquet/src/arrow/write/dictionary.rs +++ b/crates/polars-parquet/src/arrow/write/dictionary.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, DictionaryArray, DictionaryKey, Utf8ViewArray}; +use arrow::array::{Array, BinaryViewArray, DictionaryArray, DictionaryKey, Utf8ViewArray}; use arrow::bitmap::{Bitmap, MutableBitmap}; use arrow::datatypes::{ArrowDataType, IntegerType}; use num_traits::ToPrimitive; @@ -242,7 +242,7 @@ pub fn array_to_pages( match encoding { Encoding::PlainDictionary | Encoding::RleDictionary => { // write DictPage - let (dict_page, statistics): (_, Option) = + let (dict_page, mut statistics): (_, Option) = match array.values().data_type().to_logical_type() { ArrowDataType::Int8 => dyn_prim!(i8, i32, array, options, type_), ArrowDataType::Int16 => dyn_prim!(i16, i32, array, options, type_), @@ -278,6 +278,22 @@ pub fn array_to_pages( }; (DictPage::new(buffer, array.len(), false), stats) }, + ArrowDataType::BinaryView => { + let array = array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let mut buffer = vec![]; + binview::encode_plain(array, &mut buffer); + + let stats = if options.write_statistics { + Some(binview::build_statistics(array, type_.clone())) + } else { + None + }; + (DictPage::new(buffer, array.len(), false), stats) + }, ArrowDataType::Utf8View => { let array = array .values() @@ -301,9 +317,7 @@ pub fn array_to_pages( let mut buffer = vec![]; binary_encode_plain::(values, &mut buffer); let stats = if options.write_statistics { - let mut stats = binary_build_statistics(values, type_.clone()); - stats.null_count = Some(array.null_count() as i64); - Some(stats) + Some(binary_build_statistics(values, type_.clone())) } else { None }; @@ -314,8 +328,7 @@ pub fn array_to_pages( let array = array.values().as_any().downcast_ref().unwrap(); fixed_binary_encode_plain(array, false, &mut buffer); let stats = if options.write_statistics { - let mut stats = fixed_binary_build_statistics(array, type_.clone()); - stats.null_count = Some(array.null_count() as i64); + let stats = fixed_binary_build_statistics(array, type_.clone()); Some(serialize_statistics(&stats)) } else { None @@ -329,6 +342,10 @@ pub fn array_to_pages( }, }; + if let Some(stats) = &mut statistics { + stats.null_count = Some(array.null_count() as i64) + } + // write DataPage pointing to DictPage let data_page = serialize_keys(array, type_, nested, statistics, options)?.unwrap_data();