From 2908a80d9ca3e3fb0414e35b67856f1fb761304c Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:52:18 +0200 Subject: [PATCH] add `extend_dictionary` in dictionary builder for improved performance (#6875) * add `extend_dictionary` in dictionary builder for improved performance * fix extends all nulls * support null in mapped value * adding comment * run `clippy` and `fmt` * fix ci * Apply suggestions from code review Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- .../generic_bytes_dictionary_builder.rs | 187 ++++++++++++++++- .../builder/primitive_dictionary_builder.rs | 198 +++++++++++++++++- 2 files changed, 379 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index bb0fb5e91be2..ead151d5ceea 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -17,7 +17,7 @@ use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; -use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; +use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray}; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; use hashbrown::HashTable; @@ -305,6 +305,63 @@ where }; } + /// Extends builder with an existing dictionary array. + /// + /// This is the same as [`Self::extend`] but is faster as it translates + /// the dictionary values once rather than doing a lookup for each item in the iterator + /// + /// when dictionary values are null (the actual mapped values) the keys are null + /// + pub fn extend_dictionary( + &mut self, + dictionary: &TypedDictionaryArray>, + ) -> Result<(), ArrowError> { + let values = dictionary.values(); + + let v_len = values.len(); + let k_len = dictionary.keys().len(); + if v_len == 0 && k_len == 0 { + return Ok(()); + } + + // All nulls + if v_len == 0 { + self.append_nulls(k_len); + return Ok(()); + } + + if k_len == 0 { + return Err(ArrowError::InvalidArgumentError( + "Dictionary keys should not be empty when values are not empty".to_string(), + )); + } + + // Orphan values will be carried over to the new dictionary + let mapped_values = values + .iter() + // Dictionary values can technically be null, so we need to handle that + .map(|dict_value| { + dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose() + }) + .collect::, _>>()?; + + // Just insert the keys without additional lookups + dictionary.keys().iter().for_each(|key| match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + match mapped_values[index] { + None => self.append_null(), + Some(mapped_value) => self.keys_builder.append_value(mapped_value), + } + } + }); + + Ok(()) + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -445,8 +502,9 @@ mod tests { use super::*; use crate::array::Int8Array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -664,4 +722,129 @@ mod tests { assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); assert_eq!(dict.values().len(), 4); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend([None::<&str>]); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 6); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some("e"), + Some("e"), + Some("f"), + Some("e"), + Some("d"), + Some("a"), + Some("b"), + Some("c"), + Some("a"), + Some("b"), + Some("c"), + None, + Some("c"), + Some("d"), + Some("a"), + None + ] + ); + } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = GenericByteBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value("I like worm hugs"); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(Utf8Type::DATA_TYPE), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_string::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some("I like worm hugs")] + ); + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some("I like worm hugs")]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac40f8a469d3..282f0ae9d5b1 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -17,7 +17,9 @@ use crate::builder::{ArrayBuilder, PrimitiveBuilder}; use crate::types::ArrowDictionaryKeyType; -use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; +use crate::{ + Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray, +}; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -44,7 +46,7 @@ impl PartialEq for Value { impl Eq for Value {} -/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) +/// Builder for [`DictionaryArray`] of [`PrimitiveArray`] /// /// # Example: /// @@ -303,6 +305,63 @@ where }; } + /// Extends builder with dictionary + /// + /// This is the same as [`Self::extend`] but is faster as it translates + /// the dictionary values once rather than doing a lookup for each item in the iterator + /// + /// when dictionary values are null (the actual mapped values) the keys are null + /// + pub fn extend_dictionary( + &mut self, + dictionary: &TypedDictionaryArray>, + ) -> Result<(), ArrowError> { + let values = dictionary.values(); + + let v_len = values.len(); + let k_len = dictionary.keys().len(); + if v_len == 0 && k_len == 0 { + return Ok(()); + } + + // All nulls + if v_len == 0 { + self.append_nulls(k_len); + return Ok(()); + } + + if k_len == 0 { + return Err(ArrowError::InvalidArgumentError( + "Dictionary keys should not be empty when values are not empty".to_string(), + )); + } + + // Orphan values will be carried over to the new dictionary + let mapped_values = values + .iter() + // Dictionary values can technically be null, so we need to handle that + .map(|dict_value| { + dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose() + }) + .collect::, _>>()?; + + // Just insert the keys without additional lookups + dictionary.keys().iter().for_each(|key| match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + match mapped_values[index] { + None => self.append_null(), + Some(mapped_value) => self.keys_builder.append_value(mapped_value), + } + } + }); + + Ok(()) + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); @@ -368,9 +427,9 @@ impl Extend> mod tests { use super::*; - use crate::array::UInt32Array; - use crate::array::UInt8Array; + use crate::array::{Int32Array, UInt32Array, UInt8Array}; use crate::builder::Decimal128Builder; + use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] @@ -443,4 +502,135 @@ mod tests { ) ); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([None::]); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 7); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some(6), + Some(6), + Some(7), + Some(6), + Some(5), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + None, + Some(4), + Some(5), + Some(1), + Some(3), + Some(1), + None + ] + ); + } + + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = PrimitiveBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value(42); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_primitive::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some(42)] + ); + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some(42)]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } }