From b3d84cb6b070501c3e9f2c12475f98980c4339dc Mon Sep 17 00:00:00 2001
From: Misaki Kasumi
Date: Sat, 20 Jul 2024 18:50:19 +0800
Subject: [PATCH] fix(rust,python): Fix GC logic in write_ipc

---
 .../polars-arrow/src/io/ipc/write/common.rs   | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/crates/polars-arrow/src/io/ipc/write/common.rs b/crates/polars-arrow/src/io/ipc/write/common.rs
index aa07e2184f47..30312bf7f19d 100644
--- a/crates/polars-arrow/src/io/ipc/write/common.rs
+++ b/crates/polars-arrow/src/io/ipc/write/common.rs
@@ -265,6 +265,22 @@ fn set_variadic_buffer_counts(counts: &mut Vec<i64>, array: &dyn Array) {
     }
 }
 
+fn gc_bin_view<'a, T: ViewType + ?Sized>(
+    arr: &'a Box<dyn Array>,
+    concrete_arr: &'a BinaryViewArrayGeneric<T>,
+) -> Cow<'a, Box<dyn Array>> {
+    let bytes_len = concrete_arr.total_bytes_len();
+    let buffer_len = concrete_arr.total_buffer_len();
+    let extra_len = buffer_len.saturating_sub(bytes_len);
+    if extra_len < bytes_len.min(1024) {
+        // We can afford some tiny waste.
+        Cow::Borrowed(arr)
+    } else {
+        // Force GC it.
+        Cow::Owned(concrete_arr.clone().gc().boxed())
+    }
+}
+
 /// Write [`RecordBatchT`] into two sets of bytes, one for the header (ipc::Schema::Message) and the
 /// other for the batch's data
 fn chunk_to_bytes_amortized(
@@ -284,19 +300,11 @@ fn chunk_to_bytes_amortized(
         let array = match array.data_type() {
             ArrowDataType::BinaryView => {
                 let concrete_arr = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
-                if concrete_arr.is_sliced() {
-                    Cow::Owned(concrete_arr.clone().maybe_gc().boxed())
-                } else {
-                    Cow::Borrowed(array)
-                }
+                gc_bin_view(array, concrete_arr)
             },
             ArrowDataType::Utf8View => {
                 let concrete_arr = array.as_any().downcast_ref::<Utf8ViewArray>().unwrap();
-                if concrete_arr.is_sliced() {
-                    Cow::Owned(concrete_arr.clone().maybe_gc().boxed())
-                } else {
-                    Cow::Borrowed(array)
-                }
+                gc_bin_view(array, concrete_arr)
             },
             _ => Cow::Borrowed(array),
         };