From 3490639252294215c7ee05990d82b43e2cd097a6 Mon Sep 17 00:00:00 2001 From: Dario Curreri <48800335+dariocurr@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:54:22 +0200 Subject: [PATCH 1/8] Update lexical-core requirement from 0.8 to 1.0 (to resolve RUSTSEC-2023-0086) (#6402) * Update lexical-core requirement from 0.8 to 1.0 * Remove safety comment --- arrow-cast/Cargo.toml | 2 +- arrow-cast/src/display.rs | 4 +--- arrow-csv/Cargo.toml | 2 +- arrow-json/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 86f1a431a8b2..4046f5226094 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,7 +49,7 @@ arrow-select = { workspace = true } chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } -lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } +lexical-core = { version = "1.0", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } atoi = "2.0.0" comfy-table = { version = "7.0", optional = true, default-features = false } base64 = "0.22" diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 312e7973963e..6373cf67840a 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -421,9 +421,7 @@ macro_rules! 
primitive_display { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); let mut buffer = [0u8; <$t as ArrowPrimitiveType>::Native::FORMATTED_SIZE]; - // SAFETY: - // buffer is T::FORMATTED_SIZE - let b = unsafe { lexical_core::write_unchecked(value, &mut buffer) }; + let b = lexical_core::write(value, &mut buffer); // Lexical core produces valid UTF-8 let s = unsafe { std::str::from_utf8_unchecked(b) }; f.write_str(s)?; diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index d29c85c56cfd..be213c9363c2 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -43,7 +43,7 @@ chrono = { workspace = true } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1" } lazy_static = { version = "1.4", default-features = false } -lexical-core = { version = "^0.8", default-features = false } +lexical-core = { version = "1.0", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index dd232f197ead..517bb03d2064 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -45,7 +45,7 @@ num = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } chrono = { workspace = true } -lexical-core = { version = "0.8", default-features = false } +lexical-core = { version = "1.0", default-features = false} [dev-dependencies] tempfile = "3.3" From aad55d58e4b6bc636686ab2f1c5c7261310cd51d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Sep 2024 12:16:15 -0400 Subject: [PATCH 2/8] Remove "NOT YET FULLY SUPPORTED" comment from DataType::Utf8View/BinaryView (#6380) --- arrow-schema/src/datatype.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arrow-schema/src/datatype.rs 
b/arrow-schema/src/datatype.rs index 1848c8b3f76e..b9cfc3d8a848 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -261,9 +261,7 @@ pub enum DataType { /// A single LargeBinary array can store up to [`i64::MAX`] bytes /// of binary data in total. LargeBinary, - /// (NOT YET FULLY SUPPORTED) Opaque binary data of variable length. - /// - /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. + /// Opaque binary data of variable length. /// /// Logically the same as [`Self::Binary`], but the internal representation uses a view /// struct that contains the string length and either the string's entire data @@ -280,9 +278,7 @@ pub enum DataType { /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes /// of string data in total. LargeUtf8, - /// (NOT YET FULLY SUPPORTED) A variable-length string in Unicode with UTF-8 encoding - /// - /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. 
+ /// A variable-length string in Unicode with UTF-8 encoding /// /// Logically the same as [`Self::Utf8`], but the internal representation uses a view /// struct that contains the string length and either the string's entire data From 5414f1d7c0683c64d69cf721a83c17d677c78a71 Mon Sep 17 00:00:00 2001 From: Dario Curreri <48800335+dariocurr@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:19:10 +0200 Subject: [PATCH 3/8] Move lifetime of `take_iter` from iterator to its items (#6403) --- arrow-array/src/array/binary_array.rs | 4 ++-- arrow-array/src/array/string_array.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 6b18cbc2d9f7..8f8a39b2093f 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -84,7 +84,7 @@ impl GenericBinaryArray { pub fn take_iter<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| opt_index.map(|index| self.value(index))) } @@ -95,7 +95,7 @@ impl GenericBinaryArray { pub unsafe fn take_iter_unchecked<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 4722de55d39d..25581cfaa49d 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -42,7 +42,7 @@ impl GenericStringArray { pub fn take_iter<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| opt_index.map(|index| self.value(index))) } @@ -53,7 +53,7 @@ impl GenericStringArray { pub unsafe fn take_iter_unchecked<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| 
opt_index.map(|index| self.value_unchecked(index))) } From d7e87022d9ebeba0a9ff4177e70fdc698278b766 Mon Sep 17 00:00:00 2001 From: Tzu Gwo Date: Thu, 19 Sep 2024 04:47:51 +0800 Subject: [PATCH 4/8] Derive `Clone` for `object_store::aws::AmazonS3` (#6414) --- object_store/src/aws/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 4a773e7a1879..a27ed053317e 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -77,7 +77,7 @@ use crate::client::parts::Parts; pub use credential::{AwsAuthorizer, AwsCredential}; /// Interface for [Amazon S3](https://aws.amazon.com/s3/). -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct AmazonS3 { client: Arc, } From f5a6382f030d0a89498e543cc8ed97fbabac95d3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 18 Sep 2024 14:10:50 -0700 Subject: [PATCH 5/8] fix: binary_mut should work if only one input array has null buffer (#6396) * fix: binary_mut should work if only one input array has null buffer * Avoid copying null buffer in binary_mut * Update arrow-arith/src/arity.rs Co-authored-by: Andrew Lamb * Update arrow-arith/src/arity.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-arith/src/arity.rs | 66 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 17c1b0dbccf0..9cf4453d7fca 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -313,8 +313,6 @@ where )))); } - let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); - let mut builder = a.into_builder()?; builder @@ -323,7 +321,12 @@ where .zip(b.values()) .for_each(|(l, r)| *l = op(*l, *r)); - let array_builder = builder.finish().into_data().into_builder().nulls(nulls); + let array = builder.finish(); + + // The builder has the null buffer from `a`, it is not changed. 
+ let nulls = NullBuffer::union(array.logical_nulls().as_ref(), b.logical_nulls().as_ref()); + + let array_builder = array.into_data().into_builder().nulls(nulls); let array_data = unsafe { array_builder.build_unchecked() }; Ok(Ok(PrimitiveArray::::from(array_data))) @@ -413,7 +416,8 @@ where try_binary_no_nulls_mut(len, a, b, op) } else { let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); + create_union_null_buffer(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) + .unwrap(); let mut builder = a.into_builder()?; @@ -435,6 +439,22 @@ where } } +/// Computes the union of the nulls in two optional [`NullBuffer`] which +/// is not shared with the input buffers. +/// +/// The union of the nulls is the same as `NullBuffer::union(lhs, rhs)` but +/// it does not increase the reference count of the null buffer. +fn create_union_null_buffer( + lhs: Option<&NullBuffer>, + rhs: Option<&NullBuffer>, +) -> Option { + match (lhs, rhs) { + (Some(lhs), Some(rhs)) => Some(NullBuffer::new(lhs.inner() & rhs.inner())), + (Some(n), None) | (None, Some(n)) => Some(NullBuffer::new(n.inner() & n.inner())), + (None, None) => None, + } +} + /// This intentional inline(never) attribute helps LLVM optimize the loop. 
#[inline(never)] fn try_binary_no_nulls( @@ -557,6 +577,25 @@ mod tests { assert_eq!(c, expected); } + #[test] + fn test_binary_mut_null_buffer() { + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + + let b = Int32Array::from(vec![Some(10), Some(11), Some(12), Some(13), Some(14)]); + + let r1 = binary_mut(a, &b, |a, b| a + b).unwrap(); + + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + let b = Int32Array::new( + vec![10, 11, 12, 13, 14].into(), + Some(vec![true, true, true, true, true].into()), + ); + + // unwrap here means that no copying occured + let r2 = binary_mut(a, &b, |a, b| a + b).unwrap(); + assert_eq!(r1.unwrap(), r2.unwrap()); + } + #[test] fn test_try_binary_mut() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); @@ -587,6 +626,25 @@ mod tests { .expect_err("should got error"); } + #[test] + fn test_try_binary_mut_null_buffer() { + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + + let b = Int32Array::from(vec![Some(10), Some(11), Some(12), Some(13), Some(14)]); + + let r1 = try_binary_mut(a, &b, |a, b| Ok(a + b)).unwrap(); + + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + let b = Int32Array::new( + vec![10, 11, 12, 13, 14].into(), + Some(vec![true, true, true, true, true].into()), + ); + + // unwrap here means that no copying occured + let r2 = try_binary_mut(a, &b, |a, b| Ok(a + b)).unwrap(); + assert_eq!(r1.unwrap(), r2.unwrap()); + } + #[test] fn test_unary_dict_mut() { let values = Int32Array::from(vec![Some(10), Some(20), None]); From e7598a4fc3d0c24c191e5f021b1156f2777068cf Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 18 Sep 2024 23:12:45 +0200 Subject: [PATCH 6/8] Fix encoding/decoding REE Dicts when using streaming IPC (#6399) * arrow-ipc: Add test for streaming IPC with REE dicts * arrow-schema: Include child fields of REE fields --- arrow-ipc/src/reader/stream.rs | 63 ++++++++++++++++++++++++++++++++-- 
arrow-schema/src/field.rs | 1 + 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/arrow-ipc/src/reader/stream.rs b/arrow-ipc/src/reader/stream.rs index 64191a22b33e..de5f5bdd629f 100644 --- a/arrow-ipc/src/reader/stream.rs +++ b/arrow-ipc/src/reader/stream.rs @@ -275,8 +275,10 @@ impl StreamDecoder { #[cfg(test)] mod tests { use super::*; - use crate::writer::StreamWriter; - use arrow_array::{Int32Array, Int64Array, RecordBatch}; + use crate::writer::{IpcWriteOptions, StreamWriter}; + use arrow_array::{ + types::Int32Type, DictionaryArray, Int32Array, Int64Array, RecordBatch, RunArray, + }; use arrow_schema::{DataType, Field, Schema}; // Further tests in arrow-integration-testing/tests/ipc_reader.rs @@ -315,4 +317,61 @@ mod tests { let err = decoder.finish().unwrap_err().to_string(); assert_eq!(err, "Ipc error: Unexpected End of Stream"); } + + #[test] + fn test_read_ree_dict_record_batches_from_buffer() { + let schema = Schema::new(vec![Field::new( + "test1", + DataType::RunEndEncoded( + Arc::new(Field::new("run_ends".to_string(), DataType::Int32, false)), + Arc::new(Field::new_dict( + "values".to_string(), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + 0, + false, + )), + ), + true, + )]); + let batch = RecordBatch::try_new( + schema.clone().into(), + vec![Arc::new( + RunArray::try_new( + &Int32Array::from(vec![1, 2, 3]), + &vec![Some("a"), None, Some("a")] + .into_iter() + .collect::>(), + ) + .expect("Failed to create RunArray"), + )], + ) + .expect("Failed to create RecordBatch"); + + let mut buffer = vec![]; + { + let mut writer = StreamWriter::try_new_with_options( + &mut buffer, + &schema, + IpcWriteOptions::default().with_preserve_dict_id(false), + ) + .expect("Failed to create StreamWriter"); + writer.write(&batch).expect("Failed to write RecordBatch"); + writer.finish().expect("Failed to finish StreamWriter"); + } + + let mut decoder = StreamDecoder::new(); + let buf = &mut 
Buffer::from(buffer.as_slice()); + while let Some(batch) = decoder + .decode(buf) + .map_err(|e| { + ArrowError::ExternalError(format!("Failed to decode record batch: {}", e).into()) + }) + .expect("Failed to decode record batch") + { + assert_eq!(batch, batch); + } + + decoder.finish().expect("Failed to finish decoder"); + } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index a84a6ada334d..fc4852a3d37d 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -375,6 +375,7 @@ impl Field { | DataType::FixedSizeList(field, _) | DataType::Map(field, _) => field.fields(), DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), + DataType::RunEndEncoded(_, field) => field.fields(), _ => vec![], } } From d274b6965497790f25f7bd8d56b675fc03410edb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 18 Sep 2024 19:52:31 -0600 Subject: [PATCH 7/8] fix: Stop losing precision and scale when casting decimal to dictionary (#6383) * Stop losing precision and scale when casting decimal to dictionary * address feedback --- arrow-cast/src/cast/dictionary.rs | 38 +++++++++++++++++++++++++++---- arrow-cast/src/cast/mod.rs | 32 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs index daaddc4915ef..fc4d99430151 100644 --- a/arrow-cast/src/cast/dictionary.rs +++ b/arrow-cast/src/cast/dictionary.rs @@ -202,11 +202,41 @@ pub(crate) fn cast_to_dictionary( UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - Decimal128(_, _) => { - pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + Decimal128(p, s) => { + let dict = pack_numeric_to_dictionary::( + array, + dict_value_type, + cast_options, + )?; + let dict = dict + 
.as_dictionary::() + .downcast_dict::() + .unwrap(); + let value = dict.values().clone(); + // Set correct precision/scale + let value = value.with_precision_and_scale(p, s)?; + Ok(Arc::new(DictionaryArray::::try_new( + dict.keys().clone(), + Arc::new(value), + )?)) } - Decimal256(_, _) => { - pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + Decimal256(p, s) => { + let dict = pack_numeric_to_dictionary::( + array, + dict_value_type, + cast_options, + )?; + let dict = dict + .as_dictionary::() + .downcast_dict::() + .unwrap(); + let value = dict.values().clone(); + // Set correct precision/scale + let value = value.with_precision_and_scale(p, s)?; + Ok(Arc::new(DictionaryArray::::try_new( + dict.keys().clone(), + Arc::new(value), + )?)) } Float16 => { pack_numeric_to_dictionary::(array, dict_value_type, cast_options) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index fe59a141cbe2..e80d497c8cba 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -2650,6 +2650,38 @@ mod tests { err.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal128_to_decimal128_dict() { + let p = 20; + let s = 3; + let input_type = DataType::Decimal128(p, s); + let output_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal128(p, s)), + ); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal_array(array, p, s).unwrap(); + let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap(); + assert_eq!(cast_array.data_type(), &output_type); + } + + #[test] + fn test_cast_decimal256_to_decimal256_dict() { + let p = 20; + let s = 3; + let input_type = DataType::Decimal256(p, s); + let output_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal256(p, s)), + ); + assert!(can_cast_types(&input_type, &output_type)); + let array = 
vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal_array(array, p, s).unwrap(); + let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap(); + assert_eq!(cast_array.data_type(), &output_type); + } + #[test] fn test_cast_decimal128_to_decimal128_overflow() { let input_type = DataType::Decimal128(38, 3); From 13902836eaecc2c99d603b4cf41281e6d4671a85 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 19 Sep 2024 18:55:09 +0800 Subject: [PATCH 8/8] Rephrase doc comment (#6421) * docs: rephase some Signed-off-by: Ruihang Xia * fix all warnings Signed-off-by: Ruihang Xia * big letter at the beginning Signed-off-by: Ruihang Xia * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow/src/pyarrow.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-array/src/types.rs Co-authored-by: Matthijs Brobbel --------- Signed-off-by: Ruihang Xia Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Matthijs Brobbel --- arrow-arith/src/arity.rs | 17 +++++++++++------ arrow-arith/src/temporal.rs | 8 ++++++-- .../src/builder/generic_bytes_view_builder.rs | 3 ++- arrow-array/src/ffi_stream.rs | 1 + arrow-array/src/types.rs | 4 +++- arrow-buffer/src/builder/null.rs | 1 + arrow-buffer/src/util/bit_mask.rs | 6 ++++-- arrow-data/src/data.rs | 8 +++++--- arrow-flight/src/lib.rs | 3 +-- arrow-flight/src/sql/server.rs | 1 + arrow-ipc/src/writer.rs | 4 +++- arrow-select/src/filter.rs | 6 ++++-- arrow-string/src/length.rs | 1 + arrow/src/pyarrow.rs | 7 +++++-- arrow/src/util/bench_util.rs | 4 +++- parquet/src/basic.rs | 2 ++ parquet/src/bloom_filter/mod.rs | 7 ++++--- parquet/src/encodings/decoding.rs | 4 ++++ parquet/src/lib.rs | 4 ++-- parquet/src/record/record_reader.rs | 3 ++- parquet/src/record/record_writer.rs | 13 ++++++++++--- parquet_derive/src/lib.rs 
| 10 ++++++---- 22 files changed, 81 insertions(+), 36 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 9cf4453d7fca..bb983e1225ac 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -332,8 +332,10 @@ where Ok(Ok(PrimitiveArray::::from(array_data))) } -/// Applies the provided fallible binary operation across `a` and `b`, returning any error, -/// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a` +/// Applies the provided fallible binary operation across `a` and `b`. +/// +/// This will return any error encountered, or collect the results into +/// a [`PrimitiveArray`]. If any index is null in either `a` /// or `b`, the corresponding index in the result will also be null /// /// Like [`try_unary`] the function is only evaluated for non-null indices @@ -384,12 +386,15 @@ where } /// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable -/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in -/// either `a` or `b`, the corresponding index in the result will also be null +/// [`PrimitiveArray`] `a` with the results. /// -/// Like [`try_unary`] the function is only evaluated for non-null indices +/// Returns any error encountered, or collects the results into a [`PrimitiveArray`] as return +/// value. If any index is null in either `a` or `b`, the corresponding index in the result will +/// also be null. +/// +/// Like [`try_unary`] the function is only evaluated for non-null indices. /// -/// See [`binary_mut`] for errors and buffer reuse information +/// See [`binary_mut`] for errors and buffer reuse information. 
pub fn try_binary_mut( a: PrimitiveArray, b: &PrimitiveArray, diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 5f3eeb325104..09d690d3237c 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -666,6 +666,7 @@ impl ChronoDateExt for T { /// Parse the given string into a string representing fixed-offset that is correct as of the given /// UTC NaiveDateTime. +/// /// Note that the offset is function of time and can vary depending on whether daylight savings is /// in effect or not. e.g. Australia/Sydney is +10:00 or +11:00 depending on DST. #[deprecated(note = "Use arrow_array::timezone::Tz instead")] @@ -811,6 +812,7 @@ where } /// Extracts the day of a given temporal array as an array of integers. +/// /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. #[deprecated(since = "51.0.0", note = "Use `date_part` instead")] @@ -828,7 +830,8 @@ where date_part_primitive(array, DatePart::Day) } -/// Extracts the day of year of a given temporal array as an array of integers +/// Extracts the day of year of a given temporal array as an array of integers. +/// /// The day of year that ranges from 1 to 366. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -837,7 +840,8 @@ pub fn doy_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::DayOfYear) } -/// Extracts the day of year of a given temporal primitive array as an array of integers +/// Extracts the day of year of a given temporal primitive array as an array of integers. 
+/// /// The day of year that ranges from 1 to 366 #[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn doy(array: &PrimitiveArray) -> Result diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 3a9cf17c028e..09277c679c16 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -515,7 +515,8 @@ fn make_inlined_view(data: &[u8]) -> u128 { u128::from_le_bytes(view_buffer) } -/// Create a view based on the given data, block id and offset +/// Create a view based on the given data, block id and offset. +/// /// Note that the code below is carefully examined with x86_64 assembly code: /// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), /// which slows down things. diff --git a/arrow-array/src/ffi_stream.rs b/arrow-array/src/ffi_stream.rs index 6f3405ead7b0..db44ebad1c22 100644 --- a/arrow-array/src/ffi_stream.rs +++ b/arrow-array/src/ffi_stream.rs @@ -275,6 +275,7 @@ fn get_error_code(err: &ArrowError) -> i32 { } /// A `RecordBatchReader` which imports Arrays from `FFI_ArrowArrayStream`. +/// /// Struct used to fetch `RecordBatch` from the C Stream Interface. /// Its main responsibility is to expose `RecordBatchReader` functionality /// that requires [FFI_ArrowArrayStream]. diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 550d1aadf3fa..b39c9c40311b 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -50,7 +50,9 @@ impl BooleanType { pub const DATA_TYPE: DataType = DataType::Boolean; } -/// Trait for [primitive values], bridging the dynamic-typed nature of Arrow +/// Trait for [primitive values]. +/// +/// This trait bridges the dynamic-typed nature of Arrow /// (via [`DataType`]) with the static-typed nature of rust types /// ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. 
/// diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs index a1cea6ef2cca..ce5e1dc34aa0 100644 --- a/arrow-buffer/src/builder/null.rs +++ b/arrow-buffer/src/builder/null.rs @@ -18,6 +18,7 @@ use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer}; /// Builder for creating the null bit buffer. +/// /// This builder only materializes the buffer when we append `false`. /// If you only append `true`s to the builder, what you get will be /// `None` when calling [`finish`](#method.finish). diff --git a/arrow-buffer/src/util/bit_mask.rs b/arrow-buffer/src/util/bit_mask.rs index 2074f0fab988..e9c80e097f82 100644 --- a/arrow-buffer/src/util/bit_mask.rs +++ b/arrow-buffer/src/util/bit_mask.rs @@ -19,8 +19,10 @@ use crate::bit_util::ceil; -/// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the -/// bits in `data` in the range `[offset_read..offset_read+len]` +/// Util function to set bits in a slice of bytes. +/// +/// This will sets all bits on `write_data` in the range `[offset_write..offset_write+len]` +/// to be equal to the bits in `data` in the range `[offset_read..offset_read+len]` /// returns the number of `0` bits `data[offset_read..offset_read+len]` /// `offset_write`, `offset_read`, and `len` are in terms of bits pub fn set_bits( diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index a14bc4873628..33cbc897a6c1 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -161,9 +161,11 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff } } -/// A generic representation of Arrow array data which encapsulates common attributes and -/// operations for Arrow array. Specific operations for different arrays types (e.g., -/// primitive, list, struct) are implemented in `Array`. +/// A generic representation of Arrow array data which encapsulates common attributes +/// and operations for Arrow array. 
+/// +/// Specific operations for different arrays types (e.g., primitive, list, struct) +/// are implemented in `Array`. /// /// # Memory Layout /// diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 1180264e5ddd..ff9e387dab0b 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -50,8 +50,7 @@ use std::{fmt, ops::Deref}; type ArrowResult = std::result::Result; -#[allow(clippy::derive_partial_eq_without_eq)] - +#[allow(clippy::all)] mod gen { include!("arrow.flight.protocol.rs"); } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index e348367a91eb..37b2885b5aff 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -979,6 +979,7 @@ fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status { /// A wrapper around [`Streaming`] that allows "peeking" at the /// message at the front of the stream without consuming it. +/// /// This is needed because sometimes the first message in the stream will contain /// a [`FlightDescriptor`] in addition to potentially any data, and the dispatch logic /// must inspect this information. diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index b09dcdc5029b..6ef70cdeaa2c 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -710,7 +710,9 @@ fn into_zero_offset_run_array( } /// Keeps track of dictionaries that have been written, to avoid emitting the same dictionary -/// multiple times. Can optionally error if an update to an existing dictionary is attempted, which +/// multiple times. +/// +/// Can optionally error if an update to an existing dictionary is attempted, which /// isn't allowed in the `FileWriter`. 
pub struct DictionaryTracker { written: HashMap, diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index e07b03d1f276..e59ad50dd3f9 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -42,8 +42,9 @@ use arrow_schema::*; const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8; /// An iterator of `(usize, usize)` each representing an interval -/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each -/// interval corresponds to a contiguous region of memory to be +/// `[start, end)` whose slots of a bitmap [Buffer] are true. +/// +/// Each interval corresponds to a contiguous region of memory to be /// "taken" from an array to be filtered. /// /// ## Notes: @@ -117,6 +118,7 @@ fn filter_count(filter: &BooleanArray) -> usize { pub type Filter<'a> = Box ArrayData + 'a>; /// Returns a prepared function optimized to filter multiple arrays. +/// /// Creating this function requires time, but using it is faster than [filter] when the /// same filter needs to be applied to multiple arrays (e.g. a multi-column `RecordBatch`). /// WARNING: the nulls of `filter` are ignored and the value on its slot is considered. diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 82fb2e0d109b..97f876a9f953 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -45,6 +45,7 @@ fn bit_length_impl( } /// Returns an array of Int32/Int64 denoting the length of each value in the array. +/// /// For list array, length is the number of elements in each list. /// For string array and binary array, length is the number of bytes of each value. /// diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index a7b593799835..6ff6df01c454 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -18,6 +18,7 @@ //! Pass Arrow objects from and to PyArrow, using Arrow's //! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) //! and [pyo3](https://docs.rs/pyo3/latest/pyo3/). +//! //! 
For underlying implementation, see the [ffi] module. //! //! One can use these to write Python functions that take and return PyArrow @@ -472,8 +473,10 @@ impl IntoPyArrow for ArrowArrayStreamReader { } } -/// A newtype wrapper. When wrapped around a type `T: FromPyArrow`, it -/// implements `FromPyObject` for the PyArrow objects. When wrapped around a +/// A newtype wrapper for types implementing [`FromPyArrow`] or [`IntoPyArrow`]. +/// +/// When wrapped around a type `T: FromPyArrow`, it +/// implements [`FromPyObject`] for the PyArrow objects. When wrapped around a /// `T: IntoPyArrow`, it implements `IntoPy` for the wrapped type. #[derive(Debug)] pub struct PyArrowType(pub T); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 2561c925aaec..cd615aa73383 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -108,7 +108,9 @@ where .collect() } -/// Creates a random (but fixed-seeded) string array of a given size and null density, strings have a random length +/// Creates a random (but fixed-seeded) string array of a given size and null density. +/// +/// Strings have a random length /// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths, /// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex. pub fn create_string_array( diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 02c2f44f60c3..8fde542f59c8 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -39,6 +39,7 @@ pub use crate::format::{ // Mirrors `parquet::Type` /// Types supported by Parquet. +/// /// These physical types are intended to be used in combination with the encodings to /// control the on disk storage format. /// For example INT16 is not included as a type since a good encoding of INT32 @@ -60,6 +61,7 @@ pub enum Type { // Mirrors `parquet::ConvertedType` /// Common types (converted types) used by frameworks when using Parquet. 
+/// /// This helps map between types in those frameworks to the base types in Parquet. /// This is only metadata and not needed to read or write the data. /// diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index a8d68d4b6442..f98111416f6a 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -181,9 +181,10 @@ impl std::ops::IndexMut for Block { } } -/// A split block Bloom filter. The creation of this structure is based on the -/// [`crate::file::properties::BloomFilterProperties`] struct set via [`crate::file::properties::WriterProperties`] and -/// is thus hidden by default. +/// A split block Bloom filter. +/// +/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`] +/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default. #[derive(Debug, Clone)] pub struct Sbbf(Vec); diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index b5217d02ff09..e7f437304b7a 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -273,6 +273,7 @@ pub struct PlainDecoderDetails { } /// Plain decoding that supports all types. +/// /// Values are encoded back to back. For native types, data is encoded as little endian. /// Floating point types are encoded in IEEE. /// See [`PlainEncoder`](crate::encoding::PlainEncoder) for more information. @@ -333,6 +334,7 @@ impl Decoder for PlainDecoder { // RLE_DICTIONARY/PLAIN_DICTIONARY Decoding /// Dictionary decoder. +/// /// The dictionary encoding builds a dictionary of values encountered in a given column. /// The dictionary is be stored in a dictionary page per column chunk. /// See [`DictEncoder`](crate::encoding::DictEncoder) for more information. @@ -824,6 +826,7 @@ where // DELTA_LENGTH_BYTE_ARRAY Decoding /// Delta length byte array decoder. 
+/// /// Only applied to byte arrays to separate the length values and the data, the lengths /// are encoded using DELTA_BINARY_PACKED encoding. /// See [`DeltaLengthByteArrayEncoder`](crate::encoding::DeltaLengthByteArrayEncoder) @@ -952,6 +955,7 @@ impl Decoder for DeltaLengthByteArrayDecoder { // DELTA_BYTE_ARRAY Decoding /// Delta byte array decoder. +/// /// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding, Suffixes are stored /// using `DELTA_LENGTH_BYTE_ARRAY` encoding. /// See [`DeltaByteArrayEncoder`](crate::encoding::DeltaByteArrayEncoder) for more diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 543c629d3425..a54d4a427635 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -116,8 +116,8 @@ pub mod basic; /// /// [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift // see parquet/CONTRIBUTING.md for instructions on regenerating -#[allow(clippy::derivable_impls, clippy::match_single_binding)] -// Don't try and format auto generated code +// Don't run clippy or rustfmt on auto-generated code +#[allow(clippy::all)] #[rustfmt::skip] pub mod format; diff --git a/parquet/src/record/record_reader.rs b/parquet/src/record/record_reader.rs index bcfeb95dcdf4..cfaf14a3d6f8 100644 --- a/parquet/src/record/record_reader.rs +++ b/parquet/src/record/record_reader.rs @@ -18,7 +18,8 @@ use super::super::errors::ParquetError; use super::super::file::reader::RowGroupReader; -/// read up to `max_records` records from `row_group_reader` into `self` +/// Read up to `max_records` records from `row_group_reader` into `self`. +/// /// The type parameter `T` is used to work around the rust orphan rule /// when implementing on types such as `Vec`. 
pub trait RecordReader { diff --git a/parquet/src/record/record_writer.rs b/parquet/src/record/record_writer.rs index 0b2b95ef7dea..56e0aa490e4d 100644 --- a/parquet/src/record/record_writer.rs +++ b/parquet/src/record/record_writer.rs @@ -20,16 +20,23 @@ use crate::schema::types::TypePtr; use super::super::errors::ParquetError; use super::super::file::writer::SerializedRowGroupWriter; -/// `write_to_row_group` writes from `self` into `row_group_writer` -/// `schema` builds the schema used by `row_group_writer` +/// Trait describing how to write a record (the implementor) to a row group writer. +/// +/// The [`parquet_derive`] crate provides a derive macro [`ParquetRecordWriter`] for this trait +/// for unnested structs. +/// /// The type parameter `T` is used to work around the rust orphan rule /// when implementing on types such as `&[T]`. +/// +/// [`parquet_derive`]: https://crates.io/crates/parquet_derive +/// [`ParquetRecordWriter`]: https://docs.rs/parquet_derive/53.0.0/parquet_derive/derive.ParquetRecordWriter.html pub trait RecordWriter { + /// Writes from `self` into `row_group_writer`. fn write_to_row_group( &self, row_group_writer: &mut SerializedRowGroupWriter, ) -> Result<(), ParquetError>; - /// Generated schema + /// Generated schema used by `row_group_writer` fn schema(&self) -> Result; } diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index 9c93e2cca978..038d8fa446e5 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -29,8 +29,9 @@ use ::syn::{parse_macro_input, Data, DataStruct, DeriveInput}; mod parquet_field; -/// Derive flat, simple RecordWriter implementations. Works by parsing -/// a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting +/// Derive flat, simple RecordWriter implementations. +/// +/// Works by parsing a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting /// the correct writing code for each field of the struct. 
Column writers /// are generated in the order they are defined. /// @@ -143,8 +144,9 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke }).into() } -/// Derive flat, simple RecordReader implementations. Works by parsing -/// a struct tagged with `#[derive(ParquetRecordReader)]` and emitting +/// Derive flat, simple RecordReader implementations. +/// +/// Works by parsing a struct tagged with `#[derive(ParquetRecordReader)]` and emitting /// the correct writing code for each field of the struct. Column readers /// are generated by matching names in the schema to the names in the struct. ///