From 3490639252294215c7ee05990d82b43e2cd097a6 Mon Sep 17 00:00:00 2001 From: Dario Curreri <48800335+dariocurr@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:54:22 +0200 Subject: [PATCH 1/8] Update lexical-core requirement from 0.8 to 1.0 (to resolve RUSTSEC-2023-0086) (#6402) * Update lexical-core requirement from 0.8 to 1.0 * Remove safety comment --- arrow-cast/Cargo.toml | 2 +- arrow-cast/src/display.rs | 4 +--- arrow-csv/Cargo.toml | 2 +- arrow-json/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 86f1a431a8b2..4046f5226094 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,7 +49,7 @@ arrow-select = { workspace = true } chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } -lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } +lexical-core = { version = "1.0", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } atoi = "2.0.0" comfy-table = { version = "7.0", optional = true, default-features = false } base64 = "0.22" diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 312e7973963e..6373cf67840a 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -421,9 +421,7 @@ macro_rules! 
primitive_display { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); let mut buffer = [0u8; <$t as ArrowPrimitiveType>::Native::FORMATTED_SIZE]; - // SAFETY: - // buffer is T::FORMATTED_SIZE - let b = unsafe { lexical_core::write_unchecked(value, &mut buffer) }; + let b = lexical_core::write(value, &mut buffer); // Lexical core produces valid UTF-8 let s = unsafe { std::str::from_utf8_unchecked(b) }; f.write_str(s)?; diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index d29c85c56cfd..be213c9363c2 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -43,7 +43,7 @@ chrono = { workspace = true } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1" } lazy_static = { version = "1.4", default-features = false } -lexical-core = { version = "^0.8", default-features = false } +lexical-core = { version = "1.0", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index dd232f197ead..517bb03d2064 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -45,7 +45,7 @@ num = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } chrono = { workspace = true } -lexical-core = { version = "0.8", default-features = false } +lexical-core = { version = "1.0", default-features = false} [dev-dependencies] tempfile = "3.3" From aad55d58e4b6bc636686ab2f1c5c7261310cd51d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Sep 2024 12:16:15 -0400 Subject: [PATCH 2/8] Remove "NOT YET FULLY SUPPORTED" comment from DataType::Utf8View/BinaryView (#6380) --- arrow-schema/src/datatype.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arrow-schema/src/datatype.rs 
b/arrow-schema/src/datatype.rs index 1848c8b3f76e..b9cfc3d8a848 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -261,9 +261,7 @@ pub enum DataType { /// A single LargeBinary array can store up to [`i64::MAX`] bytes /// of binary data in total. LargeBinary, - /// (NOT YET FULLY SUPPORTED) Opaque binary data of variable length. - /// - /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. + /// Opaque binary data of variable length. /// /// Logically the same as [`Self::Binary`], but the internal representation uses a view /// struct that contains the string length and either the string's entire data @@ -280,9 +278,7 @@ pub enum DataType { /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes /// of string data in total. LargeUtf8, - /// (NOT YET FULLY SUPPORTED) A variable-length string in Unicode with UTF-8 encoding - /// - /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. 
+ /// A variable-length string in Unicode with UTF-8 encoding /// /// Logically the same as [`Self::Utf8`], but the internal representation uses a view /// struct that contains the string length and either the string's entire data From 5414f1d7c0683c64d69cf721a83c17d677c78a71 Mon Sep 17 00:00:00 2001 From: Dario Curreri <48800335+dariocurr@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:19:10 +0200 Subject: [PATCH 3/8] Move lifetime of `take_iter` from iterator to its items (#6403) --- arrow-array/src/array/binary_array.rs | 4 ++-- arrow-array/src/array/string_array.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 6b18cbc2d9f7..8f8a39b2093f 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -84,7 +84,7 @@ impl GenericBinaryArray { pub fn take_iter<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| opt_index.map(|index| self.value(index))) } @@ -95,7 +95,7 @@ impl GenericBinaryArray { pub unsafe fn take_iter_unchecked<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 4722de55d39d..25581cfaa49d 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -42,7 +42,7 @@ impl GenericStringArray { pub fn take_iter<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| opt_index.map(|index| self.value(index))) } @@ -53,7 +53,7 @@ impl GenericStringArray { pub unsafe fn take_iter_unchecked<'a>( &'a self, indexes: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { + ) -> impl Iterator> { indexes.map(|opt_index| 
opt_index.map(|index| self.value_unchecked(index))) } From d7e87022d9ebeba0a9ff4177e70fdc698278b766 Mon Sep 17 00:00:00 2001 From: Tzu Gwo Date: Thu, 19 Sep 2024 04:47:51 +0800 Subject: [PATCH 4/8] Derive `Clone` for `object_store::aws::AmazonS3` (#6414) --- object_store/src/aws/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 4a773e7a1879..a27ed053317e 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -77,7 +77,7 @@ use crate::client::parts::Parts; pub use credential::{AwsAuthorizer, AwsCredential}; /// Interface for [Amazon S3](https://aws.amazon.com/s3/). -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct AmazonS3 { client: Arc, } From f5a6382f030d0a89498e543cc8ed97fbabac95d3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 18 Sep 2024 14:10:50 -0700 Subject: [PATCH 5/8] fix: binary_mut should work if only one input array has null buffer (#6396) * fix: binary_mut should work if only one input array has null buffer * Avoid copying null buffer in binary_mut * Update arrow-arith/src/arity.rs Co-authored-by: Andrew Lamb * Update arrow-arith/src/arity.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-arith/src/arity.rs | 66 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 17c1b0dbccf0..9cf4453d7fca 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -313,8 +313,6 @@ where )))); } - let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); - let mut builder = a.into_builder()?; builder @@ -323,7 +321,12 @@ where .zip(b.values()) .for_each(|(l, r)| *l = op(*l, *r)); - let array_builder = builder.finish().into_data().into_builder().nulls(nulls); + let array = builder.finish(); + + // The builder has the null buffer from `a`, it is not changed. 
+ let nulls = NullBuffer::union(array.logical_nulls().as_ref(), b.logical_nulls().as_ref()); + + let array_builder = array.into_data().into_builder().nulls(nulls); let array_data = unsafe { array_builder.build_unchecked() }; Ok(Ok(PrimitiveArray::::from(array_data))) @@ -413,7 +416,8 @@ where try_binary_no_nulls_mut(len, a, b, op) } else { let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); + create_union_null_buffer(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) + .unwrap(); let mut builder = a.into_builder()?; @@ -435,6 +439,22 @@ where } } +/// Computes the union of the nulls in two optional [`NullBuffer`] which +/// is not shared with the input buffers. +/// +/// The union of the nulls is the same as `NullBuffer::union(lhs, rhs)` but +/// it does not increase the reference count of the null buffer. +fn create_union_null_buffer( + lhs: Option<&NullBuffer>, + rhs: Option<&NullBuffer>, +) -> Option { + match (lhs, rhs) { + (Some(lhs), Some(rhs)) => Some(NullBuffer::new(lhs.inner() & rhs.inner())), + (Some(n), None) | (None, Some(n)) => Some(NullBuffer::new(n.inner() & n.inner())), + (None, None) => None, + } +} + /// This intentional inline(never) attribute helps LLVM optimize the loop. 
#[inline(never)] fn try_binary_no_nulls( @@ -557,6 +577,25 @@ mod tests { assert_eq!(c, expected); } + #[test] + fn test_binary_mut_null_buffer() { + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + + let b = Int32Array::from(vec![Some(10), Some(11), Some(12), Some(13), Some(14)]); + + let r1 = binary_mut(a, &b, |a, b| a + b).unwrap(); + + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + let b = Int32Array::new( + vec![10, 11, 12, 13, 14].into(), + Some(vec![true, true, true, true, true].into()), + ); + + // unwrap here means that no copying occured + let r2 = binary_mut(a, &b, |a, b| a + b).unwrap(); + assert_eq!(r1.unwrap(), r2.unwrap()); + } + #[test] fn test_try_binary_mut() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); @@ -587,6 +626,25 @@ mod tests { .expect_err("should got error"); } + #[test] + fn test_try_binary_mut_null_buffer() { + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + + let b = Int32Array::from(vec![Some(10), Some(11), Some(12), Some(13), Some(14)]); + + let r1 = try_binary_mut(a, &b, |a, b| Ok(a + b)).unwrap(); + + let a = Int32Array::from(vec![Some(3), Some(4), Some(5), Some(6), None]); + let b = Int32Array::new( + vec![10, 11, 12, 13, 14].into(), + Some(vec![true, true, true, true, true].into()), + ); + + // unwrap here means that no copying occured + let r2 = try_binary_mut(a, &b, |a, b| Ok(a + b)).unwrap(); + assert_eq!(r1.unwrap(), r2.unwrap()); + } + #[test] fn test_unary_dict_mut() { let values = Int32Array::from(vec![Some(10), Some(20), None]); From e7598a4fc3d0c24c191e5f021b1156f2777068cf Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 18 Sep 2024 23:12:45 +0200 Subject: [PATCH 6/8] Fix encoding/decoding REE Dicts when using streaming IPC (#6399) * arrow-ipc: Add test for streaming IPC with REE dicts * arrow-schema: Include child fields of REE fields --- arrow-ipc/src/reader/stream.rs | 63 ++++++++++++++++++++++++++++++++-- 
arrow-schema/src/field.rs | 1 + 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/arrow-ipc/src/reader/stream.rs b/arrow-ipc/src/reader/stream.rs index 64191a22b33e..de5f5bdd629f 100644 --- a/arrow-ipc/src/reader/stream.rs +++ b/arrow-ipc/src/reader/stream.rs @@ -275,8 +275,10 @@ impl StreamDecoder { #[cfg(test)] mod tests { use super::*; - use crate::writer::StreamWriter; - use arrow_array::{Int32Array, Int64Array, RecordBatch}; + use crate::writer::{IpcWriteOptions, StreamWriter}; + use arrow_array::{ + types::Int32Type, DictionaryArray, Int32Array, Int64Array, RecordBatch, RunArray, + }; use arrow_schema::{DataType, Field, Schema}; // Further tests in arrow-integration-testing/tests/ipc_reader.rs @@ -315,4 +317,61 @@ mod tests { let err = decoder.finish().unwrap_err().to_string(); assert_eq!(err, "Ipc error: Unexpected End of Stream"); } + + #[test] + fn test_read_ree_dict_record_batches_from_buffer() { + let schema = Schema::new(vec![Field::new( + "test1", + DataType::RunEndEncoded( + Arc::new(Field::new("run_ends".to_string(), DataType::Int32, false)), + Arc::new(Field::new_dict( + "values".to_string(), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + 0, + false, + )), + ), + true, + )]); + let batch = RecordBatch::try_new( + schema.clone().into(), + vec![Arc::new( + RunArray::try_new( + &Int32Array::from(vec![1, 2, 3]), + &vec![Some("a"), None, Some("a")] + .into_iter() + .collect::>(), + ) + .expect("Failed to create RunArray"), + )], + ) + .expect("Failed to create RecordBatch"); + + let mut buffer = vec![]; + { + let mut writer = StreamWriter::try_new_with_options( + &mut buffer, + &schema, + IpcWriteOptions::default().with_preserve_dict_id(false), + ) + .expect("Failed to create StreamWriter"); + writer.write(&batch).expect("Failed to write RecordBatch"); + writer.finish().expect("Failed to finish StreamWriter"); + } + + let mut decoder = StreamDecoder::new(); + let buf = &mut 
Buffer::from(buffer.as_slice()); + while let Some(batch) = decoder + .decode(buf) + .map_err(|e| { + ArrowError::ExternalError(format!("Failed to decode record batch: {}", e).into()) + }) + .expect("Failed to decode record batch") + { + assert_eq!(batch, batch); + } + + decoder.finish().expect("Failed to finish decoder"); + } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index a84a6ada334d..fc4852a3d37d 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -375,6 +375,7 @@ impl Field { | DataType::FixedSizeList(field, _) | DataType::Map(field, _) => field.fields(), DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), + DataType::RunEndEncoded(_, field) => field.fields(), _ => vec![], } } From d274b6965497790f25f7bd8d56b675fc03410edb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 18 Sep 2024 19:52:31 -0600 Subject: [PATCH 7/8] fix: Stop losing precision and scale when casting decimal to dictionary (#6383) * Stop losing precision and scale when casting decimal to dictionary * address feedback --- arrow-cast/src/cast/dictionary.rs | 38 +++++++++++++++++++++++++++---- arrow-cast/src/cast/mod.rs | 32 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs index daaddc4915ef..fc4d99430151 100644 --- a/arrow-cast/src/cast/dictionary.rs +++ b/arrow-cast/src/cast/dictionary.rs @@ -202,11 +202,41 @@ pub(crate) fn cast_to_dictionary( UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - Decimal128(_, _) => { - pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + Decimal128(p, s) => { + let dict = pack_numeric_to_dictionary::( + array, + dict_value_type, + cast_options, + )?; + let dict = dict + 
.as_dictionary::() + .downcast_dict::() + .unwrap(); + let value = dict.values().clone(); + // Set correct precision/scale + let value = value.with_precision_and_scale(p, s)?; + Ok(Arc::new(DictionaryArray::::try_new( + dict.keys().clone(), + Arc::new(value), + )?)) } - Decimal256(_, _) => { - pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + Decimal256(p, s) => { + let dict = pack_numeric_to_dictionary::( + array, + dict_value_type, + cast_options, + )?; + let dict = dict + .as_dictionary::() + .downcast_dict::() + .unwrap(); + let value = dict.values().clone(); + // Set correct precision/scale + let value = value.with_precision_and_scale(p, s)?; + Ok(Arc::new(DictionaryArray::::try_new( + dict.keys().clone(), + Arc::new(value), + )?)) } Float16 => { pack_numeric_to_dictionary::(array, dict_value_type, cast_options) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index fe59a141cbe2..e80d497c8cba 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -2650,6 +2650,38 @@ mod tests { err.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal128_to_decimal128_dict() { + let p = 20; + let s = 3; + let input_type = DataType::Decimal128(p, s); + let output_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal128(p, s)), + ); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal_array(array, p, s).unwrap(); + let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap(); + assert_eq!(cast_array.data_type(), &output_type); + } + + #[test] + fn test_cast_decimal256_to_decimal256_dict() { + let p = 20; + let s = 3; + let input_type = DataType::Decimal256(p, s); + let output_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal256(p, s)), + ); + assert!(can_cast_types(&input_type, &output_type)); + let array = 
vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal_array(array, p, s).unwrap(); + let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap(); + assert_eq!(cast_array.data_type(), &output_type); + } + #[test] fn test_cast_decimal128_to_decimal128_overflow() { let input_type = DataType::Decimal128(38, 3); From 13902836eaecc2c99d603b4cf41281e6d4671a85 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 19 Sep 2024 18:55:09 +0800 Subject: [PATCH 8/8] Rephrase doc comment (#6421) * docs: rephase some Signed-off-by: Ruihang Xia * fix all warnings Signed-off-by: Ruihang Xia * big letter at the beginning Signed-off-by: Ruihang Xia * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow/src/pyarrow.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-array/src/types.rs Co-authored-by: Matthijs Brobbel --------- Signed-off-by: Ruihang Xia Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Matthijs Brobbel --- arrow-arith/src/arity.rs | 17 +++++++++++------ arrow-arith/src/temporal.rs | 8 ++++++-- .../src/builder/generic_bytes_view_builder.rs | 3 ++- arrow-array/src/ffi_stream.rs | 1 + arrow-array/src/types.rs | 4 +++- arrow-buffer/src/builder/null.rs | 1 + arrow-buffer/src/util/bit_mask.rs | 6 ++++-- arrow-data/src/data.rs | 8 +++++--- arrow-flight/src/lib.rs | 3 +-- arrow-flight/src/sql/server.rs | 1 + arrow-ipc/src/writer.rs | 4 +++- arrow-select/src/filter.rs | 6 ++++-- arrow-string/src/length.rs | 1 + arrow/src/pyarrow.rs | 7 +++++-- arrow/src/util/bench_util.rs | 4 +++- parquet/src/basic.rs | 2 ++ parquet/src/bloom_filter/mod.rs | 7 ++++--- parquet/src/encodings/decoding.rs | 4 ++++ parquet/src/lib.rs | 4 ++-- parquet/src/record/record_reader.rs | 3 ++- parquet/src/record/record_writer.rs | 13 ++++++++++--- parquet_derive/src/lib.rs 
| 10 ++++++---- 22 files changed, 81 insertions(+), 36 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 9cf4453d7fca..bb983e1225ac 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -332,8 +332,10 @@ where Ok(Ok(PrimitiveArray::::from(array_data))) } -/// Applies the provided fallible binary operation across `a` and `b`, returning any error, -/// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a` +/// Applies the provided fallible binary operation across `a` and `b`. +/// +/// This will return any error encountered, or collect the results into +/// a [`PrimitiveArray`]. If any index is null in either `a` /// or `b`, the corresponding index in the result will also be null /// /// Like [`try_unary`] the function is only evaluated for non-null indices @@ -384,12 +386,15 @@ where } /// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable -/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in -/// either `a` or `b`, the corresponding index in the result will also be null +/// [`PrimitiveArray`] `a` with the results. /// -/// Like [`try_unary`] the function is only evaluated for non-null indices +/// Returns any error encountered, or collects the results into a [`PrimitiveArray`] as return +/// value. If any index is null in either `a` or `b`, the corresponding index in the result will +/// also be null. +/// +/// Like [`try_unary`] the function is only evaluated for non-null indices. /// -/// See [`binary_mut`] for errors and buffer reuse information +/// See [`binary_mut`] for errors and buffer reuse information. 
pub fn try_binary_mut( a: PrimitiveArray, b: &PrimitiveArray, diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 5f3eeb325104..09d690d3237c 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -666,6 +666,7 @@ impl ChronoDateExt for T { /// Parse the given string into a string representing fixed-offset that is correct as of the given /// UTC NaiveDateTime. +/// /// Note that the offset is function of time and can vary depending on whether daylight savings is /// in effect or not. e.g. Australia/Sydney is +10:00 or +11:00 depending on DST. #[deprecated(note = "Use arrow_array::timezone::Tz instead")] @@ -811,6 +812,7 @@ where } /// Extracts the day of a given temporal array as an array of integers. +/// /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. #[deprecated(since = "51.0.0", note = "Use `date_part` instead")] @@ -828,7 +830,8 @@ where date_part_primitive(array, DatePart::Day) } -/// Extracts the day of year of a given temporal array as an array of integers +/// Extracts the day of year of a given temporal array as an array of integers. +/// /// The day of year that ranges from 1 to 366. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -837,7 +840,8 @@ pub fn doy_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::DayOfYear) } -/// Extracts the day of year of a given temporal primitive array as an array of integers +/// Extracts the day of year of a given temporal primitive array as an array of integers. 
+/// /// The day of year that ranges from 1 to 366 #[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn doy(array: &PrimitiveArray) -> Result diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 3a9cf17c028e..09277c679c16 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -515,7 +515,8 @@ fn make_inlined_view(data: &[u8]) -> u128 { u128::from_le_bytes(view_buffer) } -/// Create a view based on the given data, block id and offset +/// Create a view based on the given data, block id and offset. +/// /// Note that the code below is carefully examined with x86_64 assembly code: /// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), /// which slows down things. diff --git a/arrow-array/src/ffi_stream.rs b/arrow-array/src/ffi_stream.rs index 6f3405ead7b0..db44ebad1c22 100644 --- a/arrow-array/src/ffi_stream.rs +++ b/arrow-array/src/ffi_stream.rs @@ -275,6 +275,7 @@ fn get_error_code(err: &ArrowError) -> i32 { } /// A `RecordBatchReader` which imports Arrays from `FFI_ArrowArrayStream`. +/// /// Struct used to fetch `RecordBatch` from the C Stream Interface. /// Its main responsibility is to expose `RecordBatchReader` functionality /// that requires [FFI_ArrowArrayStream]. diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 550d1aadf3fa..b39c9c40311b 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -50,7 +50,9 @@ impl BooleanType { pub const DATA_TYPE: DataType = DataType::Boolean; } -/// Trait for [primitive values], bridging the dynamic-typed nature of Arrow +/// Trait for [primitive values]. +/// +/// This trait bridges the dynamic-typed nature of Arrow /// (via [`DataType`]) with the static-typed nature of rust types /// ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. 
/// diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs index a1cea6ef2cca..ce5e1dc34aa0 100644 --- a/arrow-buffer/src/builder/null.rs +++ b/arrow-buffer/src/builder/null.rs @@ -18,6 +18,7 @@ use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer}; /// Builder for creating the null bit buffer. +/// /// This builder only materializes the buffer when we append `false`. /// If you only append `true`s to the builder, what you get will be /// `None` when calling [`finish`](#method.finish). diff --git a/arrow-buffer/src/util/bit_mask.rs b/arrow-buffer/src/util/bit_mask.rs index 2074f0fab988..e9c80e097f82 100644 --- a/arrow-buffer/src/util/bit_mask.rs +++ b/arrow-buffer/src/util/bit_mask.rs @@ -19,8 +19,10 @@ use crate::bit_util::ceil; -/// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the -/// bits in `data` in the range `[offset_read..offset_read+len]` +/// Util function to set bits in a slice of bytes. +/// +/// This will sets all bits on `write_data` in the range `[offset_write..offset_write+len]` +/// to be equal to the bits in `data` in the range `[offset_read..offset_read+len]` /// returns the number of `0` bits `data[offset_read..offset_read+len]` /// `offset_write`, `offset_read`, and `len` are in terms of bits pub fn set_bits( diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index a14bc4873628..33cbc897a6c1 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -161,9 +161,11 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff } } -/// A generic representation of Arrow array data which encapsulates common attributes and -/// operations for Arrow array. Specific operations for different arrays types (e.g., -/// primitive, list, struct) are implemented in `Array`. +/// A generic representation of Arrow array data which encapsulates common attributes +/// and operations for Arrow array. 
+/// +/// Specific operations for different arrays types (e.g., primitive, list, struct) +/// are implemented in `Array`. /// /// # Memory Layout /// diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 1180264e5ddd..ff9e387dab0b 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -50,8 +50,7 @@ use std::{fmt, ops::Deref}; type ArrowResult = std::result::Result; -#[allow(clippy::derive_partial_eq_without_eq)] - +#[allow(clippy::all)] mod gen { include!("arrow.flight.protocol.rs"); } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index e348367a91eb..37b2885b5aff 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -979,6 +979,7 @@ fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status { /// A wrapper around [`Streaming`] that allows "peeking" at the /// message at the front of the stream without consuming it. +/// /// This is needed because sometimes the first message in the stream will contain /// a [`FlightDescriptor`] in addition to potentially any data, and the dispatch logic /// must inspect this information. diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index b09dcdc5029b..6ef70cdeaa2c 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -710,7 +710,9 @@ fn into_zero_offset_run_array( } /// Keeps track of dictionaries that have been written, to avoid emitting the same dictionary -/// multiple times. Can optionally error if an update to an existing dictionary is attempted, which +/// multiple times. +/// +/// Can optionally error if an update to an existing dictionary is attempted, which /// isn't allowed in the `FileWriter`. 
pub struct DictionaryTracker { written: HashMap, diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index e07b03d1f276..e59ad50dd3f9 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -42,8 +42,9 @@ use arrow_schema::*; const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8; /// An iterator of `(usize, usize)` each representing an interval -/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each -/// interval corresponds to a contiguous region of memory to be +/// `[start, end)` whose slots of a bitmap [Buffer] are true. +/// +/// Each interval corresponds to a contiguous region of memory to be /// "taken" from an array to be filtered. /// /// ## Notes: @@ -117,6 +118,7 @@ fn filter_count(filter: &BooleanArray) -> usize { pub type Filter<'a> = Box ArrayData + 'a>; /// Returns a prepared function optimized to filter multiple arrays. +/// /// Creating this function requires time, but using it is faster than [filter] when the /// same filter needs to be applied to multiple arrays (e.g. a multi-column `RecordBatch`). /// WARNING: the nulls of `filter` are ignored and the value on its slot is considered. diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 82fb2e0d109b..97f876a9f953 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -45,6 +45,7 @@ fn bit_length_impl( } /// Returns an array of Int32/Int64 denoting the length of each value in the array. +/// /// For list array, length is the number of elements in each list. /// For string array and binary array, length is the number of bytes of each value. /// diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index a7b593799835..6ff6df01c454 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -18,6 +18,7 @@ //! Pass Arrow objects from and to PyArrow, using Arrow's //! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) //! and [pyo3](https://docs.rs/pyo3/latest/pyo3/). +//! //! 
For underlying implementation, see the [ffi] module. //! //! One can use these to write Python functions that take and return PyArrow @@ -472,8 +473,10 @@ impl IntoPyArrow for ArrowArrayStreamReader { } } -/// A newtype wrapper. When wrapped around a type `T: FromPyArrow`, it -/// implements `FromPyObject` for the PyArrow objects. When wrapped around a +/// A newtype wrapper for types implementing [`FromPyArrow`] or [`IntoPyArrow`]. +/// +/// When wrapped around a type `T: FromPyArrow`, it +/// implements [`FromPyObject`] for the PyArrow objects. When wrapped around a /// `T: IntoPyArrow`, it implements `IntoPy` for the wrapped type. #[derive(Debug)] pub struct PyArrowType(pub T); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 2561c925aaec..cd615aa73383 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -108,7 +108,9 @@ where .collect() } -/// Creates a random (but fixed-seeded) string array of a given size and null density, strings have a random length +/// Creates a random (but fixed-seeded) string array of a given size and null density. +/// +/// Strings have a random length /// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths, /// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex. pub fn create_string_array( diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 02c2f44f60c3..8fde542f59c8 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -39,6 +39,7 @@ pub use crate::format::{ // Mirrors `parquet::Type` /// Types supported by Parquet. +/// /// These physical types are intended to be used in combination with the encodings to /// control the on disk storage format. /// For example INT16 is not included as a type since a good encoding of INT32 @@ -60,6 +61,7 @@ pub enum Type { // Mirrors `parquet::ConvertedType` /// Common types (converted types) used by frameworks when using Parquet. 
+/// /// This helps map between types in those frameworks to the base types in Parquet. /// This is only metadata and not needed to read or write the data. /// diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index a8d68d4b6442..f98111416f6a 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -181,9 +181,10 @@ impl std::ops::IndexMut for Block { } } -/// A split block Bloom filter. The creation of this structure is based on the -/// [`crate::file::properties::BloomFilterProperties`] struct set via [`crate::file::properties::WriterProperties`] and -/// is thus hidden by default. +/// A split block Bloom filter. +/// +/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`] +/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default. #[derive(Debug, Clone)] pub struct Sbbf(Vec); diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index b5217d02ff09..e7f437304b7a 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -273,6 +273,7 @@ pub struct PlainDecoderDetails { } /// Plain decoding that supports all types. +/// /// Values are encoded back to back. For native types, data is encoded as little endian. /// Floating point types are encoded in IEEE. /// See [`PlainEncoder`](crate::encoding::PlainEncoder) for more information. @@ -333,6 +334,7 @@ impl Decoder for PlainDecoder { // RLE_DICTIONARY/PLAIN_DICTIONARY Decoding /// Dictionary decoder. +/// /// The dictionary encoding builds a dictionary of values encountered in a given column. /// The dictionary is be stored in a dictionary page per column chunk. /// See [`DictEncoder`](crate::encoding::DictEncoder) for more information. @@ -824,6 +826,7 @@ where // DELTA_LENGTH_BYTE_ARRAY Decoding /// Delta length byte array decoder. 
+/// /// Only applied to byte arrays to separate the length values and the data, the lengths /// are encoded using DELTA_BINARY_PACKED encoding. /// See [`DeltaLengthByteArrayEncoder`](crate::encoding::DeltaLengthByteArrayEncoder) @@ -952,6 +955,7 @@ impl Decoder for DeltaLengthByteArrayDecoder { // DELTA_BYTE_ARRAY Decoding /// Delta byte array decoder. +/// /// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding, Suffixes are stored /// using `DELTA_LENGTH_BYTE_ARRAY` encoding. /// See [`DeltaByteArrayEncoder`](crate::encoding::DeltaByteArrayEncoder) for more diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 543c629d3425..a54d4a427635 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -116,8 +116,8 @@ pub mod basic; /// /// [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift // see parquet/CONTRIBUTING.md for instructions on regenerating -#[allow(clippy::derivable_impls, clippy::match_single_binding)] -// Don't try and format auto generated code +// Don't run clippy or rustfmt on auto-generated code +#[allow(clippy::all)] #[rustfmt::skip] pub mod format; diff --git a/parquet/src/record/record_reader.rs b/parquet/src/record/record_reader.rs index bcfeb95dcdf4..cfaf14a3d6f8 100644 --- a/parquet/src/record/record_reader.rs +++ b/parquet/src/record/record_reader.rs @@ -18,7 +18,8 @@ use super::super::errors::ParquetError; use super::super::file::reader::RowGroupReader; -/// read up to `max_records` records from `row_group_reader` into `self` +/// Read up to `max_records` records from `row_group_reader` into `self`. +/// /// The type parameter `T` is used to work around the rust orphan rule /// when implementing on types such as `Vec`. 
pub trait RecordReader { diff --git a/parquet/src/record/record_writer.rs b/parquet/src/record/record_writer.rs index 0b2b95ef7dea..56e0aa490e4d 100644 --- a/parquet/src/record/record_writer.rs +++ b/parquet/src/record/record_writer.rs @@ -20,16 +20,23 @@ use crate::schema::types::TypePtr; use super::super::errors::ParquetError; use super::super::file::writer::SerializedRowGroupWriter; -/// `write_to_row_group` writes from `self` into `row_group_writer` -/// `schema` builds the schema used by `row_group_writer` +/// Trait describing how to write a record (the implementor) to a row group writer. +/// +/// The [`parquet_derive`] crate provides a derive macro [`ParquetRecordWriter`] for this trait +/// for unnested structs. +/// /// The type parameter `T` is used to work around the rust orphan rule /// when implementing on types such as `&[T]`. +/// +/// [`parquet_derive`]: https://crates.io/crates/parquet_derive +/// [`ParquetRecordWriter`]: https://docs.rs/parquet_derive/53.0.0/parquet_derive/derive.ParquetRecordWriter.html pub trait RecordWriter { + /// Writes from `self` into `row_group_writer`. fn write_to_row_group( &self, row_group_writer: &mut SerializedRowGroupWriter, ) -> Result<(), ParquetError>; - /// Generated schema + /// Generated schema used by `row_group_writer` fn schema(&self) -> Result; } diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index 9c93e2cca978..038d8fa446e5 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -29,8 +29,9 @@ use ::syn::{parse_macro_input, Data, DataStruct, DeriveInput}; mod parquet_field; -/// Derive flat, simple RecordWriter implementations. Works by parsing -/// a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting +/// Derive flat, simple RecordWriter implementations. +/// +/// Works by parsing a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting /// the correct writing code for each field of the struct. 
Column writers /// are generated in the order they are defined. /// @@ -143,8 +144,9 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke }).into() } -/// Derive flat, simple RecordReader implementations. Works by parsing -/// a struct tagged with `#[derive(ParquetRecordReader)]` and emitting +/// Derive flat, simple RecordReader implementations. +/// +/// Works by parsing a struct tagged with `#[derive(ParquetRecordReader)]` and emitting /// the correct writing code for each field of the struct. Column readers /// are generated by matching names in the schema to the names in the struct. ///