From fd2ac15bd19e2b9797445219c8dc7b9b0e69863b Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 03:00:33 -0500 Subject: [PATCH 01/23] update public api Statistics::min to return an option. I first re-named the existing method to `min_unchecked` and made it internal to the crate. I then added a `pub min(&self) -> Opiton<&T>` method. I figure we can first change the public API before deciding what to do about internal usage. Ref: https://github.com/apache/arrow-rs/issues/6093 --- parquet/src/arrow/arrow_reader/statistics.rs | 14 ++-- parquet/src/arrow/arrow_writer/mod.rs | 8 +- parquet/src/column/writer/mod.rs | 82 ++++++++++---------- parquet/src/file/metadata/memory.rs | 4 +- parquet/src/file/statistics.rs | 11 ++- 5 files changed, 62 insertions(+), 57 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index d487967c23b3..987819af142b 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -131,7 +131,7 @@ macro_rules! make_stats_iterator { make_stats_iterator!( MinBooleanStatsIterator, - min, + min_unchecked, ParquetStatistics::Boolean, bool ); @@ -141,13 +141,13 @@ make_stats_iterator!( ParquetStatistics::Boolean, bool ); -make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32); +make_stats_iterator!(MinInt32StatsIterator, min_unchecked, ParquetStatistics::Int32, i32); make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); -make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64); +make_stats_iterator!(MinInt64StatsIterator, min_unchecked, ParquetStatistics::Int64, i64); make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); -make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32); +make_stats_iterator!(MinFloatStatsIterator, min_unchecked, ParquetStatistics::Float, f32); make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); -make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64); +make_stats_iterator!(MinDoubleStatsIterator, min_unchecked, ParquetStatistics::Double, f64); make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); make_stats_iterator!( MinByteArrayStatsIterator, @@ -244,7 +244,7 @@ macro_rules! make_decimal_stats_iterator { make_decimal_stats_iterator!( MinDecimal128StatsIterator, - min, + min_unchecked, min_bytes, i128, from_bytes_to_i128 @@ -258,7 +258,7 @@ make_decimal_stats_iterator!( ); make_decimal_stats_iterator!( MinDecimal256StatsIterator, - min, + min_unchecked, min_bytes, i256, from_bytes_to_i256 diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 0c07f541bd8a..12da55007674 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2543,7 +2543,7 @@ mod tests { let stats = column.statistics().unwrap(); assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(*stats.min() as u32, *src_slice.iter().min().unwrap()); + assert_eq!(*stats.min_unchecked() as u32, *src_slice.iter().min().unwrap()); assert_eq!(*stats.max() as u32, *src_slice.iter().max().unwrap()); } else { panic!("Statistics::Int32 missing") @@ -2584,7 +2584,7 @@ mod tests { let stats = column.statistics().unwrap(); assert!(stats.has_min_max_set()); if let Statistics::Int64(stats) = stats { - assert_eq!(*stats.min() as u64, *src_slice.iter().min().unwrap()); + assert_eq!(*stats.min_unchecked() as u64, *src_slice.iter().min().unwrap()); assert_eq!(*stats.max() as u64, *src_slice.iter().max().unwrap()); } else { panic!("Statistics::Int64 missing") @@ -3069,7 +3069,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min(); + let min = byte_array_stats.min_unchecked(); let max = byte_array_stats.max(); assert_eq!(min.as_bytes(), &[b'a']); @@ -3141,7 +3141,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min(); + let min = byte_array_stats.min_unchecked(); let max = byte_array_stats.max(); assert_eq!(min.as_bytes(), &[b'a']); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 519a219f943d..8b13e27a45b7 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -769,7 +769,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } Some(stat) => { // Check if min/max are still ascending/descending across pages - let new_min = stat.min(); + let new_min = stat.min_unchecked(); let new_max = stat.max(); if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { if self.data_page_boundary_ascending { @@ -1845,7 +1845,7 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &1); + assert_eq!(stats.min_unchecked(), &1); assert_eq!(stats.max(), &4); } else { panic!("expecting Statistics::Int32"); @@ -1889,7 +1889,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!( - stats.min(), + stats.min_unchecked(), &ByteArray::from(vec![ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, 35u8, 231u8, 90u8, 0u8, 0u8, @@ -1925,7 +1925,7 @@ mod tests { if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &0,); + assert_eq!(stats.min_unchecked(), &0,); assert_eq!(stats.max(), &5,); } else { panic!("expecting Statistics::Int32"); @@ -1974,7 +1974,7 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &-17); + assert_eq!(stats.min_unchecked(), &-17); assert_eq!(stats.max(), &9000); } else { panic!("expecting Statistics::Int32"); @@ -2222,7 +2222,7 @@ mod tests { // with the deprecated `min` and `max` statistics assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { - assert_eq!(stats.min(), &false); + assert_eq!(stats.min_unchecked(), &false); assert_eq!(stats.max(), &true); } else { panic!("expecting Statistics::Boolean, got {stats:?}"); @@ -2235,7 +2235,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &-2); + assert_eq!(stats.min_unchecked(), &-2); assert_eq!(stats.max(), &3); } else { panic!("expecting Statistics::Int32, got {stats:?}"); @@ -2248,7 +2248,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { - assert_eq!(stats.min(), &-2); + assert_eq!(stats.min_unchecked(), &-2); assert_eq!(stats.max(), &3); } else { panic!("expecting Statistics::Int64, got {stats:?}"); @@ -2270,7 +2270,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { - assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30])); + assert_eq!(stats.min_unchecked(), &Int96::from(vec![0, 20, 30])); assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); @@ -2283,7 +2283,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.min_unchecked(), &-2.0); assert_eq!(stats.max(), &3.0); } else { panic!("expecting Statistics::Float, got {stats:?}"); @@ -2296,7 +2296,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.min_unchecked(), &-2.0); assert_eq!(stats.max(), &3.0); } else { panic!("expecting Statistics::Double, got {stats:?}"); @@ -2314,7 +2314,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { - assert_eq!(stats.min(), &ByteArray::from("aaw")); + assert_eq!(stats.min_unchecked(), &ByteArray::from("aaw")); assert_eq!(stats.max(), &ByteArray::from("zz")); } else { panic!("expecting Statistics::ByteArray, got {stats:?}"); @@ -2333,7 +2333,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); - assert_eq!(stats.min(), &expected_min); + assert_eq!(stats.min_unchecked(), &expected_min); let expected_max: FixedLenByteArray = ByteArray::from("zz ").into(); assert_eq!(stats.max(), &expected_max); } else { @@ -2356,7 +2356,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(-f16::from_f32(2.0))); + assert_eq!(stats.min_unchecked(), &ByteArray::from(-f16::from_f32(2.0))); assert_eq!(stats.max(), &ByteArray::from(f16::from_f32(3.0))); } @@ -2370,7 +2370,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); } @@ -2384,7 +2384,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); } @@ -2398,7 +2398,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); } @@ -2424,7 +2424,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); } @@ -2438,7 +2438,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); } @@ -2452,7 +2452,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max(), &ByteArray::from(f16::PI)); } @@ -2466,7 +2466,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(-f16::PI)); + assert_eq!(stats.min_unchecked(), &ByteArray::from(-f16::PI)); assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); } @@ -2476,7 +2476,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &1.0); + assert_eq!(stats.min_unchecked(), &1.0); assert_eq!(stats.max(), &2.0); } else { panic!("expecting Statistics::Float"); @@ -2489,7 +2489,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &1.0); + assert_eq!(stats.min_unchecked(), &1.0); assert_eq!(stats.max(), &2.0); } else { panic!("expecting Statistics::Float"); @@ -2510,8 +2510,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); + assert_eq!(stats.min_unchecked(), &-0.0); + assert!(stats.min_unchecked().is_sign_negative()); assert_eq!(stats.max(), &0.0); assert!(stats.max().is_sign_positive()); } else { @@ -2525,8 +2525,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); + assert_eq!(stats.min_unchecked(), &-0.0); + assert!(stats.min_unchecked().is_sign_negative()); assert_eq!(stats.max(), &0.0); assert!(stats.max().is_sign_positive()); } else { @@ -2540,8 +2540,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); + assert_eq!(stats.min_unchecked(), &-0.0); + assert!(stats.min_unchecked().is_sign_negative()); assert_eq!(stats.max(), &2.0); } else { panic!("expecting Statistics::Float"); @@ -2554,7 +2554,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.min_unchecked(), &-2.0); assert_eq!(stats.max(), &0.0); assert!(stats.max().is_sign_positive()); } else { @@ -2568,7 +2568,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &1.0); + assert_eq!(stats.min_unchecked(), &1.0); assert_eq!(stats.max(), &2.0); } else { panic!("expecting Statistics::Double"); @@ -2581,7 +2581,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &1.0); + assert_eq!(stats.min_unchecked(), &1.0); assert_eq!(stats.max(), &2.0); } else { panic!("expecting Statistics::Double"); @@ -2602,8 +2602,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); + assert_eq!(stats.min_unchecked(), &-0.0); + assert!(stats.min_unchecked().is_sign_negative()); assert_eq!(stats.max(), &0.0); assert!(stats.max().is_sign_positive()); } else { @@ -2617,8 +2617,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); + assert_eq!(stats.min_unchecked(), &-0.0); + assert!(stats.min_unchecked().is_sign_negative()); assert_eq!(stats.max(), &0.0); assert!(stats.max().is_sign_positive()); } else { @@ -2632,8 +2632,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); + assert_eq!(stats.min_unchecked(), &-0.0); + assert!(stats.min_unchecked().is_sign_negative()); assert_eq!(stats.max(), &2.0); } else { panic!("expecting Statistics::Double"); @@ -2646,7 +2646,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.min_unchecked(), &-2.0); assert_eq!(stats.max(), &0.0); assert!(stats.max().is_sign_positive()); } else { @@ -2955,7 +2955,7 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { - let min_value = _stats.min(); + let min_value = _stats.min_unchecked(); let max_value = _stats.max(); assert!(!_stats.min_is_exact()); @@ -3008,7 +3008,7 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { - let min_value = _stats.min(); + let min_value = _stats.min_unchecked(); let max_value = _stats.max(); assert!(!_stats.min_is_exact()); diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index bb822b4ccbe7..8ab1c6d11dd4 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -184,9 +184,9 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { if self.has_min_max_set() { - return self.min().heap_size() + self.max().heap_size(); + return self.min_unchecked().heap_size() + self.max().heap_size(); } else if self.min_is_exact() { - return self.min().heap_size(); + return self.min_unchecked().heap_size(); } else if self.max_is_exact() { return self.max().heap_size(); } diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index dded9c9c1192..58c002f91c7f 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -33,7 +33,7 @@ //! //! match stats { //! Statistics::Int32(ref typed) => { -//! assert_eq!(*typed.min(), 1); +//! assert_eq!(typed.min(), Some(&1)); //! assert_eq!(*typed.max(), 10); //! } //! _ => {} @@ -538,10 +538,15 @@ impl ValueStatistics { /// /// Panics if min value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. - pub fn min(&self) -> &T { + pub(crate) fn min_unchecked(&self) -> &T { self.min.as_ref().unwrap() } + /// Returns min value of the statistics. + pub fn min(&self) -> Option<&T> { + self.min.as_ref() + } + /// Returns max value of the statistics. /// /// Panics if max value is not set, e.g. all values are `null`. @@ -555,7 +560,7 @@ impl ValueStatistics { /// Panics if min value is not set, use `has_min_max_set` method to check /// if values are set. pub fn min_bytes(&self) -> &[u8] { - self.min().as_bytes() + self.min_unchecked().as_bytes() } /// Returns max value as bytes of the statistics. From 18b467bc09809848288f40f44900d69d7f491f27 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 03:12:41 -0500 Subject: [PATCH 02/23] update public api Statistics::max to return an option. I first re-named the existing method to `max_unchecked` and made it internal to the crate. I then added a `pub max(&self) -> Opiton<&T>` method. I figure we can first change the public API before deciding what to do about internal usage. Ref: https://github.com/apache/arrow-rs/issues/6093 --- parquet/src/arrow/arrow_reader/statistics.rs | 14 ++-- parquet/src/arrow/arrow_writer/mod.rs | 8 +- parquet/src/column/writer/mod.rs | 82 ++++++++++---------- parquet/src/file/metadata/memory.rs | 4 +- parquet/src/file/statistics.rs | 11 ++- 5 files changed, 62 insertions(+), 57 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 987819af142b..a66f42fddfb2 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -137,18 +137,18 @@ make_stats_iterator!( ); make_stats_iterator!( MaxBooleanStatsIterator, - max, + max_unchecked, ParquetStatistics::Boolean, bool ); make_stats_iterator!(MinInt32StatsIterator, min_unchecked, ParquetStatistics::Int32, i32); -make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); +make_stats_iterator!(MaxInt32StatsIterator, max_unchecked, ParquetStatistics::Int32, i32); make_stats_iterator!(MinInt64StatsIterator, min_unchecked, ParquetStatistics::Int64, i64); -make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); +make_stats_iterator!(MaxInt64StatsIterator, max_unchecked, ParquetStatistics::Int64, i64); make_stats_iterator!(MinFloatStatsIterator, min_unchecked, ParquetStatistics::Float, f32); -make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); +make_stats_iterator!(MaxFloatStatsIterator, max_unchecked, ParquetStatistics::Float, f32); make_stats_iterator!(MinDoubleStatsIterator, min_unchecked, ParquetStatistics::Double, f64); -make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); +make_stats_iterator!(MaxDoubleStatsIterator, max_unchecked, ParquetStatistics::Double, f64); make_stats_iterator!( MinByteArrayStatsIterator, min_bytes, @@ -251,7 +251,7 @@ make_decimal_stats_iterator!( ); make_decimal_stats_iterator!( MaxDecimal128StatsIterator, - max, + max_unchecked, max_bytes, i128, from_bytes_to_i128 @@ -265,7 +265,7 @@ make_decimal_stats_iterator!( ); make_decimal_stats_iterator!( MaxDecimal256StatsIterator, - max, + max_unchecked, max_bytes, i256, from_bytes_to_i256 diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 12da55007674..70fadd5ae996 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2544,7 +2544,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!(*stats.min_unchecked() as u32, *src_slice.iter().min().unwrap()); - assert_eq!(*stats.max() as u32, *src_slice.iter().max().unwrap()); + assert_eq!(*stats.max_unchecked() as u32, *src_slice.iter().max().unwrap()); } else { panic!("Statistics::Int32 missing") } @@ -2585,7 +2585,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int64(stats) = stats { assert_eq!(*stats.min_unchecked() as u64, *src_slice.iter().min().unwrap()); - assert_eq!(*stats.max() as u64, *src_slice.iter().max().unwrap()); + assert_eq!(*stats.max_unchecked() as u64, *src_slice.iter().max().unwrap()); } else { panic!("Statistics::Int64 missing") } @@ -3070,7 +3070,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { let min = byte_array_stats.min_unchecked(); - let max = byte_array_stats.max(); + let max = byte_array_stats.max_unchecked(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); @@ -3142,7 +3142,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { let min = byte_array_stats.min_unchecked(); - let max = byte_array_stats.max(); + let max = byte_array_stats.max_unchecked(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8b13e27a45b7..d07c268e10d0 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -770,7 +770,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(stat) => { // Check if min/max are still ascending/descending across pages let new_min = stat.min_unchecked(); - let new_max = stat.max(); + let new_max = stat.max_unchecked(); if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { if self.data_page_boundary_ascending { // If last min/max are greater than new min/max then not ascending anymore @@ -1846,7 +1846,7 @@ mod tests { assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_unchecked(), &1); - assert_eq!(stats.max(), &4); + assert_eq!(stats.max_unchecked(), &4); } else { panic!("expecting Statistics::Int32"); } @@ -1896,7 +1896,7 @@ mod tests { ]) ); assert_eq!( - stats.max(), + stats.max_unchecked(), &ByteArray::from(vec![ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, 246u8, 44u8, 0u8, 0u8, @@ -1926,7 +1926,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_unchecked(), &0,); - assert_eq!(stats.max(), &5,); + assert_eq!(stats.max_unchecked(), &5,); } else { panic!("expecting Statistics::Int32"); } @@ -1975,7 +1975,7 @@ mod tests { assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_unchecked(), &-17); - assert_eq!(stats.max(), &9000); + assert_eq!(stats.max_unchecked(), &9000); } else { panic!("expecting Statistics::Int32"); } @@ -2223,7 +2223,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { assert_eq!(stats.min_unchecked(), &false); - assert_eq!(stats.max(), &true); + assert_eq!(stats.max_unchecked(), &true); } else { panic!("expecting Statistics::Boolean, got {stats:?}"); } @@ -2236,7 +2236,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_unchecked(), &-2); - assert_eq!(stats.max(), &3); + assert_eq!(stats.max_unchecked(), &3); } else { panic!("expecting Statistics::Int32, got {stats:?}"); } @@ -2249,7 +2249,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { assert_eq!(stats.min_unchecked(), &-2); - assert_eq!(stats.max(), &3); + assert_eq!(stats.max_unchecked(), &3); } else { panic!("expecting Statistics::Int64, got {stats:?}"); } @@ -2271,7 +2271,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { assert_eq!(stats.min_unchecked(), &Int96::from(vec![0, 20, 30])); - assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); + assert_eq!(stats.max_unchecked(), &Int96::from(vec![3, 20, 10])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); } @@ -2284,7 +2284,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max(), &3.0); + assert_eq!(stats.max_unchecked(), &3.0); } else { panic!("expecting Statistics::Float, got {stats:?}"); } @@ -2297,7 +2297,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max(), &3.0); + assert_eq!(stats.max_unchecked(), &3.0); } else { panic!("expecting Statistics::Double, got {stats:?}"); } @@ -2315,7 +2315,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!(stats.min_unchecked(), &ByteArray::from("aaw")); - assert_eq!(stats.max(), &ByteArray::from("zz")); + assert_eq!(stats.max_unchecked(), &ByteArray::from("zz")); } else { panic!("expecting Statistics::ByteArray, got {stats:?}"); } @@ -2335,7 +2335,7 @@ mod tests { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); assert_eq!(stats.min_unchecked(), &expected_min); let expected_max: FixedLenByteArray = ByteArray::from("zz ").into(); - assert_eq!(stats.max(), &expected_max); + assert_eq!(stats.max_unchecked(), &expected_max); } else { panic!("expecting Statistics::FixedLenByteArray, got {stats:?}"); } @@ -2357,7 +2357,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(-f16::from_f32(2.0))); - assert_eq!(stats.max(), &ByteArray::from(f16::from_f32(3.0))); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::from_f32(3.0))); } #[test] @@ -2371,7 +2371,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ONE + f16::ONE)); } #[test] @@ -2385,7 +2385,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ONE + f16::ONE)); } #[test] @@ -2399,7 +2399,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ONE + f16::ONE)); } #[test] @@ -2425,7 +2425,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2439,7 +2439,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2453,7 +2453,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max(), &ByteArray::from(f16::PI)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::PI)); } #[test] @@ -2467,7 +2467,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_unchecked(), &ByteArray::from(-f16::PI)); - assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2477,7 +2477,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.max_unchecked(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2490,7 +2490,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.max_unchecked(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2512,8 +2512,8 @@ mod tests { if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &-0.0); assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.max_unchecked(), &0.0); + assert!(stats.max_unchecked().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2527,8 +2527,8 @@ mod tests { if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &-0.0); assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.max_unchecked(), &0.0); + assert!(stats.max_unchecked().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2542,7 +2542,7 @@ mod tests { if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &-0.0); assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.max_unchecked(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2555,8 +2555,8 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.max_unchecked(), &0.0); + assert!(stats.max_unchecked().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2569,7 +2569,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.max_unchecked(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2582,7 +2582,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.max_unchecked(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2604,8 +2604,8 @@ mod tests { if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &-0.0); assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.max_unchecked(), &0.0); + assert!(stats.max_unchecked().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2619,8 +2619,8 @@ mod tests { if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &-0.0); assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.max_unchecked(), &0.0); + assert!(stats.max_unchecked().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2634,7 +2634,7 @@ mod tests { if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &-0.0); assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.max_unchecked(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2647,8 +2647,8 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.max_unchecked(), &0.0); + assert!(stats.max_unchecked().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2956,7 +2956,7 @@ mod tests { assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { let min_value = _stats.min_unchecked(); - let max_value = _stats.max(); + let max_value = _stats.max_unchecked(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); @@ -3009,7 +3009,7 @@ mod tests { assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let min_value = _stats.min_unchecked(); - let max_value = _stats.max(); + let max_value = _stats.max_unchecked(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 8ab1c6d11dd4..8b713bda9faa 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -184,11 +184,11 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { if self.has_min_max_set() { - return self.min_unchecked().heap_size() + self.max().heap_size(); + return self.min_unchecked().heap_size() + self.max_unchecked().heap_size(); } else if self.min_is_exact() { return self.min_unchecked().heap_size(); } else if self.max_is_exact() { - return self.max().heap_size(); + return self.max_unchecked().heap_size(); } 0 } diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 58c002f91c7f..54ee5151221a 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -34,7 +34,7 @@ //! match stats { //! Statistics::Int32(ref typed) => { //! assert_eq!(typed.min(), Some(&1)); -//! assert_eq!(*typed.max(), 10); +//! assert_eq!(typed.max(), Some(&10)); //! } //! _ => {} //! } @@ -551,10 +551,15 @@ impl ValueStatistics { /// /// Panics if max value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. - pub fn max(&self) -> &T { + pub(crate) fn max_unchecked(&self) -> &T { self.max.as_ref().unwrap() } + /// Returns max value of the statistics. + pub fn max(&self) -> Option<&T> { + self.max.as_ref() + } + /// Returns min value as bytes of the statistics. /// /// Panics if min value is not set, use `has_min_max_set` method to check @@ -568,7 +573,7 @@ impl ValueStatistics { /// Panics if max value is not set, use `has_min_max_set` method to check /// if values are set. pub fn max_bytes(&self) -> &[u8] { - self.max().as_bytes() + self.max_unchecked().as_bytes() } /// Whether or not min and max values are set. From 7ffac1a967f96afe855a53194e80f3e7dd953574 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 03:13:48 -0500 Subject: [PATCH 03/23] cargo fmt --- parquet/src/arrow/arrow_reader/statistics.rs | 56 +++++++++++++++++--- parquet/src/arrow/arrow_writer/mod.rs | 20 +++++-- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index a66f42fddfb2..98d327b50b11 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -141,14 +141,54 @@ make_stats_iterator!( ParquetStatistics::Boolean, bool ); -make_stats_iterator!(MinInt32StatsIterator, min_unchecked, ParquetStatistics::Int32, i32); -make_stats_iterator!(MaxInt32StatsIterator, max_unchecked, ParquetStatistics::Int32, i32); -make_stats_iterator!(MinInt64StatsIterator, min_unchecked, ParquetStatistics::Int64, i64); -make_stats_iterator!(MaxInt64StatsIterator, max_unchecked, ParquetStatistics::Int64, i64); -make_stats_iterator!(MinFloatStatsIterator, min_unchecked, ParquetStatistics::Float, f32); -make_stats_iterator!(MaxFloatStatsIterator, max_unchecked, ParquetStatistics::Float, f32); -make_stats_iterator!(MinDoubleStatsIterator, min_unchecked, ParquetStatistics::Double, f64); -make_stats_iterator!(MaxDoubleStatsIterator, max_unchecked, ParquetStatistics::Double, f64); +make_stats_iterator!( + MinInt32StatsIterator, + min_unchecked, + ParquetStatistics::Int32, + i32 +); +make_stats_iterator!( + MaxInt32StatsIterator, + max_unchecked, + ParquetStatistics::Int32, + i32 +); +make_stats_iterator!( + MinInt64StatsIterator, + min_unchecked, + ParquetStatistics::Int64, + i64 +); +make_stats_iterator!( + MaxInt64StatsIterator, + max_unchecked, + ParquetStatistics::Int64, + i64 +); +make_stats_iterator!( + MinFloatStatsIterator, + min_unchecked, + ParquetStatistics::Float, + f32 +); +make_stats_iterator!( + MaxFloatStatsIterator, + max_unchecked, + ParquetStatistics::Float, + f32 +); +make_stats_iterator!( + MinDoubleStatsIterator, + min_unchecked, + ParquetStatistics::Double, + f64 +); +make_stats_iterator!( + MaxDoubleStatsIterator, + max_unchecked, + ParquetStatistics::Double, + f64 +); make_stats_iterator!( MinByteArrayStatsIterator, min_bytes, diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 70fadd5ae996..21972812f056 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2543,8 +2543,14 @@ mod tests { let stats = column.statistics().unwrap(); assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(*stats.min_unchecked() as u32, *src_slice.iter().min().unwrap()); - assert_eq!(*stats.max_unchecked() as u32, *src_slice.iter().max().unwrap()); + assert_eq!( + *stats.min_unchecked() as u32, + *src_slice.iter().min().unwrap() + ); + assert_eq!( + *stats.max_unchecked() as u32, + *src_slice.iter().max().unwrap() + ); } else { panic!("Statistics::Int32 missing") } @@ -2584,8 +2590,14 @@ mod tests { let stats = column.statistics().unwrap(); assert!(stats.has_min_max_set()); if let Statistics::Int64(stats) = stats { - assert_eq!(*stats.min_unchecked() as u64, *src_slice.iter().min().unwrap()); - assert_eq!(*stats.max_unchecked() as u64, *src_slice.iter().max().unwrap()); + assert_eq!( + *stats.min_unchecked() as u64, + *src_slice.iter().min().unwrap() + ); + assert_eq!( + *stats.max_unchecked() as u64, + *src_slice.iter().max().unwrap() + ); } else { panic!("Statistics::Int64 missing") } From 87c375a67b3e98d5e2f5dd464e3dd8cd9d5deef2 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 03:15:10 -0500 Subject: [PATCH 04/23] remove Statistics::has_min_max_set from the public api Ref: https://github.com/apache/arrow-rs/issues/6093 --- parquet/src/file/statistics.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 54ee5151221a..448b1efe922d 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -26,7 +26,6 @@ //! //! let stats = Statistics::int32(Some(1), Some(10), None, 3, true); //! assert_eq!(stats.null_count(), 3); -//! assert!(stats.has_min_max_set()); //! assert!(stats.is_min_max_deprecated()); //! assert!(stats.min_is_exact()); //! assert!(stats.max_is_exact()); @@ -396,7 +395,7 @@ impl Statistics { /// Returns `true` if min value and max value are set. /// Normally both min/max values will be set to `Some(value)` or `None`. - pub fn has_min_max_set(&self) -> bool { + pub(crate) fn has_min_max_set(&self) -> bool { statistics_enum_func![self, has_min_max_set] } @@ -578,7 +577,7 @@ impl ValueStatistics { /// Whether or not min and max values are set. /// Normally both min/max values will be set to `Some(value)` or `None`. - pub fn has_min_max_set(&self) -> bool { + pub(crate) fn has_min_max_set(&self) -> bool { self.min.is_some() && self.max.is_some() } From 6950994213e5b73987e05fb0a06feeffe7744d43 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 03:42:48 -0500 Subject: [PATCH 05/23] update impl HeapSize for ValueStatistics to use new min and max api --- parquet/src/file/metadata/memory.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 8b713bda9faa..979904cf4be8 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -183,14 +183,7 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { - if self.has_min_max_set() { - return self.min_unchecked().heap_size() + self.max_unchecked().heap_size(); - } else if self.min_is_exact() { - return self.min_unchecked().heap_size(); - } else if self.max_is_exact() { - return self.max_unchecked().heap_size(); - } - 0 + self.min().map(T::heap_size).unwrap_or(0) + self.max().map(T::heap_size).unwrap_or(0) } } impl HeapSize for bool { From c65a773440ae9d69be84d28ee9957ba5943c20dd Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 03:47:26 -0500 Subject: [PATCH 06/23] migrate all tests to new Statistics min and max api --- parquet/src/arrow/arrow_writer/mod.rs | 16 +-- parquet/src/column/writer/mod.rs | 160 +++++++++++++------------- 2 files changed, 88 insertions(+), 88 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 21972812f056..c8331aabb693 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2544,11 +2544,11 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!( - *stats.min_unchecked() as u32, + *stats.min().unwrap() as u32, *src_slice.iter().min().unwrap() ); assert_eq!( - *stats.max_unchecked() as u32, + *stats.max().unwrap() as u32, *src_slice.iter().max().unwrap() ); } else { @@ -2591,11 +2591,11 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int64(stats) = stats { assert_eq!( - *stats.min_unchecked() as u64, + *stats.min().unwrap() as u64, *src_slice.iter().min().unwrap() ); assert_eq!( - *stats.max_unchecked() as u64, + *stats.max().unwrap() as u64, *src_slice.iter().max().unwrap() ); } else { @@ -3081,8 +3081,8 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min_unchecked(); - let max = byte_array_stats.max_unchecked(); + let min = byte_array_stats.min().unwrap(); + let max = byte_array_stats.max().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); @@ -3153,8 +3153,8 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min_unchecked(); - let max = byte_array_stats.max_unchecked(); + let min = byte_array_stats.min().unwrap(); + let max = byte_array_stats.max().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index d07c268e10d0..cf2ddb4a70ff 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1845,8 +1845,8 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min_unchecked(), &1); - assert_eq!(stats.max_unchecked(), &4); + assert_eq!(stats.min().unwrap(), &1); + assert_eq!(stats.max().unwrap(), &4); } else { panic!("expecting Statistics::Int32"); } @@ -1889,14 +1889,14 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!( - stats.min_unchecked(), + stats.min().unwrap(), &ByteArray::from(vec![ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, 35u8, 231u8, 90u8, 0u8, 0u8, ]) ); assert_eq!( - stats.max_unchecked(), + stats.max().unwrap(), &ByteArray::from(vec![ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, 246u8, 44u8, 0u8, 0u8, @@ -1925,8 +1925,8 @@ mod tests { if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min_unchecked(), &0,); - assert_eq!(stats.max_unchecked(), &5,); + assert_eq!(stats.min().unwrap(), &0,); + assert_eq!(stats.max().unwrap(), &5,); } else { panic!("expecting Statistics::Int32"); } @@ -1974,8 +1974,8 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min_unchecked(), &-17); - assert_eq!(stats.max_unchecked(), &9000); + assert_eq!(stats.min().unwrap(), &-17); + assert_eq!(stats.max().unwrap(), &9000); } else { panic!("expecting Statistics::Int32"); } @@ -2222,8 +2222,8 @@ mod tests { // with the deprecated `min` and `max` statistics assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { - assert_eq!(stats.min_unchecked(), &false); - assert_eq!(stats.max_unchecked(), &true); + assert_eq!(stats.min().unwrap(), &false); + assert_eq!(stats.max().unwrap(), &true); } else { panic!("expecting Statistics::Boolean, got {stats:?}"); } @@ -2235,8 +2235,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min_unchecked(), &-2); - assert_eq!(stats.max_unchecked(), &3); + assert_eq!(stats.min().unwrap(), &-2); + assert_eq!(stats.max().unwrap(), &3); } else { panic!("expecting Statistics::Int32, got {stats:?}"); } @@ -2248,8 +2248,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { - assert_eq!(stats.min_unchecked(), &-2); - assert_eq!(stats.max_unchecked(), &3); + assert_eq!(stats.min().unwrap(), &-2); + assert_eq!(stats.max().unwrap(), &3); } else { panic!("expecting Statistics::Int64, got {stats:?}"); } @@ -2270,8 +2270,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { - assert_eq!(stats.min_unchecked(), &Int96::from(vec![0, 20, 30])); - assert_eq!(stats.max_unchecked(), &Int96::from(vec![3, 20, 10])); + assert_eq!(stats.min().unwrap(), &Int96::from(vec![0, 20, 30])); + assert_eq!(stats.max().unwrap(), &Int96::from(vec![3, 20, 10])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); } @@ -2283,8 +2283,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max_unchecked(), &3.0); + assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.max().unwrap(), &3.0); } else { panic!("expecting Statistics::Float, got {stats:?}"); } @@ -2296,8 +2296,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max_unchecked(), &3.0); + assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.max().unwrap(), &3.0); } else { panic!("expecting Statistics::Double, got {stats:?}"); } @@ -2314,8 +2314,8 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { - assert_eq!(stats.min_unchecked(), &ByteArray::from("aaw")); - assert_eq!(stats.max_unchecked(), &ByteArray::from("zz")); + assert_eq!(stats.min().unwrap(), &ByteArray::from("aaw")); + assert_eq!(stats.max().unwrap(), &ByteArray::from("zz")); } else { panic!("expecting Statistics::ByteArray, got {stats:?}"); } @@ -2333,9 +2333,9 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); - assert_eq!(stats.min_unchecked(), &expected_min); + assert_eq!(stats.min().unwrap(), &expected_min); let expected_max: FixedLenByteArray = ByteArray::from("zz ").into(); - assert_eq!(stats.max_unchecked(), &expected_max); + assert_eq!(stats.max().unwrap(), &expected_max); } else { panic!("expecting Statistics::FixedLenByteArray, got {stats:?}"); } @@ -2356,8 +2356,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(-f16::from_f32(2.0))); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::from_f32(3.0))); + assert_eq!(stats.min().unwrap(), &ByteArray::from(-f16::from_f32(2.0))); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::from_f32(3.0))); } #[test] @@ -2370,8 +2370,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ONE + f16::ONE)); } #[test] @@ -2384,8 +2384,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ONE + f16::ONE)); } #[test] @@ -2398,8 +2398,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ONE + f16::ONE)); } #[test] @@ -2424,8 +2424,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2438,8 +2438,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2452,8 +2452,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::PI)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::PI)); } #[test] @@ -2466,8 +2466,8 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min_unchecked(), &ByteArray::from(-f16::PI)); - assert_eq!(stats.max_unchecked(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.min().unwrap(), &ByteArray::from(-f16::PI)); + assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2476,8 +2476,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max_unchecked(), &2.0); + assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.max().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2489,8 +2489,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max_unchecked(), &2.0); + assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.max().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2510,10 +2510,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &-0.0); - assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max_unchecked(), &0.0); - assert!(stats.max_unchecked().is_sign_positive()); + assert_eq!(stats.min().unwrap(), &-0.0); + assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.max().unwrap(), &0.0); + assert!(stats.max().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2525,10 +2525,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &-0.0); - assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max_unchecked(), &0.0); - assert!(stats.max_unchecked().is_sign_positive()); + assert_eq!(stats.min().unwrap(), &-0.0); + assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.max().unwrap(), &0.0); + assert!(stats.max().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2540,9 +2540,9 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &-0.0); - assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max_unchecked(), &2.0); + assert_eq!(stats.min().unwrap(), &-0.0); + assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.max().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2554,9 +2554,9 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max_unchecked(), &0.0); - assert!(stats.max_unchecked().is_sign_positive()); + assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.max().unwrap(), &0.0); + assert!(stats.max().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2568,8 +2568,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max_unchecked(), &2.0); + assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.max().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2581,8 +2581,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &1.0); - assert_eq!(stats.max_unchecked(), &2.0); + assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.max().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2602,10 +2602,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &-0.0); - assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max_unchecked(), &0.0); - assert!(stats.max_unchecked().is_sign_positive()); + assert_eq!(stats.min().unwrap(), &-0.0); + assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.max().unwrap(), &0.0); + assert!(stats.max().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2617,10 +2617,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &-0.0); - assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max_unchecked(), &0.0); - assert!(stats.max_unchecked().is_sign_positive()); + assert_eq!(stats.min().unwrap(), &-0.0); + assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.max().unwrap(), &0.0); + assert!(stats.max().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2632,9 +2632,9 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &-0.0); - assert!(stats.min_unchecked().is_sign_negative()); - assert_eq!(stats.max_unchecked(), &2.0); + assert_eq!(stats.min().unwrap(), &-0.0); + assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.max().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2646,9 +2646,9 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min_unchecked(), &-2.0); - assert_eq!(stats.max_unchecked(), &0.0); - assert!(stats.max_unchecked().is_sign_positive()); + assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.max().unwrap(), &0.0); + assert!(stats.max().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2955,8 +2955,8 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { - let min_value = _stats.min_unchecked(); - let max_value = _stats.max_unchecked(); + let min_value = _stats.min().unwrap(); + let max_value = _stats.max().unwrap(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); @@ -3008,8 +3008,8 @@ mod tests { assert_eq!(stats.null_count(), 0); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { - let min_value = _stats.min_unchecked(); - let max_value = _stats.max_unchecked(); + let min_value = _stats.min().unwrap(); + let max_value = _stats.max().unwrap(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); From 89ea21db4928b15449cf609afbc4ce10a1a90c31 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 13:58:20 -0500 Subject: [PATCH 07/23] make Statistics::null_count return Option This removes ambiguity around whether the between all values are non-null or just that the null count stat is missing Ref: https://github.com/apache/arrow-rs/issues/6215 --- parquet/src/arrow/arrow_reader/statistics.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/column/page.rs | 10 +- parquet/src/column/writer/mod.rs | 22 +- parquet/src/file/metadata/mod.rs | 10 +- parquet/src/file/statistics.rs | 211 +++++++++++++------ parquet/src/file/writer.rs | 6 +- 7 files changed, 171 insertions(+), 92 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 98d327b50b11..dff861bac7d1 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1387,7 +1387,7 @@ impl<'a> StatisticsConverter<'a> { let null_counts = metadatas .into_iter() .map(|x| x.column(parquet_index).statistics()) - .map(|s| s.map(|s| s.null_count())); + .map(|s| s.and_then(|s| s.null_count())); Ok(UInt64Array::from_iter(null_counts)) } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index c8331aabb693..a52809daf3ed 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2620,7 +2620,7 @@ mod tests { assert_eq!(row_group.num_columns(), 1); let column = row_group.column(0); let stats = column.statistics().unwrap(); - assert_eq!(stats.null_count(), 2); + assert_eq!(stats.null_count(), Some(2)); } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 585d1951c14a..e3931dfe9e2b 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -370,7 +370,7 @@ mod tests { encoding: Encoding::PLAIN, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)), }; assert_eq!(data_page.page_type(), PageType::DATA_PAGE); assert_eq!(data_page.buffer(), vec![0, 1, 2].as_slice()); @@ -378,7 +378,7 @@ mod tests { assert_eq!(data_page.encoding(), Encoding::PLAIN); assert_eq!( data_page.statistics(), - Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true)) ); let data_page_v2 = Page::DataPageV2 { @@ -390,7 +390,7 @@ mod tests { def_levels_byte_len: 30, rep_levels_byte_len: 40, is_compressed: false, - statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)), }; assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2); assert_eq!(data_page_v2.buffer(), vec![0, 1, 2].as_slice()); @@ -398,7 +398,7 @@ mod tests { assert_eq!(data_page_v2.encoding(), Encoding::PLAIN); assert_eq!( data_page_v2.statistics(), - Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true)) ); let dict_page = Page::DictionaryPage { @@ -422,7 +422,7 @@ mod tests { encoding: Encoding::PLAIN, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)), }; let cpage = CompressedPage::new(data_page, 5); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index cf2ddb4a70ff..81922ad20a96 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -897,7 +897,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(min), Some(max), None, - self.page_metrics.num_page_nulls, + Some(self.page_metrics.num_page_nulls), false, ), ) @@ -1066,7 +1066,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_metrics.min_column_value.clone(), self.column_metrics.max_column_value.clone(), self.column_metrics.column_distinct_count, - self.column_metrics.num_column_nulls, + Some(self.column_metrics.num_column_nulls), false, ) .with_backwards_compatible_min_max(backwards_compatible_min_max) @@ -1842,7 +1842,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min().unwrap(), &1); @@ -1971,7 +1971,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min().unwrap(), &-17); @@ -2002,7 +2002,7 @@ mod tests { let stats = r.metadata.statistics().unwrap(); assert_eq!(stats.min_bytes(), 1_i32.to_le_bytes()); assert_eq!(stats.max_bytes(), 7_i32.to_le_bytes()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? assert!(stats.distinct_count().is_none()); drop(write); @@ -2028,7 +2028,7 @@ mod tests { let page_statistics = pages[1].statistics().unwrap(); assert_eq!(page_statistics.min_bytes(), 1_i32.to_le_bytes()); assert_eq!(page_statistics.max_bytes(), 7_i32.to_le_bytes()); - assert_eq!(page_statistics.null_count(), 0); + assert_eq!(page_statistics.null_count(), Some(0)); // TODO: None or 0? assert!(page_statistics.distinct_count().is_none()); } @@ -2706,7 +2706,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { // first page is [1,2,3,4] @@ -2761,7 +2761,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(stats) = stats { let column_index_min_value = &column_index.min_values[0]; @@ -2828,7 +2828,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let column_index_min_value = &column_index.min_values[0]; @@ -2952,7 +2952,7 @@ mod tests { let stats = r.metadata.statistics().expect("statistics"); assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { let min_value = _stats.min().unwrap(); @@ -3005,7 +3005,7 @@ mod tests { let stats = r.metadata.statistics().expect("statistics"); assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let min_value = _stats.min().unwrap(); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 45ef0c546ffe..1fe5f50a42e0 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1613,7 +1613,7 @@ mod tests { .iter() .map(|column_descr| { ColumnChunkMetaData::builder(column_descr.clone()) - .set_statistics(Statistics::new::(None, None, None, 0, false)) + .set_statistics(Statistics::new::(None, None, None, None, false)) .build() }) .collect::>>() @@ -1651,7 +1651,13 @@ mod tests { .iter() .map(|column_descr| { ColumnChunkMetaData::builder(column_descr.clone()) - .set_statistics(Statistics::new::(Some(0), Some(100), None, 0, false)) + .set_statistics(Statistics::new::( + Some(0), + Some(100), + None, + None, + false, + )) .build() }) .collect::>>() diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 448b1efe922d..271acf4fd033 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -24,8 +24,8 @@ //! ```rust //! use parquet::file::statistics::Statistics; //! -//! let stats = Statistics::int32(Some(1), Some(10), None, 3, true); -//! assert_eq!(stats.null_count(), 3); +//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true); +//! assert_eq!(stats.null_count(), Some(3)); //! assert!(stats.is_min_max_deprecated()); //! assert!(stats.min_is_exact()); //! assert!(stats.max_is_exact()); @@ -88,7 +88,7 @@ macro_rules! statistics_new_func { min: $vtype, max: $vtype, distinct: Option, - nulls: u64, + nulls: Option, is_deprecated: bool, ) -> Self { Statistics::$stat(ValueStatistics::new( @@ -136,7 +136,7 @@ pub fn from_thrift( } // Generic null count. - let null_count = null_count as u64; + let null_count = Some(null_count as u64); // Generic distinct count (count of distinct values occurring) let distinct_count = stats.distinct_count.map(|value| value as u64); // Whether or not statistics use deprecated min/max fields. @@ -245,11 +245,7 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { let mut thrift_stats = TStatistics { max: None, min: None, - null_count: if stats.has_nulls() { - Some(stats.null_count() as i64) - } else { - None - }, + null_count: stats.null_count().map(|value| value as i64), distinct_count: stats.distinct_count().map(|value| value as i64), max_value: None, min_value: None, @@ -320,7 +316,7 @@ impl Statistics { min: Option, max: Option, distinct_count: Option, - null_count: u64, + null_count: Option, is_deprecated: bool, ) -> Self { Self::from(ValueStatistics::new( @@ -384,15 +380,10 @@ impl Statistics { /// Returns number of null values for the column. /// Note that this includes all nulls when column is part of the complex type. - pub fn null_count(&self) -> u64 { + pub fn null_count(&self) -> Option { statistics_enum_func![self, null_count] } - /// Returns `true` if statistics collected any null values, `false` otherwise. - pub fn has_nulls(&self) -> bool { - self.null_count() > 0 - } - /// Returns `true` if min value and max value are set. /// Normally both min/max values will be set to `Some(value)` or `None`. pub(crate) fn has_min_max_set(&self) -> bool { @@ -463,7 +454,7 @@ pub struct ValueStatistics { max: Option, // Distinct count could be omitted in some cases distinct_count: Option, - null_count: u64, + null_count: Option, // Whether or not the min or max values are exact, or truncated. is_max_value_exact: bool, @@ -484,7 +475,7 @@ impl ValueStatistics { min: Option, max: Option, distinct_count: Option, - null_count: u64, + null_count: Option, is_min_max_deprecated: bool, ) -> Self { Self { @@ -597,7 +588,7 @@ impl ValueStatistics { } /// Returns null count. - pub fn null_count(&self) -> u64 { + pub fn null_count(&self) -> Option { self.null_count } @@ -639,7 +630,11 @@ impl fmt::Display for ValueStatistics { Some(value) => write!(f, "{value}")?, None => write!(f, "N/A")?, } - write!(f, ", null_count: {}", self.null_count)?; + write!(f, ", null_count: ")?; + match self.null_count { + Some(value) => write!(f, "{value}")?, + None => write!(f, "N/A")?, + } write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; write!(f, ", max_value_exact: {}", self.is_max_value_exact)?; write!(f, ", min_value_exact: {}", self.is_min_value_exact)?; @@ -651,7 +646,7 @@ impl fmt::Debug for ValueStatistics { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ + "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \ min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}", self.min, self.max, @@ -671,7 +666,7 @@ mod tests { #[test] fn test_statistics_min_max_bytes() { - let stats = Statistics::int32(Some(-123), Some(234), None, 1, false); + let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false); assert!(stats.has_min_max_set()); assert_eq!(stats.min_bytes(), (-123).as_bytes()); assert_eq!(stats.max_bytes(), 234.as_bytes()); @@ -680,7 +675,7 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![3, 4, 5])), None, - 1, + Some(1), true, ); assert!(stats.has_min_max_set()); @@ -713,30 +708,30 @@ mod tests { #[test] fn test_statistics_debug() { - let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true); assert_eq!( format!("{stats:?}"), - "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ + "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \ min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})" ); - let stats = Statistics::int32(None, None, None, 7, false); + let stats = Statistics::int32(None, None, None, Some(7), false); assert_eq!( format!("{stats:?}"), - "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ + "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \ min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})" ) } #[test] fn test_statistics_display() { - let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true); assert_eq!( format!("{stats}"), "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}" ); - let stats = Statistics::int64(None, None, None, 7, false); + let stats = Statistics::int64(None, None, None, Some(7), false); assert_eq!( format!("{stats}"), "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ @@ -747,7 +742,7 @@ mod tests { Some(Int96::from(vec![1, 0, 0])), Some(Int96::from(vec![2, 3, 4])), None, - 3, + Some(3), true, ); assert_eq!( @@ -761,7 +756,7 @@ mod tests { Some(ByteArray::from(vec![1u8])), Some(ByteArray::from(vec![2u8])), Some(5), - 7, + Some(7), false, ) .with_max_is_exact(false) @@ -775,22 +770,22 @@ mod tests { #[test] fn test_statistics_partial_eq() { - let expected = Statistics::int32(Some(12), Some(45), None, 11, true); + let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true); - assert!(Statistics::int32(Some(12), Some(45), None, 11, true) == expected); - assert!(Statistics::int32(Some(11), Some(45), None, 11, true) != expected); - assert!(Statistics::int32(Some(12), Some(44), None, 11, true) != expected); - assert!(Statistics::int32(Some(12), Some(45), None, 23, true) != expected); - assert!(Statistics::int32(Some(12), Some(45), None, 11, false) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected); + assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected); + assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected); assert!( - Statistics::int32(Some(12), Some(45), None, 11, false) - != Statistics::int64(Some(12), Some(45), None, 11, false) + Statistics::int32(Some(12), Some(45), None, Some(11), false) + != Statistics::int64(Some(12), Some(45), None, Some(11), false) ); assert!( - Statistics::boolean(Some(false), Some(true), None, 0, true) - != Statistics::double(Some(1.2), Some(4.5), None, 0, true) + Statistics::boolean(Some(false), Some(true), None, None, true) + != Statistics::double(Some(1.2), Some(4.5), None, None, true) ); assert!( @@ -798,13 +793,13 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![1, 2, 3])), None, - 0, + None, true ) != Statistics::fixed_len_byte_array( Some(ByteArray::from(vec![1, 2, 3]).into()), Some(ByteArray::from(vec![1, 2, 3]).into()), None, - 0, + None, true, ) ); @@ -814,14 +809,14 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) != Statistics::ByteArray( ValueStatistics::new( Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) .with_max_is_exact(false) @@ -833,14 +828,14 @@ mod tests { Some(FixedLenByteArray::from(vec![1, 2, 3])), Some(FixedLenByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) != Statistics::FixedLenByteArray( ValueStatistics::new( Some(FixedLenByteArray::from(vec![1, 2, 3])), Some(FixedLenByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) .with_min_is_exact(false) @@ -857,45 +852,123 @@ mod tests { assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats)); } - check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); - check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); - check_stats(Statistics::boolean(Some(false), Some(true), None, 0, false)); - check_stats(Statistics::boolean(Some(true), Some(true), None, 7, true)); - check_stats(Statistics::boolean(Some(false), Some(false), None, 7, true)); - check_stats(Statistics::boolean(None, None, None, 7, true)); + check_stats(Statistics::boolean( + Some(false), + Some(true), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean( + Some(false), + Some(true), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean( + Some(false), + Some(true), + None, + Some(0), + false, + )); + check_stats(Statistics::boolean( + Some(true), + Some(true), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean( + Some(false), + Some(false), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean(None, None, None, Some(7), true)); - check_stats(Statistics::int32(Some(-100), Some(500), None, 7, true)); - check_stats(Statistics::int32(Some(-100), Some(500), None, 0, false)); - check_stats(Statistics::int32(None, None, None, 7, true)); + check_stats(Statistics::int32( + Some(-100), + Some(500), + None, + Some(7), + true, + )); + check_stats(Statistics::int32( + Some(-100), + Some(500), + None, + Some(0), + false, + )); + check_stats(Statistics::int32(None, None, None, Some(7), true)); - check_stats(Statistics::int64(Some(-100), Some(200), None, 7, true)); - check_stats(Statistics::int64(Some(-100), Some(200), None, 0, false)); - check_stats(Statistics::int64(None, None, None, 7, true)); + check_stats(Statistics::int64( + Some(-100), + Some(200), + None, + Some(7), + true, + )); + check_stats(Statistics::int64( + Some(-100), + Some(200), + None, + Some(0), + false, + )); + check_stats(Statistics::int64(None, None, None, Some(7), true)); - check_stats(Statistics::float(Some(1.2), Some(3.4), None, 7, true)); - check_stats(Statistics::float(Some(1.2), Some(3.4), None, 0, false)); - check_stats(Statistics::float(None, None, None, 7, true)); + check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true)); + check_stats(Statistics::float( + Some(1.2), + Some(3.4), + None, + Some(0), + false, + )); + check_stats(Statistics::float(None, None, None, Some(7), true)); - check_stats(Statistics::double(Some(1.2), Some(3.4), None, 7, true)); - check_stats(Statistics::double(Some(1.2), Some(3.4), None, 0, false)); - check_stats(Statistics::double(None, None, None, 7, true)); + check_stats(Statistics::double( + Some(1.2), + Some(3.4), + None, + Some(7), + true, + )); + check_stats(Statistics::double( + Some(1.2), + Some(3.4), + None, + Some(0), + false, + )); + check_stats(Statistics::double(None, None, None, Some(7), true)); check_stats(Statistics::byte_array( Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![3, 4, 5])), None, - 7, + Some(7), true, )); - check_stats(Statistics::byte_array(None, None, None, 7, true)); + check_stats(Statistics::byte_array(None, None, None, Some(7), true)); check_stats(Statistics::fixed_len_byte_array( Some(ByteArray::from(vec![1, 2, 3]).into()), Some(ByteArray::from(vec![3, 4, 5]).into()), None, - 7, + Some(7), + true, + )); + check_stats(Statistics::fixed_len_byte_array( + None, + None, + None, + Some(7), true, )); - check_stats(Statistics::fixed_len_byte_array(None, None, None, 7, true)); } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index c84d06a2ce70..37681838e7c4 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1397,7 +1397,7 @@ mod tests { encoding: Encoding::DELTA_BINARY_PACKED, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + statistics: Some(Statistics::int32(Some(1), Some(3), None, Some(7), true)), }, Page::DataPageV2 { buf: Bytes::from(vec![4; 128]), @@ -1408,7 +1408,7 @@ mod tests { def_levels_byte_len: 24, rep_levels_byte_len: 32, is_compressed: false, - statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + statistics: Some(Statistics::int32(Some(1), Some(3), None, Some(7), true)), }, ]; @@ -1431,7 +1431,7 @@ mod tests { encoding: Encoding::DELTA_BINARY_PACKED, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + statistics: Some(Statistics::int32(Some(1), Some(3), None, Some(7), true)), }, Page::DataPageV2 { buf: Bytes::from(vec![4; 128]), From f2785f8c60c7bd88ce0ca04652b60ff6911f754b Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 14:19:12 -0500 Subject: [PATCH 08/23] update expected metadata memory size tests Changing null_count from u64 to Option increases the memory size and layout of the metadata. I included these tests as a separate commit to call extra attention to it. --- parquet/src/file/metadata/mod.rs | 4 +-- parquet/tests/arrow_writer_layout.rs | 52 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 1fe5f50a42e0..ec89ce964aac 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1671,7 +1671,7 @@ mod tests { let row_group_meta_with_stats = vec![row_group_meta_with_stats]; let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats); - let base_expected_size = 2280; + let base_expected_size = 2312; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1699,7 +1699,7 @@ mod tests { ]]), ); - let bigger_expected_size = 2784; + let bigger_expected_size = 2816; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 3e0f6ce3a8b3..9a66d13f84d7 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -189,7 +189,7 @@ fn test_primitive() { pages: (0..8) .map(|_| Page { rows: 250, - page_header_size: 36, + page_header_size: 38, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -218,14 +218,14 @@ fn test_primitive() { pages: vec![ Page { rows: 250, - page_header_size: 36, + page_header_size: 38, compressed_size: 258, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1750, - page_header_size: 36, + page_header_size: 38, compressed_size: 7000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -233,7 +233,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 250, - page_header_size: 36, + page_header_size: 38, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -260,42 +260,42 @@ fn test_primitive() { pages: vec![ Page { rows: 400, - page_header_size: 36, + page_header_size: 38, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 36, + page_header_size: 38, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 36, + page_header_size: 38, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 36, + page_header_size: 38, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 36, + page_header_size: 38, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 36, + page_header_size: 38, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -303,7 +303,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 36, + page_header_size: 38, compressed_size: 8000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -329,7 +329,7 @@ fn test_primitive() { pages: (0..20) .map(|_| Page { rows: 100, - page_header_size: 36, + page_header_size: 38, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -364,14 +364,14 @@ fn test_string() { pages: (0..15) .map(|_| Page { rows: 130, - page_header_size: 36, + page_header_size: 38, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }) .chain(std::iter::once(Page { rows: 50, - page_header_size: 35, + page_header_size: 37, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -400,21 +400,21 @@ fn test_string() { pages: vec![ Page { rows: 130, - page_header_size: 36, + page_header_size: 38, compressed_size: 138, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1250, - page_header_size: 38, + page_header_size: 40, compressed_size: 10000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }, Page { rows: 620, - page_header_size: 36, + page_header_size: 38, compressed_size: 4960, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -422,7 +422,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 130, - page_header_size: 36, + page_header_size: 38, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -449,42 +449,42 @@ fn test_string() { pages: vec![ Page { rows: 400, - page_header_size: 36, + page_header_size: 38, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 36, + page_header_size: 38, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 36, + page_header_size: 38, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 36, + page_header_size: 38, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 36, + page_header_size: 38, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 36, + page_header_size: 38, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -492,7 +492,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 36, + page_header_size: 38, compressed_size: 16000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -532,7 +532,7 @@ fn test_list() { pages: (0..10) .map(|_| Page { rows: 20, - page_header_size: 36, + page_header_size: 38, compressed_size: 672, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, From 9a4921f0ea284a89200ecaa91f12c4a040f0bdb4 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Fri, 9 Aug 2024 14:45:14 -0500 Subject: [PATCH 09/23] add TODO question on is_min_max_backwards_compatible --- parquet/src/file/statistics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 271acf4fd033..d19be2877f12 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -601,6 +601,7 @@ impl ValueStatistics { /// using signed comparison. This resulted in an undefined ordering for unsigned /// quantities, such as booleans and unsigned integers. /// + // TODO: I don't see `min_value` or `max_value` in this struct. Should these be something else? /// These fields were therefore deprecated in favour of `min_value` and `max_value`, /// which have a type-defined sort order. /// From 7e082a7ccd91a2e9a87584f301127ef28f7ec124 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Tue, 13 Aug 2024 17:14:33 -0500 Subject: [PATCH 10/23] Apply suggestions from code review Co-authored-by: Andrew Lamb --- parquet/src/file/statistics.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index d19be2877f12..efd19085df23 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -378,7 +378,7 @@ impl Statistics { statistics_enum_func![self, distinct_count] } - /// Returns number of null values for the column. + /// Returns number of null values for the column, if known. /// Note that this includes all nulls when column is part of the complex type. pub fn null_count(&self) -> Option { statistics_enum_func![self, null_count] @@ -532,7 +532,7 @@ impl ValueStatistics { self.min.as_ref().unwrap() } - /// Returns min value of the statistics. + /// Returns min value of the statistics, if known. pub fn min(&self) -> Option<&T> { self.min.as_ref() } From ddd8498a905e59f5a59762093e67d1980e400f7b Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 17:20:20 -0500 Subject: [PATCH 11/23] update ValueStatistics::max docs --- parquet/src/file/statistics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index efd19085df23..935e9f0e91fc 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -545,7 +545,7 @@ impl ValueStatistics { self.max.as_ref().unwrap() } - /// Returns max value of the statistics. + /// Returns max value of the statistics, if known. pub fn max(&self) -> Option<&T> { self.max.as_ref() } From e92321b0915c0e5320cd4e4d8b48fe8fba881697 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 17:25:02 -0500 Subject: [PATCH 12/23] rename new optional ValueStatistics::max to max_opt Per PR review, we will deprecate the old API instead of introducing a brekaing change. Ref: https://github.com/apache/arrow-rs/pull/6216#pullrequestreview-2236537291 --- parquet/src/arrow/arrow_reader/statistics.rs | 34 ++----- parquet/src/arrow/arrow_writer/mod.rs | 8 +- parquet/src/column/writer/mod.rs | 94 +++++++++++--------- parquet/src/file/metadata/memory.rs | 2 +- parquet/src/file/statistics.rs | 8 +- 5 files changed, 69 insertions(+), 77 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index dff861bac7d1..dd11c1088111 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -137,7 +137,7 @@ make_stats_iterator!( ); make_stats_iterator!( MaxBooleanStatsIterator, - max_unchecked, + max, ParquetStatistics::Boolean, bool ); @@ -147,48 +147,28 @@ make_stats_iterator!( ParquetStatistics::Int32, i32 ); -make_stats_iterator!( - MaxInt32StatsIterator, - max_unchecked, - ParquetStatistics::Int32, - i32 -); +make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); make_stats_iterator!( MinInt64StatsIterator, min_unchecked, ParquetStatistics::Int64, i64 ); -make_stats_iterator!( - MaxInt64StatsIterator, - max_unchecked, - ParquetStatistics::Int64, - i64 -); +make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); make_stats_iterator!( MinFloatStatsIterator, min_unchecked, ParquetStatistics::Float, f32 ); -make_stats_iterator!( - MaxFloatStatsIterator, - max_unchecked, - ParquetStatistics::Float, - f32 -); +make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); make_stats_iterator!( MinDoubleStatsIterator, min_unchecked, ParquetStatistics::Double, f64 ); -make_stats_iterator!( - MaxDoubleStatsIterator, - max_unchecked, - ParquetStatistics::Double, - f64 -); +make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); make_stats_iterator!( MinByteArrayStatsIterator, min_bytes, @@ -291,7 +271,7 @@ make_decimal_stats_iterator!( ); make_decimal_stats_iterator!( MaxDecimal128StatsIterator, - max_unchecked, + max, max_bytes, i128, from_bytes_to_i128 @@ -305,7 +285,7 @@ make_decimal_stats_iterator!( ); make_decimal_stats_iterator!( MaxDecimal256StatsIterator, - max_unchecked, + max, max_bytes, i256, from_bytes_to_i256 diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index a52809daf3ed..91db98be70b8 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2548,7 +2548,7 @@ mod tests { *src_slice.iter().min().unwrap() ); assert_eq!( - *stats.max().unwrap() as u32, + *stats.max_opt().unwrap() as u32, *src_slice.iter().max().unwrap() ); } else { @@ -2595,7 +2595,7 @@ mod tests { *src_slice.iter().min().unwrap() ); assert_eq!( - *stats.max().unwrap() as u64, + *stats.max_opt().unwrap() as u64, *src_slice.iter().max().unwrap() ); } else { @@ -3082,7 +3082,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { let min = byte_array_stats.min().unwrap(); - let max = byte_array_stats.max().unwrap(); + let max = byte_array_stats.max_opt().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); @@ -3154,7 +3154,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { let min = byte_array_stats.min().unwrap(); - let max = byte_array_stats.max().unwrap(); + let max = byte_array_stats.max_opt().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 81922ad20a96..1d8cdf8f9819 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -770,7 +770,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(stat) => { // Check if min/max are still ascending/descending across pages let new_min = stat.min_unchecked(); - let new_max = stat.max_unchecked(); + let new_max = stat.max(); if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { if self.data_page_boundary_ascending { // If last min/max are greater than new min/max then not ascending anymore @@ -1846,7 +1846,7 @@ mod tests { assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min().unwrap(), &1); - assert_eq!(stats.max().unwrap(), &4); + assert_eq!(stats.max_opt().unwrap(), &4); } else { panic!("expecting Statistics::Int32"); } @@ -1896,7 +1896,7 @@ mod tests { ]) ); assert_eq!( - stats.max().unwrap(), + stats.max_opt().unwrap(), &ByteArray::from(vec![ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, 246u8, 44u8, 0u8, 0u8, @@ -1926,7 +1926,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min().unwrap(), &0,); - assert_eq!(stats.max().unwrap(), &5,); + assert_eq!(stats.max_opt().unwrap(), &5,); } else { panic!("expecting Statistics::Int32"); } @@ -1975,7 +1975,7 @@ mod tests { assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min().unwrap(), &-17); - assert_eq!(stats.max().unwrap(), &9000); + assert_eq!(stats.max_opt().unwrap(), &9000); } else { panic!("expecting Statistics::Int32"); } @@ -2223,7 +2223,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { assert_eq!(stats.min().unwrap(), &false); - assert_eq!(stats.max().unwrap(), &true); + assert_eq!(stats.max_opt().unwrap(), &true); } else { panic!("expecting Statistics::Boolean, got {stats:?}"); } @@ -2236,7 +2236,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min().unwrap(), &-2); - assert_eq!(stats.max().unwrap(), &3); + assert_eq!(stats.max_opt().unwrap(), &3); } else { panic!("expecting Statistics::Int32, got {stats:?}"); } @@ -2249,7 +2249,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { assert_eq!(stats.min().unwrap(), &-2); - assert_eq!(stats.max().unwrap(), &3); + assert_eq!(stats.max_opt().unwrap(), &3); } else { panic!("expecting Statistics::Int64, got {stats:?}"); } @@ -2271,7 +2271,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { assert_eq!(stats.min().unwrap(), &Int96::from(vec![0, 20, 30])); - assert_eq!(stats.max().unwrap(), &Int96::from(vec![3, 20, 10])); + assert_eq!(stats.max_opt().unwrap(), &Int96::from(vec![3, 20, 10])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); } @@ -2284,7 +2284,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &-2.0); - assert_eq!(stats.max().unwrap(), &3.0); + assert_eq!(stats.max_opt().unwrap(), &3.0); } else { panic!("expecting Statistics::Float, got {stats:?}"); } @@ -2297,7 +2297,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &-2.0); - assert_eq!(stats.max().unwrap(), &3.0); + assert_eq!(stats.max_opt().unwrap(), &3.0); } else { panic!("expecting Statistics::Double, got {stats:?}"); } @@ -2315,7 +2315,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!(stats.min().unwrap(), &ByteArray::from("aaw")); - assert_eq!(stats.max().unwrap(), &ByteArray::from("zz")); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from("zz")); } else { panic!("expecting Statistics::ByteArray, got {stats:?}"); } @@ -2335,7 +2335,7 @@ mod tests { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); assert_eq!(stats.min().unwrap(), &expected_min); let expected_max: FixedLenByteArray = ByteArray::from("zz ").into(); - assert_eq!(stats.max().unwrap(), &expected_max); + assert_eq!(stats.max_opt().unwrap(), &expected_max); } else { panic!("expecting Statistics::FixedLenByteArray, got {stats:?}"); } @@ -2357,7 +2357,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(-f16::from_f32(2.0))); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::from_f32(3.0))); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::from_f32(3.0)) + ); } #[test] @@ -2371,7 +2374,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::ONE + f16::ONE) + ); } #[test] @@ -2385,7 +2391,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::ONE + f16::ONE) + ); } #[test] @@ -2399,7 +2408,10 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::ONE + f16::ONE) + ); } #[test] @@ -2425,7 +2437,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2439,7 +2451,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2453,7 +2465,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::PI)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::PI)); } #[test] @@ -2467,7 +2479,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min().unwrap(), &ByteArray::from(-f16::PI)); - assert_eq!(stats.max().unwrap(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2477,7 +2489,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &1.0); - assert_eq!(stats.max().unwrap(), &2.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2490,7 +2502,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &1.0); - assert_eq!(stats.max().unwrap(), &2.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2512,8 +2524,8 @@ mod tests { if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &-0.0); assert!(stats.min().unwrap().is_sign_negative()); - assert_eq!(stats.max().unwrap(), &0.0); - assert!(stats.max().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2527,8 +2539,8 @@ mod tests { if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &-0.0); assert!(stats.min().unwrap().is_sign_negative()); - assert_eq!(stats.max().unwrap(), &0.0); - assert!(stats.max().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2542,7 +2554,7 @@ mod tests { if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &-0.0); assert!(stats.min().unwrap().is_sign_negative()); - assert_eq!(stats.max().unwrap(), &2.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2555,8 +2567,8 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min().unwrap(), &-2.0); - assert_eq!(stats.max().unwrap(), &0.0); - assert!(stats.max().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2569,7 +2581,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &1.0); - assert_eq!(stats.max().unwrap(), &2.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2582,7 +2594,7 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &1.0); - assert_eq!(stats.max().unwrap(), &2.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2604,8 +2616,8 @@ mod tests { if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &-0.0); assert!(stats.min().unwrap().is_sign_negative()); - assert_eq!(stats.max().unwrap(), &0.0); - assert!(stats.max().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2619,8 +2631,8 @@ mod tests { if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &-0.0); assert!(stats.min().unwrap().is_sign_negative()); - assert_eq!(stats.max().unwrap(), &0.0); - assert!(stats.max().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2634,7 +2646,7 @@ mod tests { if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &-0.0); assert!(stats.min().unwrap().is_sign_negative()); - assert_eq!(stats.max().unwrap(), &2.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2647,8 +2659,8 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min().unwrap(), &-2.0); - assert_eq!(stats.max().unwrap(), &0.0); - assert!(stats.max().unwrap().is_sign_positive()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2956,7 +2968,7 @@ mod tests { assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { let min_value = _stats.min().unwrap(); - let max_value = _stats.max().unwrap(); + let max_value = _stats.max_opt().unwrap(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); @@ -3009,7 +3021,7 @@ mod tests { assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let min_value = _stats.min().unwrap(); - let max_value = _stats.max().unwrap(); + let max_value = _stats.max_opt().unwrap(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 979904cf4be8..8d68e941e13b 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -183,7 +183,7 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { - self.min().map(T::heap_size).unwrap_or(0) + self.max().map(T::heap_size).unwrap_or(0) + self.min().map(T::heap_size).unwrap_or(0) + self.max_opt().map(T::heap_size).unwrap_or(0) } } impl HeapSize for bool { diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 935e9f0e91fc..12d588136d91 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -33,7 +33,7 @@ //! match stats { //! Statistics::Int32(ref typed) => { //! assert_eq!(typed.min(), Some(&1)); -//! assert_eq!(typed.max(), Some(&10)); +//! assert_eq!(typed.max_opt(), Some(&10)); //! } //! _ => {} //! } @@ -541,12 +541,12 @@ impl ValueStatistics { /// /// Panics if max value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. - pub(crate) fn max_unchecked(&self) -> &T { + pub fn max(&self) -> &T { self.max.as_ref().unwrap() } /// Returns max value of the statistics, if known. - pub fn max(&self) -> Option<&T> { + pub fn max_opt(&self) -> Option<&T> { self.max.as_ref() } @@ -563,7 +563,7 @@ impl ValueStatistics { /// Panics if max value is not set, use `has_min_max_set` method to check /// if values are set. pub fn max_bytes(&self) -> &[u8] { - self.max_unchecked().as_bytes() + self.max().as_bytes() } /// Whether or not min and max values are set. From e4db777c983e094f95cb10b0414ab57a4e58a378 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 17:36:28 -0500 Subject: [PATCH 13/23] rename new optional ValueStatistics::min to min_opt --- parquet/src/arrow/arrow_reader/statistics.rs | 34 ++------ parquet/src/arrow/arrow_writer/mod.rs | 8 +- parquet/src/column/writer/mod.rs | 85 ++++++++++---------- parquet/src/file/metadata/memory.rs | 3 +- parquet/src/file/statistics.rs | 8 +- 5 files changed, 61 insertions(+), 77 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index dd11c1088111..ea636ddbc8bb 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -131,7 +131,7 @@ macro_rules! make_stats_iterator { make_stats_iterator!( MinBooleanStatsIterator, - min_unchecked, + min, ParquetStatistics::Boolean, bool ); @@ -141,33 +141,13 @@ make_stats_iterator!( ParquetStatistics::Boolean, bool ); -make_stats_iterator!( - MinInt32StatsIterator, - min_unchecked, - ParquetStatistics::Int32, - i32 -); +make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32); make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); -make_stats_iterator!( - MinInt64StatsIterator, - min_unchecked, - ParquetStatistics::Int64, - i64 -); +make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64); make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); -make_stats_iterator!( - MinFloatStatsIterator, - min_unchecked, - ParquetStatistics::Float, - f32 -); +make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32); make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); -make_stats_iterator!( - MinDoubleStatsIterator, - min_unchecked, - ParquetStatistics::Double, - f64 -); +make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64); make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); make_stats_iterator!( MinByteArrayStatsIterator, @@ -264,7 +244,7 @@ macro_rules! make_decimal_stats_iterator { make_decimal_stats_iterator!( MinDecimal128StatsIterator, - min_unchecked, + min, min_bytes, i128, from_bytes_to_i128 @@ -278,7 +258,7 @@ make_decimal_stats_iterator!( ); make_decimal_stats_iterator!( MinDecimal256StatsIterator, - min_unchecked, + min, min_bytes, i256, from_bytes_to_i256 diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 91db98be70b8..bc64c0225c9c 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2544,7 +2544,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!( - *stats.min().unwrap() as u32, + *stats.min_opt().unwrap() as u32, *src_slice.iter().min().unwrap() ); assert_eq!( @@ -2591,7 +2591,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::Int64(stats) = stats { assert_eq!( - *stats.min().unwrap() as u64, + *stats.min_opt().unwrap() as u64, *src_slice.iter().min().unwrap() ); assert_eq!( @@ -3081,7 +3081,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min().unwrap(); + let min = byte_array_stats.min_opt().unwrap(); let max = byte_array_stats.max_opt().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); @@ -3153,7 +3153,7 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min().unwrap(); + let min = byte_array_stats.min_opt().unwrap(); let max = byte_array_stats.max_opt().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 1d8cdf8f9819..17c130b14d68 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -769,7 +769,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } Some(stat) => { // Check if min/max are still ascending/descending across pages - let new_min = stat.min_unchecked(); + let new_min = stat.min(); let new_max = stat.max(); if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { if self.data_page_boundary_ascending { @@ -1845,7 +1845,7 @@ mod tests { assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min().unwrap(), &1); + assert_eq!(stats.min_opt().unwrap(), &1); assert_eq!(stats.max_opt().unwrap(), &4); } else { panic!("expecting Statistics::Int32"); @@ -1889,7 +1889,7 @@ mod tests { assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!( - stats.min().unwrap(), + stats.min_opt().unwrap(), &ByteArray::from(vec![ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, 35u8, 231u8, 90u8, 0u8, 0u8, @@ -1925,7 +1925,7 @@ mod tests { if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min().unwrap(), &0,); + assert_eq!(stats.min_opt().unwrap(), &0,); assert_eq!(stats.max_opt().unwrap(), &5,); } else { panic!("expecting Statistics::Int32"); @@ -1974,7 +1974,7 @@ mod tests { assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min().unwrap(), &-17); + assert_eq!(stats.min_opt().unwrap(), &-17); assert_eq!(stats.max_opt().unwrap(), &9000); } else { panic!("expecting Statistics::Int32"); @@ -2222,7 +2222,7 @@ mod tests { // with the deprecated `min` and `max` statistics assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { - assert_eq!(stats.min().unwrap(), &false); + assert_eq!(stats.min_opt().unwrap(), &false); assert_eq!(stats.max_opt().unwrap(), &true); } else { panic!("expecting Statistics::Boolean, got {stats:?}"); @@ -2235,7 +2235,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min().unwrap(), &-2); + assert_eq!(stats.min_opt().unwrap(), &-2); assert_eq!(stats.max_opt().unwrap(), &3); } else { panic!("expecting Statistics::Int32, got {stats:?}"); @@ -2248,7 +2248,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { - assert_eq!(stats.min().unwrap(), &-2); + assert_eq!(stats.min_opt().unwrap(), &-2); assert_eq!(stats.max_opt().unwrap(), &3); } else { panic!("expecting Statistics::Int64, got {stats:?}"); @@ -2270,7 +2270,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { - assert_eq!(stats.min().unwrap(), &Int96::from(vec![0, 20, 30])); + assert_eq!(stats.min_opt().unwrap(), &Int96::from(vec![0, 20, 30])); assert_eq!(stats.max_opt().unwrap(), &Int96::from(vec![3, 20, 10])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); @@ -2283,7 +2283,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.min_opt().unwrap(), &-2.0); assert_eq!(stats.max_opt().unwrap(), &3.0); } else { panic!("expecting Statistics::Float, got {stats:?}"); @@ -2296,7 +2296,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.min_opt().unwrap(), &-2.0); assert_eq!(stats.max_opt().unwrap(), &3.0); } else { panic!("expecting Statistics::Double, got {stats:?}"); @@ -2314,7 +2314,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { - assert_eq!(stats.min().unwrap(), &ByteArray::from("aaw")); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from("aaw")); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from("zz")); } else { panic!("expecting Statistics::ByteArray, got {stats:?}"); @@ -2333,7 +2333,7 @@ mod tests { assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); - assert_eq!(stats.min().unwrap(), &expected_min); + assert_eq!(stats.min_opt().unwrap(), &expected_min); let expected_max: FixedLenByteArray = ByteArray::from("zz ").into(); assert_eq!(stats.max_opt().unwrap(), &expected_max); } else { @@ -2356,7 +2356,10 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(-f16::from_f32(2.0))); + assert_eq!( + stats.min_opt().unwrap(), + &ByteArray::from(-f16::from_f32(2.0)) + ); assert_eq!( stats.max_opt().unwrap(), &ByteArray::from(f16::from_f32(3.0)) @@ -2373,7 +2376,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( stats.max_opt().unwrap(), &ByteArray::from(f16::ONE + f16::ONE) @@ -2390,7 +2393,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( stats.max_opt().unwrap(), &ByteArray::from(f16::ONE + f16::ONE) @@ -2407,7 +2410,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( stats.max_opt().unwrap(), &ByteArray::from(f16::ONE + f16::ONE) @@ -2436,7 +2439,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } @@ -2450,7 +2453,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } @@ -2464,7 +2467,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::PI)); } @@ -2478,7 +2481,7 @@ mod tests { let stats = float16_statistics_roundtrip(&input); assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min().unwrap(), &ByteArray::from(-f16::PI)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(-f16::PI)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } @@ -2488,7 +2491,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); @@ -2501,7 +2504,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); @@ -2522,8 +2525,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &-0.0); - assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2537,8 +2540,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &-0.0); - assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2552,8 +2555,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &-0.0); - assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); @@ -2566,7 +2569,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.min_opt().unwrap(), &-2.0); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2580,7 +2583,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); @@ -2593,7 +2596,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &1.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); @@ -2614,8 +2617,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &-0.0); - assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2629,8 +2632,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &-0.0); - assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2644,8 +2647,8 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &-0.0); - assert!(stats.min().unwrap().is_sign_negative()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); @@ -2658,7 +2661,7 @@ mod tests { assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min().unwrap(), &-2.0); + assert_eq!(stats.min_opt().unwrap(), &-2.0); assert_eq!(stats.max_opt().unwrap(), &0.0); assert!(stats.max_opt().unwrap().is_sign_positive()); } else { @@ -2967,7 +2970,7 @@ mod tests { assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { - let min_value = _stats.min().unwrap(); + let min_value = _stats.min_opt().unwrap(); let max_value = _stats.max_opt().unwrap(); assert!(!_stats.min_is_exact()); @@ -3020,7 +3023,7 @@ mod tests { assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { - let min_value = _stats.min().unwrap(); + let min_value = _stats.min_opt().unwrap(); let max_value = _stats.max_opt().unwrap(); assert!(!_stats.min_is_exact()); diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 8d68e941e13b..ad452267901a 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -183,7 +183,8 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { - self.min().map(T::heap_size).unwrap_or(0) + self.max_opt().map(T::heap_size).unwrap_or(0) + self.min_opt().map(T::heap_size).unwrap_or(0) + + self.max_opt().map(T::heap_size).unwrap_or(0) } } impl HeapSize for bool { diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 12d588136d91..9d5cdd8420f6 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -32,7 +32,7 @@ //! //! match stats { //! Statistics::Int32(ref typed) => { -//! assert_eq!(typed.min(), Some(&1)); +//! assert_eq!(typed.min_opt(), Some(&1)); //! assert_eq!(typed.max_opt(), Some(&10)); //! } //! _ => {} @@ -528,12 +528,12 @@ impl ValueStatistics { /// /// Panics if min value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. - pub(crate) fn min_unchecked(&self) -> &T { + pub fn min(&self) -> &T { self.min.as_ref().unwrap() } /// Returns min value of the statistics, if known. - pub fn min(&self) -> Option<&T> { + pub fn min_opt(&self) -> Option<&T> { self.min.as_ref() } @@ -555,7 +555,7 @@ impl ValueStatistics { /// Panics if min value is not set, use `has_min_max_set` method to check /// if values are set. pub fn min_bytes(&self) -> &[u8] { - self.min_unchecked().as_bytes() + self.min().as_bytes() } /// Returns max value as bytes of the statistics. From bd83a38d8a2eb2de6e26193aa9f9367f39f741a5 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 18:14:38 -0500 Subject: [PATCH 14/23] add Statistics:{min,max}_bytes_opt This adds the API and migrates all of the test usage. The old APIs will be deprecated next. --- parquet/src/column/writer/mod.rs | 62 +++++++++++++++++---------- parquet/src/file/serialized_reader.rs | 5 ++- parquet/src/file/statistics.rs | 41 +++++++++++++----- 3 files changed, 75 insertions(+), 33 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 17c130b14d68..1f2a76a7a432 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -797,12 +797,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { null_page, self.truncate_min_value( self.props.column_index_truncate_length(), - stat.min_bytes(), + stat.min_bytes_opt().unwrap(), ) .0, self.truncate_max_value( self.props.column_index_truncate_length(), - stat.max_bytes(), + stat.max_bytes_opt().unwrap(), ) .0, self.page_metrics.num_page_nulls as i64, @@ -810,8 +810,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } else { self.column_index_builder.append( null_page, - stat.min_bytes().to_vec(), - stat.max_bytes().to_vec(), + stat.min_bytes_opt().unwrap().to_vec(), + stat.max_bytes_opt().unwrap().to_vec(), self.page_metrics.num_page_nulls as i64, ); } @@ -1076,11 +1076,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Statistics::ByteArray(stats) if stats.has_min_max_set() => { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), - stats.min_bytes(), + stats.min_bytes_opt().unwrap(), ); let (max, did_truncate_max) = self.truncate_max_value( self.props.statistics_truncate_length(), - stats.max_bytes(), + stats.max_bytes_opt().unwrap(), ); Statistics::ByteArray( ValueStatistics::new( @@ -1099,11 +1099,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), - stats.min_bytes(), + stats.min_bytes_opt().unwrap(), ); let (max, did_truncate_max) = self.truncate_max_value( self.props.statistics_truncate_length(), - stats.max_bytes(), + stats.max_bytes_opt().unwrap(), ); Statistics::FixedLenByteArray( ValueStatistics::new( @@ -2000,8 +2000,8 @@ mod tests { let r = writer.close().unwrap(); let stats = r.metadata.statistics().unwrap(); - assert_eq!(stats.min_bytes(), 1_i32.to_le_bytes()); - assert_eq!(stats.max_bytes(), 7_i32.to_le_bytes()); + assert_eq!(stats.min_bytes_opt().unwrap(), 1_i32.to_le_bytes()); + assert_eq!(stats.max_bytes_opt().unwrap(), 7_i32.to_le_bytes()); assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? assert!(stats.distinct_count().is_none()); @@ -2026,8 +2026,14 @@ mod tests { assert_eq!(pages[1].page_type(), PageType::DATA_PAGE); let page_statistics = pages[1].statistics().unwrap(); - assert_eq!(page_statistics.min_bytes(), 1_i32.to_le_bytes()); - assert_eq!(page_statistics.max_bytes(), 7_i32.to_le_bytes()); + assert_eq!( + page_statistics.min_bytes_opt().unwrap(), + 1_i32.to_le_bytes() + ); + assert_eq!( + page_statistics.max_bytes_opt().unwrap(), + 7_i32.to_le_bytes() + ); assert_eq!(page_statistics.null_count(), Some(0)); // TODO: None or 0? assert!(page_statistics.distinct_count().is_none()); } @@ -2727,8 +2733,14 @@ mod tests { // first page is [1,2,3,4] // second page is [-5,2,4,8] // note that we don't increment here, as this is a non BinaryArray type. - assert_eq!(stats.min_bytes(), column_index.min_values[1].as_slice()); - assert_eq!(stats.max_bytes(), column_index.max_values.get(1).unwrap()); + assert_eq!( + stats.min_bytes_opt(), + Some(column_index.min_values[1].as_slice()) + ); + assert_eq!( + stats.max_bytes_opt(), + column_index.max_values.get(1).map(Vec::as_slice) + ); } else { panic!("expecting Statistics::Int32"); } @@ -2783,8 +2795,14 @@ mod tests { let column_index_max_value = &column_index.max_values[0]; // Column index stats are truncated, while the column chunk's aren't. - assert_ne!(stats.min_bytes(), column_index_min_value.as_slice()); - assert_ne!(stats.max_bytes(), column_index_max_value.as_slice()); + assert_ne!( + stats.min_bytes_opt(), + Some(column_index_min_value.as_slice()) + ); + assert_ne!( + stats.max_bytes_opt(), + Some(column_index_max_value.as_slice()) + ); assert_eq!( column_index_min_value.len(), @@ -2855,8 +2873,8 @@ mod tests { assert_eq!("B".as_bytes(), column_index_min_value.as_slice()); assert_eq!("C".as_bytes(), column_index_max_value.as_slice()); - assert_ne!(column_index_min_value, stats.min_bytes()); - assert_ne!(column_index_max_value, stats.max_bytes()); + assert_ne!(column_index_min_value, stats.min_bytes_opt().unwrap()); + assert_ne!(column_index_max_value, stats.max_bytes_opt().unwrap()); } else { panic!("expecting Statistics::FixedLenByteArray"); } @@ -2892,8 +2910,8 @@ mod tests { let stats = r.metadata.statistics().unwrap(); assert!(stats.has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { - let stats_min_bytes = stats.min_bytes(); - let stats_max_bytes = stats.max_bytes(); + let stats_min_bytes = stats.min_bytes_opt().unwrap(); + let stats_max_bytes = stats.max_bytes_opt().unwrap(); assert_eq!(expected_value, stats_min_bytes); assert_eq!(expected_value, stats_max_bytes); } else { @@ -2932,8 +2950,8 @@ mod tests { let stats = r.metadata.statistics().unwrap(); assert!(stats.has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { - let stats_min_bytes = stats.min_bytes(); - let stats_max_bytes = stats.max_bytes(); + let stats_min_bytes = stats.min_bytes_opt().unwrap(); + let stats_max_bytes = stats.max_bytes_opt().unwrap(); assert_eq!(expected_value, stats_min_bytes); assert_eq!(expected_value, stats_max_bytes); } else { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 07e80f7f6998..0a3e51931867 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1544,7 +1544,10 @@ mod tests { fn get_row_group_min_max_bytes(r: &RowGroupMetaData, col_num: usize) -> (&[u8], &[u8]) { let statistics = r.column(col_num).statistics().unwrap(); - (statistics.min_bytes(), statistics.max_bytes()) + ( + statistics.min_bytes_opt().unwrap_or_default(), + statistics.max_bytes_opt().unwrap_or_default(), + ) } #[test] diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 9d5cdd8420f6..46ae5625d457 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -256,8 +256,8 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { // Get min/max if set. let (min, max, min_exact, max_exact) = if stats.has_min_max_set() { ( - Some(stats.min_bytes().to_vec()), - Some(stats.max_bytes().to_vec()), + Some(stats.min_bytes_opt().unwrap().to_vec()), + Some(stats.max_bytes_opt().unwrap().to_vec()), Some(stats.min_is_exact()), Some(stats.max_is_exact()), ) @@ -400,16 +400,26 @@ impl Statistics { statistics_enum_func![self, max_is_exact] } + /// Returns slice of bytes that represent min value, if min value is known. + pub fn min_bytes_opt(&self) -> Option<&[u8]> { + statistics_enum_func![self, min_bytes_opt] + } + /// Returns slice of bytes that represent min value. /// Panics if min value is not set. pub fn min_bytes(&self) -> &[u8] { - statistics_enum_func![self, min_bytes] + self.min_bytes_opt().unwrap() + } + + /// Returns slice of bytes that represent max value, if max value is known. + pub fn max_bytes_opt(&self) -> Option<&[u8]> { + statistics_enum_func![self, max_bytes_opt] } /// Returns slice of bytes that represent max value. /// Panics if max value is not set. pub fn max_bytes(&self) -> &[u8] { - statistics_enum_func![self, max_bytes] + self.max_bytes_opt().unwrap() } /// Returns physical type associated with statistics. @@ -550,20 +560,31 @@ impl ValueStatistics { self.max.as_ref() } + /// Returns min value as bytes of the statistics, if min value is known. + pub fn min_bytes_opt(&self) -> Option<&[u8]> { + self.min_opt().map(AsBytes::as_bytes) + } + /// Returns min value as bytes of the statistics. /// /// Panics if min value is not set, use `has_min_max_set` method to check /// if values are set. pub fn min_bytes(&self) -> &[u8] { - self.min().as_bytes() + self.min_bytes_opt().unwrap() + } + + /// Returns max value as bytes of the statistics, if max value is known. + pub fn max_bytes_opt(&self) -> Option<&[u8]> { + self.max_opt().map(AsBytes::as_bytes) } /// Returns max value as bytes of the statistics. /// /// Panics if max value is not set, use `has_min_max_set` method to check /// if values are set. + // TODO: deprecate pub fn max_bytes(&self) -> &[u8] { - self.max().as_bytes() + self.max_bytes_opt().unwrap() } /// Whether or not min and max values are set. @@ -669,8 +690,8 @@ mod tests { fn test_statistics_min_max_bytes() { let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false); assert!(stats.has_min_max_set()); - assert_eq!(stats.min_bytes(), (-123).as_bytes()); - assert_eq!(stats.max_bytes(), 234.as_bytes()); + assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes())); + assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes())); let stats = Statistics::byte_array( Some(ByteArray::from(vec![1, 2, 3])), @@ -680,8 +701,8 @@ mod tests { true, ); assert!(stats.has_min_max_set()); - assert_eq!(stats.min_bytes(), &[1, 2, 3]); - assert_eq!(stats.max_bytes(), &[3, 4, 5]); + assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]); + assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]); } #[test] From 4e2909b3eca2109797aebf9c430efb64eb1666d7 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 20:36:40 -0500 Subject: [PATCH 15/23] update make_stats_iterator macro to use *_opt methods --- parquet/src/arrow/arrow_reader/statistics.rs | 98 ++++++++++++++------ 1 file changed, 71 insertions(+), 27 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index ea636ddbc8bb..2f3912be940b 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -116,7 +116,7 @@ macro_rules! make_stats_iterator { let next = self.iter.next(); next.map(|x| { x.and_then(|stats| match stats { - $parquet_statistics_type(s) if stats.has_min_max_set() => Some(s.$func()), + $parquet_statistics_type(s) => s.$func(), _ => None, }) }) @@ -131,45 +131,85 @@ macro_rules! make_stats_iterator { make_stats_iterator!( MinBooleanStatsIterator, - min, + min_opt, ParquetStatistics::Boolean, bool ); make_stats_iterator!( MaxBooleanStatsIterator, - max, + max_opt, ParquetStatistics::Boolean, bool ); -make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32); -make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); -make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64); -make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); -make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32); -make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); -make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64); -make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); +make_stats_iterator!( + MinInt32StatsIterator, + min_opt, + ParquetStatistics::Int32, + i32 +); +make_stats_iterator!( + MaxInt32StatsIterator, + max_opt, + ParquetStatistics::Int32, + i32 +); +make_stats_iterator!( + MinInt64StatsIterator, + min_opt, + ParquetStatistics::Int64, + i64 +); +make_stats_iterator!( + MaxInt64StatsIterator, + max_opt, + ParquetStatistics::Int64, + i64 +); +make_stats_iterator!( + MinFloatStatsIterator, + min_opt, + ParquetStatistics::Float, + f32 +); +make_stats_iterator!( + MaxFloatStatsIterator, + max_opt, + ParquetStatistics::Float, + f32 +); +make_stats_iterator!( + MinDoubleStatsIterator, + min_opt, + ParquetStatistics::Double, + f64 +); +make_stats_iterator!( + MaxDoubleStatsIterator, + max_opt, + ParquetStatistics::Double, + f64 +); make_stats_iterator!( MinByteArrayStatsIterator, - min_bytes, + min_bytes_opt, ParquetStatistics::ByteArray, [u8] ); make_stats_iterator!( MaxByteArrayStatsIterator, - max_bytes, + max_bytes_opt, ParquetStatistics::ByteArray, [u8] ); make_stats_iterator!( MinFixedLenByteArrayStatsIterator, - min_bytes, + min_bytes_opt, ParquetStatistics::FixedLenByteArray, [u8] ); make_stats_iterator!( MaxFixedLenByteArrayStatsIterator, - max_bytes, + max_bytes_opt, ParquetStatistics::FixedLenByteArray, [u8] ); @@ -223,11 +263,15 @@ macro_rules! make_decimal_stats_iterator { return None; } match stats { - ParquetStatistics::Int32(s) => Some($stat_value_type::from(*s.$func())), - ParquetStatistics::Int64(s) => Some($stat_value_type::from(*s.$func())), - ParquetStatistics::ByteArray(s) => Some($convert_func(s.$bytes_func())), + ParquetStatistics::Int32(s) => { + s.$func().map(|x| $stat_value_type::from(*x)) + } + ParquetStatistics::Int64(s) => { + s.$func().map(|x| $stat_value_type::from(*x)) + } + ParquetStatistics::ByteArray(s) => s.$bytes_func().map($convert_func), ParquetStatistics::FixedLenByteArray(s) => { - Some($convert_func(s.$bytes_func())) + s.$bytes_func().map($convert_func) } _ => None, } @@ -244,29 +288,29 @@ macro_rules! make_decimal_stats_iterator { make_decimal_stats_iterator!( MinDecimal128StatsIterator, - min, - min_bytes, + min_opt, + min_bytes_opt, i128, from_bytes_to_i128 ); make_decimal_stats_iterator!( MaxDecimal128StatsIterator, - max, - max_bytes, + max_opt, + max_bytes_opt, i128, from_bytes_to_i128 ); make_decimal_stats_iterator!( MinDecimal256StatsIterator, - min, - min_bytes, + min_opt, + min_bytes_opt, i256, from_bytes_to_i256 ); make_decimal_stats_iterator!( MaxDecimal256StatsIterator, - max, - max_bytes, + max_opt, + max_bytes_opt, i256, from_bytes_to_i256 ); From 2f3e21eeb092d34935a6d54b13e25cc3e44a0e49 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 20:47:16 -0500 Subject: [PATCH 16/23] deprecate non *_opt Statistics and ValueStatistics methods --- parquet/src/column/writer/mod.rs | 4 ++-- parquet/src/file/statistics.rs | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 1f2a76a7a432..d1ced5600d24 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -769,8 +769,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } Some(stat) => { // Check if min/max are still ascending/descending across pages - let new_min = stat.min(); - let new_max = stat.max(); + let new_min = stat.min_opt().unwrap(); + let new_max = stat.max_opt().unwrap(); if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { if self.data_page_boundary_ascending { // If last min/max are greater than new min/max then not ascending anymore diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 46ae5625d457..5f514a4199ca 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -254,17 +254,12 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { }; // Get min/max if set. - let (min, max, min_exact, max_exact) = if stats.has_min_max_set() { - ( - Some(stats.min_bytes_opt().unwrap().to_vec()), - Some(stats.max_bytes_opt().unwrap().to_vec()), - Some(stats.min_is_exact()), - Some(stats.max_is_exact()), - ) - } else { - (None, None, None, None) - }; - + let (min, max, min_exact, max_exact) = ( + stats.min_bytes_opt().map(|x| x.to_vec()), + stats.max_bytes_opt().map(|x| x.to_vec()), + Some(stats.min_is_exact()), + Some(stats.max_is_exact()), + ); if stats.is_min_max_backwards_compatible() { // Copy to deprecated min, max values for compatibility with older readers thrift_stats.min.clone_from(&min); @@ -386,7 +381,7 @@ impl Statistics { /// Returns `true` if min value and max value are set. /// Normally both min/max values will be set to `Some(value)` or `None`. - pub(crate) fn has_min_max_set(&self) -> bool { + pub fn has_min_max_set(&self) -> bool { statistics_enum_func![self, has_min_max_set] } @@ -407,6 +402,7 @@ impl Statistics { /// Returns slice of bytes that represent min value. /// Panics if min value is not set. + #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")] pub fn min_bytes(&self) -> &[u8] { self.min_bytes_opt().unwrap() } @@ -418,6 +414,7 @@ impl Statistics { /// Returns slice of bytes that represent max value. /// Panics if max value is not set. + #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")] pub fn max_bytes(&self) -> &[u8] { self.max_bytes_opt().unwrap() } @@ -538,6 +535,7 @@ impl ValueStatistics { /// /// Panics if min value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. + #[deprecated(since = "53.0.0", note = "Use `min_opt` instead")] pub fn min(&self) -> &T { self.min.as_ref().unwrap() } @@ -551,6 +549,7 @@ impl ValueStatistics { /// /// Panics if max value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. + #[deprecated(since = "53.0.0", note = "Use `max_opt` instead")] pub fn max(&self) -> &T { self.max.as_ref().unwrap() } @@ -569,6 +568,7 @@ impl ValueStatistics { /// /// Panics if min value is not set, use `has_min_max_set` method to check /// if values are set. + #[deprecated(since = "53.0.0", note = "Use `min_bytes_opt` instead")] pub fn min_bytes(&self) -> &[u8] { self.min_bytes_opt().unwrap() } @@ -582,14 +582,14 @@ impl ValueStatistics { /// /// Panics if max value is not set, use `has_min_max_set` method to check /// if values are set. - // TODO: deprecate + #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")] pub fn max_bytes(&self) -> &[u8] { self.max_bytes_opt().unwrap() } /// Whether or not min and max values are set. /// Normally both min/max values will be set to `Some(value)` or `None`. - pub(crate) fn has_min_max_set(&self) -> bool { + pub fn has_min_max_set(&self) -> bool { self.min.is_some() && self.max.is_some() } From 050ba89c2acbe9df27d4f58d0773a2bb8a47894f Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 20:52:48 -0500 Subject: [PATCH 17/23] remove stale TODO comments --- parquet/src/column/writer/mod.rs | 8 ++++---- parquet/src/file/statistics.rs | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index d1ced5600d24..d9264c51ad18 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1842,7 +1842,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1); @@ -1971,7 +1971,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? + assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-17); @@ -2002,7 +2002,7 @@ mod tests { let stats = r.metadata.statistics().unwrap(); assert_eq!(stats.min_bytes_opt().unwrap(), 1_i32.to_le_bytes()); assert_eq!(stats.max_bytes_opt().unwrap(), 7_i32.to_le_bytes()); - assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0? + assert_eq!(stats.null_count(), Some(0)); assert!(stats.distinct_count().is_none()); drop(write); @@ -2034,7 +2034,7 @@ mod tests { page_statistics.max_bytes_opt().unwrap(), 7_i32.to_le_bytes() ); - assert_eq!(page_statistics.null_count(), Some(0)); // TODO: None or 0? + assert_eq!(page_statistics.null_count(), Some(0)); assert!(page_statistics.distinct_count().is_none()); } diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 5f514a4199ca..ec13f213c449 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -622,7 +622,6 @@ impl ValueStatistics { /// using signed comparison. This resulted in an undefined ordering for unsigned /// quantities, such as booleans and unsigned integers. /// - // TODO: I don't see `min_value` or `max_value` in this struct. Should these be something else? /// These fields were therefore deprecated in favour of `min_value` and `max_value`, /// which have a type-defined sort order. /// From fc0ed2027e6db87d7850b491f31b6e07c475d98e Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 21:46:37 -0500 Subject: [PATCH 18/23] remove has_min_max_set check from make_decimal_stats_iterator The check is unnecessary now that the stats funcs return Option when unset. --- parquet/src/arrow/arrow_reader/statistics.rs | 25 ++++++++------------ 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 2f3912be940b..597af61a6263 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -258,23 +258,18 @@ macro_rules! make_decimal_stats_iterator { fn next(&mut self) -> Option { let next = self.iter.next(); next.map(|x| { - x.and_then(|stats| { - if !stats.has_min_max_set() { - return None; + x.and_then(|stats| match stats { + ParquetStatistics::Int32(s) => { + s.$func().map(|x| $stat_value_type::from(*x)) } - match stats { - ParquetStatistics::Int32(s) => { - s.$func().map(|x| $stat_value_type::from(*x)) - } - ParquetStatistics::Int64(s) => { - s.$func().map(|x| $stat_value_type::from(*x)) - } - ParquetStatistics::ByteArray(s) => s.$bytes_func().map($convert_func), - ParquetStatistics::FixedLenByteArray(s) => { - s.$bytes_func().map($convert_func) - } - _ => None, + ParquetStatistics::Int64(s) => { + s.$func().map(|x| $stat_value_type::from(*x)) + } + ParquetStatistics::ByteArray(s) => s.$bytes_func().map($convert_func), + ParquetStatistics::FixedLenByteArray(s) => { + s.$bytes_func().map($convert_func) } + _ => None, }) }) } From 2366942f9fe0486ca08585204877ac29ceefdf5e Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 21:49:05 -0500 Subject: [PATCH 19/23] deprecate has_min_max_set An internal version was also created because it is used so extensively in testing. --- parquet/src/arrow/arrow_writer/mod.rs | 4 +- parquet/src/column/writer/mod.rs | 90 +++++++++++++-------------- parquet/src/file/statistics.rs | 25 ++++++-- 3 files changed, 68 insertions(+), 51 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index bc64c0225c9c..aebc3015d7b2 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2541,7 +2541,7 @@ mod tests { row_offset += column.num_values() as usize; let stats = column.statistics().unwrap(); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!( *stats.min_opt().unwrap() as u32, @@ -2588,7 +2588,7 @@ mod tests { row_offset += column.num_values() as usize; let stats = column.statistics().unwrap(); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::Int64(stats) = stats { assert_eq!( *stats.min_opt().unwrap() as u64, diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index d9264c51ad18..d5fb394048c4 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1073,7 +1073,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .into(); let statistics = match statistics { - Statistics::ByteArray(stats) if stats.has_min_max_set() => { + Statistics::ByteArray(stats) if stats._internal_has_min_max_set() => { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), stats.min_bytes_opt().unwrap(), @@ -1095,7 +1095,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ) } Statistics::FixedLenByteArray(stats) - if (stats.has_min_max_set() && self.can_truncate_value()) => + if (stats._internal_has_min_max_set() && self.can_truncate_value()) => { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), @@ -1841,7 +1841,7 @@ mod tests { assert_eq!(metadata.data_page_offset(), 0); assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { @@ -1886,7 +1886,7 @@ mod tests { .unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!( stats.min_opt().unwrap(), @@ -1923,7 +1923,7 @@ mod tests { writer.write_batch(&[0, 1, 2, 3, 4, 5], None, None).unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &0,); assert_eq!(stats.max_opt().unwrap(), &5,); @@ -1970,7 +1970,7 @@ mod tests { assert_eq!(metadata.data_page_offset(), 0); assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { @@ -2223,7 +2223,7 @@ mod tests { #[test] fn test_bool_statistics() { let stats = statistics_roundtrip::(&[true, false, false, true]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); // Booleans have an unsigned sort order and so are not compatible // with the deprecated `min` and `max` statistics assert!(!stats.is_min_max_backwards_compatible()); @@ -2238,7 +2238,7 @@ mod tests { #[test] fn test_int32_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2); @@ -2251,7 +2251,7 @@ mod tests { #[test] fn test_int64_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2); @@ -2273,7 +2273,7 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &Int96::from(vec![0, 20, 30])); @@ -2286,7 +2286,7 @@ mod tests { #[test] fn test_float_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2299,7 +2299,7 @@ mod tests { #[test] fn test_double_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2318,7 +2318,7 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(!stats.is_min_max_backwards_compatible()); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &ByteArray::from("aaw")); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from("zz")); @@ -2335,7 +2335,7 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); @@ -2360,7 +2360,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!( stats.min_opt().unwrap(), @@ -2380,7 +2380,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( @@ -2397,7 +2397,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( @@ -2414,7 +2414,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( @@ -2431,7 +2431,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(!stats.has_min_max_set()); + assert!(!stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); } @@ -2443,7 +2443,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); @@ -2457,7 +2457,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); @@ -2471,7 +2471,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::PI)); @@ -2485,7 +2485,7 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(-f16::PI)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); @@ -2494,7 +2494,7 @@ mod tests { #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2507,7 +2507,7 @@ mod tests { #[test] fn test_float_statistics_nan_start() { let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2520,7 +2520,7 @@ mod tests { #[test] fn test_float_statistics_nan_only() { let stats = statistics_roundtrip::(&[f32::NAN, f32::NAN]); - assert!(!stats.has_min_max_set()); + assert!(!stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert!(matches!(stats, Statistics::Float(_))); } @@ -2528,7 +2528,7 @@ mod tests { #[test] fn test_float_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2543,7 +2543,7 @@ mod tests { #[test] fn test_float_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2558,7 +2558,7 @@ mod tests { #[test] fn test_float_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f32::NAN, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2572,7 +2572,7 @@ mod tests { #[test] fn test_float_statistics_neg_zero_max() { let stats = statistics_roundtrip::(&[-0.0, -1.0, f32::NAN, -2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2586,7 +2586,7 @@ mod tests { #[test] fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2599,7 +2599,7 @@ mod tests { #[test] fn test_double_statistics_nan_start() { let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2612,7 +2612,7 @@ mod tests { #[test] fn test_double_statistics_nan_only() { let stats = statistics_roundtrip::(&[f64::NAN, f64::NAN]); - assert!(!stats.has_min_max_set()); + assert!(!stats._internal_has_min_max_set()); assert!(matches!(stats, Statistics::Double(_))); assert!(stats.is_min_max_backwards_compatible()); } @@ -2620,7 +2620,7 @@ mod tests { #[test] fn test_double_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2635,7 +2635,7 @@ mod tests { #[test] fn test_double_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2650,7 +2650,7 @@ mod tests { #[test] fn test_double_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f64::NAN, 2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2664,7 +2664,7 @@ mod tests { #[test] fn test_double_statistics_neg_zero_max() { let stats = statistics_roundtrip::(&[-0.0, -1.0, f64::NAN, -2.0]); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2726,7 +2726,7 @@ mod tests { } if let Some(stats) = r.metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { @@ -2787,7 +2787,7 @@ mod tests { assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); if let Some(stats) = r.metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(stats) = stats { @@ -2860,7 +2860,7 @@ mod tests { assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); if let Some(stats) = r.metadata.statistics() { - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { @@ -2908,7 +2908,7 @@ mod tests { // ensure bytes weren't truncated for statistics let stats = r.metadata.statistics().unwrap(); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { let stats_min_bytes = stats.min_bytes_opt().unwrap(); let stats_max_bytes = stats.max_bytes_opt().unwrap(); @@ -2948,7 +2948,7 @@ mod tests { // ensure bytes weren't truncated for statistics let stats = r.metadata.statistics().unwrap(); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { let stats_min_bytes = stats.min_bytes_opt().unwrap(); let stats_max_bytes = stats.max_bytes_opt().unwrap(); @@ -2984,7 +2984,7 @@ mod tests { assert_eq!(1, r.rows_written); let stats = r.metadata.statistics().expect("statistics"); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { @@ -3037,7 +3037,7 @@ mod tests { assert_eq!(1, r.rows_written); let stats = r.metadata.statistics().expect("statistics"); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { @@ -3359,7 +3359,7 @@ mod tests { } else { panic!("metadata missing statistics"); }; - assert!(!stats.has_min_max_set()); + assert!(!stats._internal_has_min_max_set()); } fn write_multiple_pages( diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index ec13f213c449..ac2d69549493 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -379,10 +379,20 @@ impl Statistics { statistics_enum_func![self, null_count] } - /// Returns `true` if min value and max value are set. + /// Whether or not min and max values are set. /// Normally both min/max values will be set to `Some(value)` or `None`. + #[deprecated( + since = "53.0.0", + note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead" + )] pub fn has_min_max_set(&self) -> bool { - statistics_enum_func![self, has_min_max_set] + self._internal_has_min_max_set() + } + + /// Returns `true` if min value and max value are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + pub(crate) fn _internal_has_min_max_set(&self) -> bool { + statistics_enum_func![self, _internal_has_min_max_set] } /// Returns `true` if the min value is set, and is an exact min value. @@ -589,7 +599,14 @@ impl ValueStatistics { /// Whether or not min and max values are set. /// Normally both min/max values will be set to `Some(value)` or `None`. + #[deprecated(since = "53.0.0", note = "Use `min_opt` and `max_opt` methods instead")] pub fn has_min_max_set(&self) -> bool { + self._internal_has_min_max_set() + } + + /// Whether or not min and max values are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + pub(crate) fn _internal_has_min_max_set(&self) -> bool { self.min.is_some() && self.max.is_some() } @@ -688,7 +705,7 @@ mod tests { #[test] fn test_statistics_min_max_bytes() { let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes())); assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes())); @@ -699,7 +716,7 @@ mod tests { Some(1), true, ); - assert!(stats.has_min_max_set()); + assert!(stats._internal_has_min_max_set()); assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]); assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]); } From 82b40b898ca84d575fa693c4f13a996f8e27bacc Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Tue, 13 Aug 2024 22:08:43 -0500 Subject: [PATCH 20/23] switch to null_count_opt and reintroduce deprecated null_count and has_nulls --- parquet/src/arrow/arrow_reader/statistics.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/column/writer/mod.rs | 22 ++++++------- parquet/src/file/statistics.rs | 33 +++++++++++++++++--- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 597af61a6263..602a9ad5e506 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1386,7 +1386,7 @@ impl<'a> StatisticsConverter<'a> { let null_counts = metadatas .into_iter() .map(|x| x.column(parquet_index).statistics()) - .map(|s| s.and_then(|s| s.null_count())); + .map(|s| s.and_then(|s| s.null_count_opt())); Ok(UInt64Array::from_iter(null_counts)) } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index aebc3015d7b2..e5742338cf9d 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2620,7 +2620,7 @@ mod tests { assert_eq!(row_group.num_columns(), 1); let column = row_group.column(0); let stats = column.statistics().unwrap(); - assert_eq!(stats.null_count(), Some(2)); + assert_eq!(stats.null_count_opt(), Some(2)); } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index d5fb394048c4..f8260bdc42da 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1087,7 +1087,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(min.into()), Some(max.into()), stats.distinct_count(), - stats.null_count(), + stats.null_count_opt(), backwards_compatible_min_max, ) .with_max_is_exact(!did_truncate_max) @@ -1110,7 +1110,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(min.into()), Some(max.into()), stats.distinct_count(), - stats.null_count(), + stats.null_count_opt(), backwards_compatible_min_max, ) .with_max_is_exact(!did_truncate_max) @@ -1842,7 +1842,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1); @@ -1971,7 +1971,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-17); @@ -2002,7 +2002,7 @@ mod tests { let stats = r.metadata.statistics().unwrap(); assert_eq!(stats.min_bytes_opt().unwrap(), 1_i32.to_le_bytes()); assert_eq!(stats.max_bytes_opt().unwrap(), 7_i32.to_le_bytes()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert!(stats.distinct_count().is_none()); drop(write); @@ -2034,7 +2034,7 @@ mod tests { page_statistics.max_bytes_opt().unwrap(), 7_i32.to_le_bytes() ); - assert_eq!(page_statistics.null_count(), Some(0)); + assert_eq!(page_statistics.null_count_opt(), Some(0)); assert!(page_statistics.distinct_count().is_none()); } @@ -2727,7 +2727,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { // first page is [1,2,3,4] @@ -2788,7 +2788,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(stats) = stats { let column_index_min_value = &column_index.min_values[0]; @@ -2861,7 +2861,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let column_index_min_value = &column_index.min_values[0]; @@ -2985,7 +2985,7 @@ mod tests { let stats = r.metadata.statistics().expect("statistics"); assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { let min_value = _stats.min_opt().unwrap(); @@ -3038,7 +3038,7 @@ mod tests { let stats = r.metadata.statistics().expect("statistics"); assert!(stats._internal_has_min_max_set()); - assert_eq!(stats.null_count(), Some(0)); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let min_value = _stats.min_opt().unwrap(); diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index ac2d69549493..612a5c174a53 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -25,7 +25,7 @@ //! use parquet::file::statistics::Statistics; //! //! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true); -//! assert_eq!(stats.null_count(), Some(3)); +//! assert_eq!(stats.null_count_opt(), Some(3)); //! assert!(stats.is_min_max_deprecated()); //! assert!(stats.min_is_exact()); //! assert!(stats.max_is_exact()); @@ -245,7 +245,7 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { let mut thrift_stats = TStatistics { max: None, min: None, - null_count: stats.null_count().map(|value| value as i64), + null_count: stats.null_count_opt().map(|value| value as i64), distinct_count: stats.distinct_count().map(|value| value as i64), max_value: None, min_value: None, @@ -373,10 +373,25 @@ impl Statistics { statistics_enum_func![self, distinct_count] } + /// Returns number of null values for the column. + /// Note that this includes all nulls when column is part of the complex type. + #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] + pub fn null_count(&self) -> u64 { + // 0 to remain consistent behavior prior to `null_count_opt` + self.null_count_opt().unwrap_or(0) + } + + /// Returns `true` if statistics collected any null values, `false` otherwise. + #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] + #[allow(deprecated)] + pub fn has_nulls(&self) -> bool { + self.null_count() > 0 + } + /// Returns number of null values for the column, if known. /// Note that this includes all nulls when column is part of the complex type. - pub fn null_count(&self) -> Option { - statistics_enum_func![self, null_count] + pub fn null_count_opt(&self) -> Option { + statistics_enum_func![self, null_count_opt] } /// Whether or not min and max values are set. @@ -625,8 +640,16 @@ impl ValueStatistics { self.distinct_count } + /// Returns number of null values for the column. + /// Note that this includes all nulls when column is part of the complex type. + #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] + pub fn null_count(&self) -> u64 { + // 0 to remain consistent behavior prior to `null_count_opt` + self.null_count_opt().unwrap_or(0) + } + /// Returns null count. - pub fn null_count(&self) -> Option { + pub fn null_count_opt(&self) -> Option { self.null_count } From 3efc631776e81772efb20482052f7050ca92dbb1 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 14 Aug 2024 09:43:24 -0500 Subject: [PATCH 21/23] remove redundant test assertions of stats._internal_has_min_max_set This removes the assertion from any test that subsequently unwraps both min_opt and max_opt. --- parquet/src/arrow/arrow_writer/mod.rs | 2 -- parquet/src/column/writer/mod.rs | 39 --------------------------- parquet/src/file/statistics.rs | 2 -- 3 files changed, 43 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index e5742338cf9d..5c5f28ac0f72 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2541,7 +2541,6 @@ mod tests { row_offset += column.num_values() as usize; let stats = column.statistics().unwrap(); - assert!(stats._internal_has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!( *stats.min_opt().unwrap() as u32, @@ -2588,7 +2587,6 @@ mod tests { row_offset += column.num_values() as usize; let stats = column.statistics().unwrap(); - assert!(stats._internal_has_min_max_set()); if let Statistics::Int64(stats) = stats { assert_eq!( *stats.min_opt().unwrap() as u64, diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index f8260bdc42da..c663fe6e93ac 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1841,7 +1841,6 @@ mod tests { assert_eq!(metadata.data_page_offset(), 0); assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { @@ -1886,7 +1885,6 @@ mod tests { .unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { - assert!(stats._internal_has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!( stats.min_opt().unwrap(), @@ -1923,7 +1921,6 @@ mod tests { writer.write_batch(&[0, 1, 2, 3, 4, 5], None, None).unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { - assert!(stats._internal_has_min_max_set()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &0,); assert_eq!(stats.max_opt().unwrap(), &5,); @@ -1970,7 +1967,6 @@ mod tests { assert_eq!(metadata.data_page_offset(), 0); assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { @@ -2223,7 +2219,6 @@ mod tests { #[test] fn test_bool_statistics() { let stats = statistics_roundtrip::(&[true, false, false, true]); - assert!(stats._internal_has_min_max_set()); // Booleans have an unsigned sort order and so are not compatible // with the deprecated `min` and `max` statistics assert!(!stats.is_min_max_backwards_compatible()); @@ -2238,7 +2233,6 @@ mod tests { #[test] fn test_int32_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2); @@ -2251,7 +2245,6 @@ mod tests { #[test] fn test_int64_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2); @@ -2273,7 +2266,6 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); - assert!(stats._internal_has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &Int96::from(vec![0, 20, 30])); @@ -2286,7 +2278,6 @@ mod tests { #[test] fn test_float_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2299,7 +2290,6 @@ mod tests { #[test] fn test_double_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2318,7 +2308,6 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(!stats.is_min_max_backwards_compatible()); - assert!(stats._internal_has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &ByteArray::from("aaw")); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from("zz")); @@ -2335,7 +2324,6 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); - assert!(stats._internal_has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); @@ -2360,7 +2348,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!( stats.min_opt().unwrap(), @@ -2380,7 +2367,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( @@ -2397,7 +2383,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( @@ -2414,7 +2399,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); assert_eq!( @@ -2443,7 +2427,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); @@ -2457,7 +2440,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); @@ -2471,7 +2453,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::PI)); @@ -2485,7 +2466,6 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(-f16::PI)); assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); @@ -2494,7 +2474,6 @@ mod tests { #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2507,7 +2486,6 @@ mod tests { #[test] fn test_float_statistics_nan_start() { let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2528,7 +2506,6 @@ mod tests { #[test] fn test_float_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2543,7 +2520,6 @@ mod tests { #[test] fn test_float_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2558,7 +2534,6 @@ mod tests { #[test] fn test_float_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f32::NAN, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2572,7 +2547,6 @@ mod tests { #[test] fn test_float_statistics_neg_zero_max() { let stats = statistics_roundtrip::(&[-0.0, -1.0, f32::NAN, -2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2586,7 +2560,6 @@ mod tests { #[test] fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2599,7 +2572,6 @@ mod tests { #[test] fn test_double_statistics_nan_start() { let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1.0); @@ -2620,7 +2592,6 @@ mod tests { #[test] fn test_double_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2635,7 +2606,6 @@ mod tests { #[test] fn test_double_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2650,7 +2620,6 @@ mod tests { #[test] fn test_double_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f64::NAN, 2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-0.0); @@ -2664,7 +2633,6 @@ mod tests { #[test] fn test_double_statistics_neg_zero_max() { let stats = statistics_roundtrip::(&[-0.0, -1.0, f64::NAN, -2.0]); - assert!(stats._internal_has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-2.0); @@ -2726,7 +2694,6 @@ mod tests { } if let Some(stats) = r.metadata.statistics() { - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { @@ -2787,7 +2754,6 @@ mod tests { assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); if let Some(stats) = r.metadata.statistics() { - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(stats) = stats { @@ -2860,7 +2826,6 @@ mod tests { assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); if let Some(stats) = r.metadata.statistics() { - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { @@ -2908,7 +2873,6 @@ mod tests { // ensure bytes weren't truncated for statistics let stats = r.metadata.statistics().unwrap(); - assert!(stats._internal_has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { let stats_min_bytes = stats.min_bytes_opt().unwrap(); let stats_max_bytes = stats.max_bytes_opt().unwrap(); @@ -2948,7 +2912,6 @@ mod tests { // ensure bytes weren't truncated for statistics let stats = r.metadata.statistics().unwrap(); - assert!(stats._internal_has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { let stats_min_bytes = stats.min_bytes_opt().unwrap(); let stats_max_bytes = stats.max_bytes_opt().unwrap(); @@ -2984,7 +2947,6 @@ mod tests { assert_eq!(1, r.rows_written); let stats = r.metadata.statistics().expect("statistics"); - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { @@ -3037,7 +2999,6 @@ mod tests { assert_eq!(1, r.rows_written); let stats = r.metadata.statistics().expect("statistics"); - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 612a5c174a53..315b83106d53 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -728,7 +728,6 @@ mod tests { #[test] fn test_statistics_min_max_bytes() { let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false); - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes())); assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes())); @@ -739,7 +738,6 @@ mod tests { Some(1), true, ); - assert!(stats._internal_has_min_max_set()); assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]); assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]); } From a23cf8bd4da4e8fe61c543f6994bc6166dbdc575 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 14 Aug 2024 09:48:01 -0500 Subject: [PATCH 22/23] replace negated test assertions of stats._internal_has_mix_max_set with assertions on min_opt and max_opt This removes all use of Statistics::_internal_has_min_max_set from the code base, and so it is also removed. --- parquet/src/column/writer/mod.rs | 12 ++++++++---- parquet/src/file/statistics.rs | 6 ------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index c663fe6e93ac..8ea2878317e8 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2415,7 +2415,8 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(!stats._internal_has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); assert!(stats.is_min_max_backwards_compatible()); } @@ -2498,7 +2499,8 @@ mod tests { #[test] fn test_float_statistics_nan_only() { let stats = statistics_roundtrip::(&[f32::NAN, f32::NAN]); - assert!(!stats._internal_has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); assert!(stats.is_min_max_backwards_compatible()); assert!(matches!(stats, Statistics::Float(_))); } @@ -2584,7 +2586,8 @@ mod tests { #[test] fn test_double_statistics_nan_only() { let stats = statistics_roundtrip::(&[f64::NAN, f64::NAN]); - assert!(!stats._internal_has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); assert!(matches!(stats, Statistics::Double(_))); assert!(stats.is_min_max_backwards_compatible()); } @@ -3320,7 +3323,8 @@ mod tests { } else { panic!("metadata missing statistics"); }; - assert!(!stats._internal_has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); } fn write_multiple_pages( diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 315b83106d53..b94732e9c132 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -401,12 +401,6 @@ impl Statistics { note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead" )] pub fn has_min_max_set(&self) -> bool { - self._internal_has_min_max_set() - } - - /// Returns `true` if min value and max value are set. - /// Normally both min/max values will be set to `Some(value)` or `None`. - pub(crate) fn _internal_has_min_max_set(&self) -> bool { statistics_enum_func![self, _internal_has_min_max_set] } From 7d4e65076f9b1ea61f6931a670436cc08f4f629a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 15 Aug 2024 11:52:13 -0400 Subject: [PATCH 23/23] Revert changes to parquet writing, update comments --- parquet/src/file/statistics.rs | 19 +++++++++- parquet/tests/arrow_writer_layout.rs | 52 ++++++++++++++-------------- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index b94732e9c132..4134685ffcfb 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -126,6 +126,8 @@ pub fn from_thrift( Ok(match thrift_stats { Some(stats) => { // Number of nulls recorded, when it is not available, we just mark it as 0. + // TODO this should be `None` if there is no information about NULLS. + // see https://github.com/apache/arrow-rs/pull/6216/files let null_count = stats.null_count.unwrap_or(0); if null_count < 0 { @@ -242,10 +244,19 @@ pub fn from_thrift( pub fn to_thrift(stats: Option<&Statistics>) -> Option { let stats = stats?; + // record null counts if greater than zero. + // + // TODO: This should be Some(0) if there are no nulls. + // see https://github.com/apache/arrow-rs/pull/6216/files + let null_count = stats + .null_count_opt() + .map(|value| value as i64) + .filter(|&x| x > 0); + let mut thrift_stats = TStatistics { max: None, min: None, - null_count: stats.null_count_opt().map(|value| value as i64), + null_count, distinct_count: stats.distinct_count().map(|value| value as i64), max_value: None, min_value: None, @@ -375,6 +386,8 @@ impl Statistics { /// Returns number of null values for the column. /// Note that this includes all nulls when column is part of the complex type. + /// + /// Note this API returns 0 if the null count is not available. #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] pub fn null_count(&self) -> u64 { // 0 to remain consistent behavior prior to `null_count_opt` @@ -390,6 +403,10 @@ impl Statistics { /// Returns number of null values for the column, if known. /// Note that this includes all nulls when column is part of the complex type. + /// + /// Note this API returns Some(0) even if the null count was not present + /// in the statistics. + /// See pub fn null_count_opt(&self) -> Option { statistics_enum_func![self, null_count_opt] } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 9a66d13f84d7..3e0f6ce3a8b3 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -189,7 +189,7 @@ fn test_primitive() { pages: (0..8) .map(|_| Page { rows: 250, - page_header_size: 38, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -218,14 +218,14 @@ fn test_primitive() { pages: vec![ Page { rows: 250, - page_header_size: 38, + page_header_size: 36, compressed_size: 258, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1750, - page_header_size: 38, + page_header_size: 36, compressed_size: 7000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -233,7 +233,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 250, - page_header_size: 38, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -260,42 +260,42 @@ fn test_primitive() { pages: vec![ Page { rows: 400, - page_header_size: 38, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 38, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 38, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -303,7 +303,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 38, + page_header_size: 36, compressed_size: 8000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -329,7 +329,7 @@ fn test_primitive() { pages: (0..20) .map(|_| Page { rows: 100, - page_header_size: 38, + page_header_size: 36, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -364,14 +364,14 @@ fn test_string() { pages: (0..15) .map(|_| Page { rows: 130, - page_header_size: 38, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }) .chain(std::iter::once(Page { rows: 50, - page_header_size: 37, + page_header_size: 35, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -400,21 +400,21 @@ fn test_string() { pages: vec![ Page { rows: 130, - page_header_size: 38, + page_header_size: 36, compressed_size: 138, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1250, - page_header_size: 40, + page_header_size: 38, compressed_size: 10000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }, Page { rows: 620, - page_header_size: 38, + page_header_size: 36, compressed_size: 4960, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -422,7 +422,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 130, - page_header_size: 38, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -449,42 +449,42 @@ fn test_string() { pages: vec![ Page { rows: 400, - page_header_size: 38, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 38, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 38, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -492,7 +492,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 38, + page_header_size: 36, compressed_size: 16000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -532,7 +532,7 @@ fn test_list() { pages: (0..10) .map(|_| Page { rows: 20, - page_header_size: 38, + page_header_size: 36, compressed_size: 672, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE,