From ab5e464901a622e5bfec63e2e9c8a8fe195259a4 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Mon, 21 Oct 2024 10:48:50 +0200 Subject: [PATCH] fix: Ignore Parquet `is_{min,max}_value_exact` when set to `true` (#19344) --- .../src/parquet/statistics/mod.rs | 53 ++++++++++++++----- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/crates/polars-parquet/src/parquet/statistics/mod.rs b/crates/polars-parquet/src/parquet/statistics/mod.rs index b72e2a7c94b1..03335c27817b 100644 --- a/crates/polars-parquet/src/parquet/statistics/mod.rs +++ b/crates/polars-parquet/src/parquet/statistics/mod.rs @@ -11,7 +11,6 @@ pub use primitive::PrimitiveStatistics; use crate::parquet::error::ParquetResult; use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; pub use crate::parquet::thrift_format::Statistics as ParquetStatistics; -use crate::read::ParquetError; #[derive(Debug, PartialEq)] pub enum Statistics { @@ -42,6 +41,34 @@ impl Statistics { } } + pub fn clear_min(&mut self) { + use Statistics as S; + match self { + S::Binary(s) => _ = s.min_value.take(), + S::Boolean(s) => _ = s.min_value.take(), + S::FixedLen(s) => _ = s.min_value.take(), + S::Int32(s) => _ = s.min_value.take(), + S::Int64(s) => _ = s.min_value.take(), + S::Int96(s) => _ = s.min_value.take(), + S::Float(s) => _ = s.min_value.take(), + S::Double(s) => _ = s.min_value.take(), + }; + } + + pub fn clear_max(&mut self) { + use Statistics as S; + match self { + S::Binary(s) => _ = s.max_value.take(), + S::Boolean(s) => _ = s.max_value.take(), + S::FixedLen(s) => _ = s.max_value.take(), + S::Int32(s) => _ = s.max_value.take(), + S::Int64(s) => _ = s.max_value.take(), + S::Int96(s) => _ = s.max_value.take(), + S::Float(s) => _ = s.max_value.take(), + S::Double(s) => _ = s.max_value.take(), + }; + } + /// Deserializes a raw parquet statistics into [`Statistics`]. 
/// # Error /// This function errors if it is not possible to read the statistics to the @@ -51,19 +78,8 @@ impl Statistics { statistics: &ParquetStatistics, primitive_type: PrimitiveType, ) -> ParquetResult<Self> { - if statistics.is_min_value_exact.is_some() { - return Err(ParquetError::not_supported( - "is_min_value_exact in statistics", - )); - } - if statistics.is_max_value_exact.is_some() { - return Err(ParquetError::not_supported( - "is_max_value_exact in statistics", - )); - } - use {PhysicalType as T, PrimitiveStatistics as PrimStat}; - Ok(match primitive_type.physical_type { + let mut stats: Self = match primitive_type.physical_type { T::ByteArray => BinaryStatistics::deserialize(statistics, primitive_type)?.into(), T::Boolean => BooleanStatistics::deserialize(statistics)?.into(), T::Int32 => PrimStat::<i32>::deserialize(statistics, primitive_type)?.into(), @@ -74,7 +90,16 @@ impl Statistics { T::FixedLenByteArray(size) => { FixedLenStatistics::deserialize(statistics, size, primitive_type)?.into() }, - }) + }; + + if statistics.is_min_value_exact.is_some_and(|v| !v) { + stats.clear_min(); + } + if statistics.is_max_value_exact.is_some_and(|v| !v) { + stats.clear_max(); + } + + Ok(stats) } }