diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index b94732e9c132..4134685ffcfb 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -126,6 +126,8 @@ pub fn from_thrift( Ok(match thrift_stats { Some(stats) => { // Number of nulls recorded, when it is not available, we just mark it as 0. + // TODO this should be `None` if there is no information about NULLS. + // see https://github.com/apache/arrow-rs/pull/6216/files let null_count = stats.null_count.unwrap_or(0); if null_count < 0 { @@ -242,10 +244,19 @@ pub fn from_thrift( pub fn to_thrift(stats: Option<&Statistics>) -> Option { let stats = stats?; + // record null counts if greater than zero. + // + // TODO: This should be Some(0) if there are no nulls. + // see https://github.com/apache/arrow-rs/pull/6216/files + let null_count = stats + .null_count_opt() + .map(|value| value as i64) + .filter(|&x| x > 0); + let mut thrift_stats = TStatistics { max: None, min: None, - null_count: stats.null_count_opt().map(|value| value as i64), + null_count, distinct_count: stats.distinct_count().map(|value| value as i64), max_value: None, min_value: None, @@ -375,6 +386,8 @@ impl Statistics { /// Returns number of null values for the column. /// Note that this includes all nulls when column is part of the complex type. + /// + /// Note this API returns 0 if the null count is not available. #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] pub fn null_count(&self) -> u64 { // 0 to remain consistent behavior prior to `null_count_opt` @@ -390,6 +403,10 @@ impl Statistics { /// Returns number of null values for the column, if known. /// Note that this includes all nulls when column is part of the complex type. + /// + /// Note this API returns Some(0) even if the null count was not present + /// in the statistics. 
+ /// See <https://github.com/apache/arrow-rs/pull/6216/files> pub fn null_count_opt(&self) -> Option { statistics_enum_func![self, null_count_opt] } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 9a66d13f84d7..3e0f6ce3a8b3 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -189,7 +189,7 @@ fn test_primitive() { pages: (0..8) .map(|_| Page { rows: 250, - page_header_size: 38, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -218,14 +218,14 @@ fn test_primitive() { pages: vec![ Page { rows: 250, - page_header_size: 38, + page_header_size: 36, compressed_size: 258, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1750, - page_header_size: 38, + page_header_size: 36, compressed_size: 7000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -233,7 +233,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 250, - page_header_size: 38, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -260,42 +260,42 @@ fn test_primitive() { pages: vec![ Page { rows: 400, - page_header_size: 38, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 38, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, -
page_header_size: 38, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -303,7 +303,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 38, + page_header_size: 36, compressed_size: 8000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -329,7 +329,7 @@ fn test_primitive() { pages: (0..20) .map(|_| Page { rows: 100, - page_header_size: 38, + page_header_size: 36, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -364,14 +364,14 @@ fn test_string() { pages: (0..15) .map(|_| Page { rows: 130, - page_header_size: 38, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }) .chain(std::iter::once(Page { rows: 50, - page_header_size: 37, + page_header_size: 35, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -400,21 +400,21 @@ fn test_string() { pages: vec![ Page { rows: 130, - page_header_size: 38, + page_header_size: 36, compressed_size: 138, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1250, - page_header_size: 40, + page_header_size: 38, compressed_size: 10000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }, Page { rows: 620, - page_header_size: 38, + page_header_size: 36, compressed_size: 4960, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -422,7 +422,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 130, - page_header_size: 38, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -449,42 +449,42 @@ fn test_string() { pages: vec![ Page { rows: 400, - page_header_size: 38, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 38, + page_header_size: 36, compressed_size: 472, encoding: 
Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 38, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 38, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -492,7 +492,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 38, + page_header_size: 36, compressed_size: 16000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -532,7 +532,7 @@ fn test_list() { pages: (0..10) .map(|_| Page { rows: 20, - page_header_size: 38, + page_header_size: 36, compressed_size: 672, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE,