
Remove test duplication in parquet statistics tests #6185

Closed · 5 tasks · alamb opened this issue Aug 2, 2024 · 1 comment · Fixed by #6190
Labels: enhancement (Any new improvement worthy of an entry in the changelog), parquet (Changes to the parquet crate)

alamb (Contributor) commented Aug 2, 2024

Is your feature request related to a problem or challenge? Please describe what you are trying to do.

When adding additional support to the statistics converter, as @Kev1n8 did in #6181, the presence of two sets of tests makes it confusing where tests for the new functionality should be added.

There are tests in both parquet/src/arrow/arrow_reader/statistics.rs and parquet/tests/arrow_reader/statistics.rs.

Describe the solution you'd like
I would like to remove the redundant tests.

Describe alternatives you've considered
I think the clearest alternative is to move all the tests from https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs to https://github.com/apache/arrow-rs/blob/master/parquet/tests/arrow_reader/statistics.rs.

We should move only the non-duplicated tests.
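For context, all of the `Test { .. }.run()` cases below go through a small shared harness. A minimal sketch of what that harness does, reconstructed from how it is used (the column name `"col"` and the exact imports are illustrative assumptions, not the actual code):

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, RecordBatch};
use bytes::Bytes;
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;

/// Sketch of the shared harness: write `input` to Parquet in row groups
/// of 3 rows, then check the extracted per-row-group min/max statistics.
struct Test {
    input: ArrayRef,
    expected_min: ArrayRef,
    expected_max: ArrayRef,
}

impl Test {
    fn run(self) {
        // The column name "col" is illustrative
        let batch = RecordBatch::try_from_iter([("col", Arc::clone(&self.input))]).unwrap();
        let schema = batch.schema();

        // 3 rows per row group, so each entry of expected_min/expected_max
        // corresponds to one row group (see the "row group N" comments below)
        let props = WriterProperties::builder()
            .set_max_row_group_size(3)
            .build();
        let mut buf = Vec::new();
        let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
        writer.write(&batch).unwrap();
        writer.close().unwrap();

        // Re-open the written bytes and extract the statistics
        let reader = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buf)).unwrap();
        let metadata = reader.metadata();
        let converter = StatisticsConverter::try_new(
            "col",
            &schema,
            metadata.file_metadata().schema_descr(),
        )
        .unwrap();

        let min = converter.row_group_mins(metadata.row_groups().iter()).unwrap();
        let max = converter.row_group_maxes(metadata.row_groups().iter()).unwrap();
        assert_eq!(&min, &self.expected_min);
        assert_eq!(&max, &self.expected_max);
    }
}
```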

So that means something like:

- [ ] Leave a comment in https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs noting where the tests now live
- [ ] Keep edge condition tests:

```rust
#[test]
fn roundtrip_empty() {
    let empty_bool_array = new_empty_array(&DataType::Boolean);
    Test {
        input: empty_bool_array.clone(),
        expected_min: empty_bool_array.clone(),
        expected_max: empty_bool_array.clone(),
    }
    .run()
}
```
- [ ] Remove the following tests, which have corresponding coverage:

```rust
#[test]
fn roundtrip_int32() {
    Test {
        input: i32_array([
            // row group 1
            Some(1),
            None,
            Some(3),
            // row group 2
            Some(0),
            Some(5),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: i32_array([Some(1), Some(0), None]),
        expected_max: i32_array([Some(3), Some(5), None]),
    }
    .run()
}

#[test]
fn roundtrip_int64() {
    Test {
        input: i64_array([
            // row group 1
            Some(1),
            None,
            Some(3),
            // row group 2
            Some(0),
            Some(5),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: i64_array([Some(1), Some(0), None]),
        expected_max: i64_array(vec![Some(3), Some(5), None]),
    }
    .run()
}

#[test]
fn roundtrip_f32() {
    Test {
        input: f32_array([
            // row group 1
            Some(1.0),
            None,
            Some(3.0),
            // row group 2
            Some(-1.0),
            Some(5.0),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: f32_array([Some(1.0), Some(-1.0), None]),
        expected_max: f32_array([Some(3.0), Some(5.0), None]),
    }
    .run()
}

#[test]
fn roundtrip_f64() {
    Test {
        input: f64_array([
            // row group 1
            Some(1.0),
            None,
            Some(3.0),
            // row group 2
            Some(-1.0),
            Some(5.0),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: f64_array([Some(1.0), Some(-1.0), None]),
        expected_max: f64_array([Some(3.0), Some(5.0), None]),
    }
    .run()
}

#[test]
fn roundtrip_timestamp() {
    Test {
        input: timestamp_seconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            None,
        ),
        expected_min: timestamp_seconds_array([Some(1), Some(5), None], None),
        expected_max: timestamp_seconds_array([Some(3), Some(9), None], None),
    }
    .run();
    Test {
        input: timestamp_milliseconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            None,
        ),
        expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], None),
        expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], None),
    }
    .run();
    Test {
        input: timestamp_microseconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            None,
        ),
        expected_min: timestamp_microseconds_array([Some(1), Some(5), None], None),
        expected_max: timestamp_microseconds_array([Some(3), Some(9), None], None),
    }
    .run();
    Test {
        input: timestamp_nanoseconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            None,
        ),
        expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], None),
        expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], None),
    }
    .run()
}

#[test]
fn roundtrip_timestamp_timezoned() {
    Test {
        input: timestamp_seconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            Some("UTC"),
        ),
        expected_min: timestamp_seconds_array([Some(1), Some(5), None], Some("UTC")),
        expected_max: timestamp_seconds_array([Some(3), Some(9), None], Some("UTC")),
    }
    .run();
    Test {
        input: timestamp_milliseconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            Some("UTC"),
        ),
        expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], Some("UTC")),
        expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], Some("UTC")),
    }
    .run();
    Test {
        input: timestamp_microseconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            Some("UTC"),
        ),
        expected_min: timestamp_microseconds_array([Some(1), Some(5), None], Some("UTC")),
        expected_max: timestamp_microseconds_array([Some(3), Some(9), None], Some("UTC")),
    }
    .run();
    Test {
        input: timestamp_nanoseconds_array(
            [
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(9),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ],
            Some("UTC"),
        ),
        expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], Some("UTC")),
        expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], Some("UTC")),
    }
    .run()
}

#[test]
fn roundtrip_decimal() {
    Test {
        input: Arc::new(
            Decimal128Array::from(vec![
                // row group 1
                Some(100),
                None,
                Some(22000),
                // row group 2
                Some(500000),
                Some(330000),
                None,
                // row group 3
                None,
                None,
                None,
            ])
            .with_precision_and_scale(9, 2)
            .unwrap(),
        ),
        expected_min: Arc::new(
            Decimal128Array::from(vec![Some(100), Some(330000), None])
                .with_precision_and_scale(9, 2)
                .unwrap(),
        ),
        expected_max: Arc::new(
            Decimal128Array::from(vec![Some(22000), Some(500000), None])
                .with_precision_and_scale(9, 2)
                .unwrap(),
        ),
    }
    .run();
    Test {
        input: Arc::new(
            Decimal256Array::from(vec![
                // row group 1
                Some(i256::from(100)),
                None,
                Some(i256::from(22000)),
                // row group 2
                Some(i256::MAX),
                Some(i256::MIN),
                None,
                // row group 3
                None,
                None,
                None,
            ])
            .with_precision_and_scale(76, 76)
            .unwrap(),
        ),
        expected_min: Arc::new(
            Decimal256Array::from(vec![Some(i256::from(100)), Some(i256::MIN), None])
                .with_precision_and_scale(76, 76)
                .unwrap(),
        ),
        expected_max: Arc::new(
            Decimal256Array::from(vec![Some(i256::from(22000)), Some(i256::MAX), None])
                .with_precision_and_scale(76, 76)
                .unwrap(),
        ),
    }
    .run()
}

#[test]
fn roundtrip_utf8() {
    Test {
        input: utf8_array([
            // row group 1
            Some("A"),
            None,
            Some("Q"),
            // row group 2
            Some("ZZ"),
            Some("AA"),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: utf8_array([Some("A"), Some("AA"), None]),
        expected_max: utf8_array([Some("Q"), Some("ZZ"), None]),
    }
    .run()
}

#[test]
fn roundtrip_struct() {
    let mut test = Test {
        input: struct_array(vec![
            // row group 1
            (Some(true), Some(1)),
            (None, None),
            (Some(true), Some(3)),
            // row group 2
            (Some(true), Some(0)),
            (Some(false), Some(5)),
            (None, None),
            // row group 3
            (None, None),
            (None, None),
            (None, None),
        ]),
        expected_min: struct_array(vec![
            (Some(true), Some(1)),
            (Some(true), Some(0)),
            (None, None),
        ]),
        expected_max: struct_array(vec![
            (Some(true), Some(3)),
            (Some(true), Some(0)),
            (None, None),
        ]),
    };
    // Due to https://github.com/apache/datafusion/issues/8334,
    // statistics for struct arrays are not supported
    test.expected_min = new_null_array(test.input.data_type(), test.expected_min.len());
    test.expected_max = new_null_array(test.input.data_type(), test.expected_min.len());
    test.run()
}

#[test]
fn roundtrip_binary() {
    Test {
        input: Arc::new(BinaryArray::from_opt_vec(vec![
            // row group 1
            Some(b"A"),
            None,
            Some(b"Q"),
            // row group 2
            Some(b"ZZ"),
            Some(b"AA"),
            None,
            // row group 3
            None,
            None,
            None,
        ])),
        expected_min: Arc::new(BinaryArray::from_opt_vec(vec![
            Some(b"A"),
            Some(b"AA"),
            None,
        ])),
        expected_max: Arc::new(BinaryArray::from_opt_vec(vec![
            Some(b"Q"),
            Some(b"ZZ"),
            None,
        ])),
    }
    .run()
}

#[test]
fn roundtrip_date32() {
    Test {
        input: date32_array(vec![
            // row group 1
            Some("2021-01-01"),
            None,
            Some("2021-01-03"),
            // row group 2
            Some("2021-01-01"),
            Some("2021-01-05"),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: date32_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]),
        expected_max: date32_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]),
    }
    .run()
}

#[test]
fn roundtrip_date64() {
    Test {
        input: date64_array(vec![
            // row group 1
            Some("2021-01-01"),
            None,
            Some("2021-01-03"),
            // row group 2
            Some("2021-01-01"),
            Some("2021-01-05"),
            None,
            // row group 3
            None,
            None,
            None,
        ]),
        expected_min: date64_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]),
        expected_max: date64_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]),
    }
    .run()
}

#[test]
fn roundtrip_large_binary_array() {
    let input: Vec<Option<&[u8]>> = vec![
        // row group 1
        Some(b"A"),
        None,
        Some(b"Q"),
        // row group 2
        Some(b"ZZ"),
        Some(b"AA"),
        None,
        // row group 3
        None,
        None,
        None,
    ];
    let expected_min: Vec<Option<&[u8]>> = vec![Some(b"A"), Some(b"AA"), None];
    let expected_max: Vec<Option<&[u8]>> = vec![Some(b"Q"), Some(b"ZZ"), None];
    Test {
        input: large_binary_array(input),
        expected_min: large_binary_array(expected_min),
        expected_max: large_binary_array(expected_max),
    }
    .run();
}

#[test]
fn struct_and_non_struct() {
    // Ensures that statistics for an array that appears *after* a struct
    // array are not wrong
    let struct_col = struct_array(vec![
        // row group 1
        (Some(true), Some(1)),
        (None, None),
        (Some(true), Some(3)),
    ]);
    let int_col = i32_array([Some(100), Some(200), Some(300)]);
    let expected_min = i32_array([Some(100)]);
    let expected_max = i32_array(vec![Some(300)]);
    // use a name that shadows a name in the struct column
    match struct_col.data_type() {
        DataType::Struct(fields) => {
            assert_eq!(fields.get(1).unwrap().name(), "int_col")
        }
        _ => panic!("unexpected data type for struct column"),
    };
    let input_batch =
        RecordBatch::try_from_iter([("struct_col", struct_col), ("int_col", int_col)]).unwrap();
    let schema = input_batch.schema();
    let metadata = parquet_metadata(schema.clone(), input_batch);
    let parquet_schema = metadata.file_metadata().schema_descr();
    // read the int_col statistics
    let (idx, _) = parquet_column(parquet_schema, &schema, "int_col").unwrap();
    assert_eq!(idx, 2);
    let row_groups = metadata.row_groups();
    let converter = StatisticsConverter::try_new("int_col", &schema, parquet_schema).unwrap();
    let min = converter.row_group_mins(row_groups.iter()).unwrap();
    assert_eq!(
        &min,
        &expected_min,
        "Min. Statistics\n\n{}\n\n",
        DisplayStats(row_groups)
    );
    let max = converter.row_group_maxes(row_groups.iter()).unwrap();
    assert_eq!(
        &max,
        &expected_max,
        "Max. Statistics\n\n{}\n\n",
        DisplayStats(row_groups)
    );
}
```
- [ ] Keep data file validation tests (a sketch of the `TestFile` harness they use follows this list):

```rust
#[test]
fn nan_in_stats() {
    // /parquet-testing/data/nan_in_stats.parquet
    // row_groups: 1
    // "x": Double({min: Some(1.0), max: Some(NaN), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    TestFile::new("nan_in_stats.parquet")
        .with_column(ExpectedColumn {
            name: "x",
            expected_min: Arc::new(Float64Array::from(vec![Some(1.0)])),
            expected_max: Arc::new(Float64Array::from(vec![Some(f64::NAN)])),
        })
        .run();
}

#[test]
fn alltypes_plain() {
    // /parquet-testing/data/datapage_v1-snappy-compressed-checksum.parquet
    // row_groups: 1
    // (has no statistics)
    TestFile::new("alltypes_plain.parquet")
        // No column statistics should be read as NULL, but with the right type
        .with_column(ExpectedColumn {
            name: "id",
            expected_min: i32_array([None]),
            expected_max: i32_array([None]),
        })
        .with_column(ExpectedColumn {
            name: "bool_col",
            expected_min: bool_array([None]),
            expected_max: bool_array([None]),
        })
        .run();
}

#[test]
fn alltypes_tiny_pages() {
    // /parquet-testing/data/alltypes_tiny_pages.parquet
    // row_groups: 1
    // "id": Int32({min: Some(0), max: Some(7299), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "bool_col": Boolean({min: Some(false), max: Some(true), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "tinyint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "smallint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "int_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "bigint_col": Int64({min: Some(0), max: Some(90), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "float_col": Float({min: Some(0.0), max: Some(9.9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "double_col": Double({min: Some(0.0), max: Some(90.89999999999999), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "date_string_col": ByteArray({min: Some(ByteArray { data: "01/01/09" }), max: Some(ByteArray { data: "12/31/10" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "string_col": ByteArray({min: Some(ByteArray { data: "0" }), max: Some(ByteArray { data: "9" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "timestamp_col": Int96({min: None, max: None, distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true})
    // "year": Int32({min: Some(2009), max: Some(2010), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    // "month": Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
    TestFile::new("alltypes_tiny_pages.parquet")
        .with_column(ExpectedColumn {
            name: "id",
            expected_min: i32_array([Some(0)]),
            expected_max: i32_array([Some(7299)]),
        })
        .with_column(ExpectedColumn {
            name: "bool_col",
            expected_min: bool_array([Some(false)]),
            expected_max: bool_array([Some(true)]),
        })
        .with_column(ExpectedColumn {
            name: "tinyint_col",
            expected_min: i8_array([Some(0)]),
            expected_max: i8_array([Some(9)]),
        })
        .with_column(ExpectedColumn {
            name: "smallint_col",
            expected_min: i16_array([Some(0)]),
            expected_max: i16_array([Some(9)]),
        })
        .with_column(ExpectedColumn {
            name: "int_col",
            expected_min: i32_array([Some(0)]),
            expected_max: i32_array([Some(9)]),
        })
        .with_column(ExpectedColumn {
            name: "bigint_col",
            expected_min: i64_array([Some(0)]),
            expected_max: i64_array([Some(90)]),
        })
        .with_column(ExpectedColumn {
            name: "float_col",
            expected_min: f32_array([Some(0.0)]),
            expected_max: f32_array([Some(9.9)]),
        })
        .with_column(ExpectedColumn {
            name: "double_col",
            expected_min: f64_array([Some(0.0)]),
            expected_max: f64_array([Some(90.89999999999999)]),
        })
        .with_column(ExpectedColumn {
            name: "date_string_col",
            expected_min: utf8_array([Some("01/01/09")]),
            expected_max: utf8_array([Some("12/31/10")]),
        })
        .with_column(ExpectedColumn {
            name: "string_col",
            expected_min: utf8_array([Some("0")]),
            expected_max: utf8_array([Some("9")]),
        })
        // File has no min/max for timestamp_col
        .with_column(ExpectedColumn {
            name: "timestamp_col",
            expected_min: timestamp_nanoseconds_array([None], None),
            expected_max: timestamp_nanoseconds_array([None], None),
        })
        .with_column(ExpectedColumn {
            name: "year",
            expected_min: i32_array([Some(2009)]),
            expected_max: i32_array([Some(2010)]),
        })
        .with_column(ExpectedColumn {
            name: "month",
            expected_min: i32_array([Some(1)]),
            expected_max: i32_array([Some(12)]),
        })
        .run();
}
```
- [ ] Port the fixed_length_decimal_legacy test:

```rust
#[test]
fn fixed_length_decimal_legacy() {
    // /parquet-testing/data/fixed_length_decimal_legacy.parquet
    // row_groups: 1
    // "value": FixedLenByteArray({min: Some(FixedLenByteArray(ByteArray { data: Some(ByteBufferPtr { data: b"\0\0\0\0\0\xc8" }) })), max: Some(FixedLenByteArray(ByteArray { data: "\0\0\0\0\t`" })), distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true})
    TestFile::new("fixed_length_decimal_legacy.parquet")
        .with_column(ExpectedColumn {
            name: "value",
            expected_min: Arc::new(
                Decimal128Array::from(vec![Some(200)])
                    .with_precision_and_scale(13, 2)
                    .unwrap(),
            ),
            expected_max: Arc::new(
                Decimal128Array::from(vec![Some(2400)])
                    .with_precision_and_scale(13, 2)
                    .unwrap(),
            ),
        })
        .run();
}
```
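For reference, the `TestFile` cases above read pre-made files from the parquet-testing submodule. A rough sketch of that harness, reconstructed from how it is used (the `parquet_test_data()` helper and the exact field names are assumptions, not the actual code):

```rust
use std::fs::File;

use arrow_array::ArrayRef;
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

/// Expected min/max statistics for one column of a test file.
struct ExpectedColumn {
    name: &'static str,
    expected_min: ArrayRef,
    expected_max: ArrayRef,
}

/// Sketch of the file-based harness: open a file from the parquet-testing
/// data directory and compare extracted row-group statistics per column.
struct TestFile {
    file_name: &'static str,
    expected_columns: Vec<ExpectedColumn>,
}

impl TestFile {
    fn new(file_name: &'static str) -> Self {
        Self {
            file_name,
            expected_columns: vec![],
        }
    }

    fn with_column(mut self, column: ExpectedColumn) -> Self {
        self.expected_columns.push(column);
        self
    }

    fn run(self) {
        // Assumption: parquet_test_data() resolves the checked-out
        // parquet-testing data directory (e.g. via the PARQUET_TEST_DATA env var)
        let path = format!(
            "{}/{}",
            arrow::util::test_util::parquet_test_data(),
            self.file_name
        );
        let reader = ParquetRecordBatchReaderBuilder::try_new(File::open(path).unwrap()).unwrap();
        let metadata = reader.metadata();
        let schema = reader.schema().clone();

        for column in self.expected_columns {
            let converter = StatisticsConverter::try_new(
                column.name,
                &schema,
                metadata.file_metadata().schema_descr(),
            )
            .unwrap();
            let min = converter.row_group_mins(metadata.row_groups().iter()).unwrap();
            let max = converter.row_group_maxes(metadata.row_groups().iter()).unwrap();
            assert_eq!(&min, &column.expected_min, "min for column {}", column.name);
            assert_eq!(&max, &column.expected_max, "max for column {}", column.name);
        }
    }
}
```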

Additional context
This duplication is a historical artifact of how this code was developed in DataFusion; it was then brought over when @efredine ported the work in #6046.

alamb added the parquet and enhancement labels on Aug 2, 2024
Kev1n8 (Contributor) commented Aug 3, 2024

take
