Skip to content

Commit

Permalink
[fix] move parquet_stats_skipping to predicates module (#602)
Browse files Browse the repository at this point in the history
Our builds are actually broken due to a dependency on `mod engine` in
`scan/mod.rs` (which only exists when one of the engine feature-flags is
enabled. as soon as all are off, it fails to build). This fixes the
issue by moving the `parquet_stats_skipping` module out of engine code
and into `expressions` module. This was trivial since it actually
doesn't depend on any engine code.

We pull `parquet_stats_skipping` into `scan/mod.rs` in order to `impl
ParquetStatsProvider for NoStats`. Now instead of having this rely on
engine code, it's moved to the `predicates` module (which is always
included)

Note that I have a new test in CI that will catch this failure in the
future: #601
  • Loading branch information
zachschuermann authored Dec 17, 2024
1 parent b6558d0 commit 37307f5
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 35 deletions.
1 change: 0 additions & 1 deletion kernel/src/engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ macro_rules! declare_modules {
declare_modules!(
(pub, arrow_data),
(pub, parquet_row_group_skipping),
(pub, parquet_stats_skipping),
(pub(crate), arrow_get_data),
(pub(crate), arrow_utils),
(pub(crate), ensure_data_types)
Expand Down
2 changes: 1 addition & 1 deletion kernel/src/engine/parquet_row_group_skipping.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! An implementation of parquet row group skipping using data skipping predicates over footer stats.
use crate::engine::parquet_stats_skipping::{
use crate::predicates::parquet_stats_skipping::{
ParquetStatsProvider, ParquetStatsSkippingFilter as _,
};
use crate::expressions::{ColumnName, Expression, Scalar, UnaryExpression, BinaryExpression, VariadicExpression};
Expand Down
2 changes: 2 additions & 0 deletions kernel/src/predicates/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use crate::schema::DataType;
use std::cmp::Ordering;
use tracing::debug;

pub(crate) mod parquet_stats_skipping;

#[cfg(test)]
mod tests;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! An implementation of data skipping that leverages parquet stats from the file footer.
use crate::expressions::{
BinaryOperator, ColumnName, Expression as Expr, Scalar, UnaryOperator, VariadicOperator, VariadicExpression, BinaryExpression,
BinaryExpression, BinaryOperator, ColumnName, Expression as Expr, Scalar, UnaryOperator,
VariadicExpression, VariadicOperator,
};
use crate::predicates::{
DataSkippingPredicateEvaluator, PredicateEvaluator, PredicateEvaluatorDefaults,
Expand Down Expand Up @@ -157,7 +158,10 @@ impl<T: DataSkippingPredicateEvaluator<Output = bool>> ParquetStatsSkippingFilte
fn eval_sql_where(&self, filter: &Expr) -> Option<bool> {
use Expr::{Binary, Variadic};
match filter {
Variadic(VariadicExpression { op: VariadicOperator::And, exprs }) => {
Variadic(VariadicExpression {
op: VariadicOperator::And,
exprs,
}) => {
let exprs: Vec<_> = exprs
.iter()
.map(|expr| self.eval_sql_where(expr))
Expand All @@ -168,7 +172,9 @@ impl<T: DataSkippingPredicateEvaluator<Output = bool>> ParquetStatsSkippingFilte
.collect();
self.eval_variadic(VariadicOperator::And, &exprs, false)
}
Binary(BinaryExpression { op, left, right }) => self.eval_binary_nullsafe(*op, left, right),
Binary(BinaryExpression { op, left, right }) => {
self.eval_binary_nullsafe(*op, left, right)
}
_ => self.eval_expr(filter, false),
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ impl ParquetStatsProvider for UnimplementedTestFilter {
fn test_junctions() {
use VariadicOperator::*;


let test_cases = &[
// Every combo of 0, 1 and 2 inputs
(&[] as &[Option<bool>], TRUE, FALSE),
Expand Down Expand Up @@ -154,7 +153,6 @@ impl ParquetStatsProvider for MinMaxTestFilter {
}
}


#[test]
fn test_eval_binary_comparisons() {
const FIVE: Scalar = Scalar::Integer(5);
Expand Down Expand Up @@ -236,7 +234,7 @@ impl ParquetStatsProvider for NullCountTestFilter {
fn test_eval_is_null() {
let expressions = [
Expr::is_null(column_expr!("x")),
!Expr::is_null(column_expr!("x"))
!Expr::is_null(column_expr!("x")),
];

let do_test = |nullcount: i64, expected: &[Option<bool>]| {
Expand Down Expand Up @@ -325,46 +323,31 @@ fn test_sql_where() {

// Constrast normal vs SQL WHERE semantics - comparison inside AND
expect_eq!(
AllNullTestFilter.eval_expr(
&Expr::and(NULL, Expr::lt(col.clone(), VAL)),
false
),
AllNullTestFilter.eval_expr(&Expr::and(NULL, Expr::lt(col.clone(), VAL)), false),
None,
"{NULL} AND {col} < {VAL}"
);
expect_eq!(
AllNullTestFilter.eval_sql_where(&Expr::and(
NULL,
Expr::lt(col.clone(), VAL),
)),
AllNullTestFilter.eval_sql_where(&Expr::and(NULL, Expr::lt(col.clone(), VAL),)),
Some(false),
"WHERE {NULL} AND {col} < {VAL}"
);

expect_eq!(
AllNullTestFilter.eval_expr(
&Expr::and(TRUE, Expr::lt(col.clone(), VAL)),
false
),
AllNullTestFilter.eval_expr(&Expr::and(TRUE, Expr::lt(col.clone(), VAL)), false),
None, // NULL (from the NULL check) is stronger than TRUE
"{TRUE} AND {col} < {VAL}"
);
expect_eq!(
AllNullTestFilter.eval_sql_where(&Expr::and(
TRUE,
Expr::lt(col.clone(), VAL),
)),
AllNullTestFilter.eval_sql_where(&Expr::and(TRUE, Expr::lt(col.clone(), VAL),)),
Some(false), // FALSE (from the NULL check) is stronger than TRUE
"WHERE {TRUE} AND {col} < {VAL}"
);

// Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND
expect_eq!(
AllNullTestFilter.eval_expr(
&Expr::and(
TRUE,
Expr::and(NULL, Expr::lt(col.clone(), VAL)),
),
&Expr::and(TRUE, Expr::and(NULL, Expr::lt(col.clone(), VAL)),),
false,
),
None,
Expand All @@ -382,10 +365,7 @@ fn test_sql_where() {
// Semantics are the same for comparison inside OR inside AND
expect_eq!(
AllNullTestFilter.eval_expr(
&Expr::or(
FALSE,
Expr::and(NULL, Expr::lt(col.clone(), VAL)),
),
&Expr::or(FALSE, Expr::and(NULL, Expr::lt(col.clone(), VAL)),),
false,
),
None,
Expand Down
6 changes: 3 additions & 3 deletions kernel/src/scan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ use crate::actions::deletion_vector::{
};
use crate::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME};
use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar};
use crate::predicates::parquet_stats_skipping::{
ParquetStatsProvider, ParquetStatsSkippingFilter as _,
};
use crate::scan::state::{DvInfo, Stats};
use crate::schema::{
ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField,
Expand Down Expand Up @@ -183,9 +186,6 @@ impl PhysicalPredicate {
// the predicate allows to statically skip all files. Since this is direct evaluation (not an
// expression rewrite), we use a dummy `ParquetStatsProvider` that provides no stats.
fn can_statically_skip_all_files(predicate: &Expression) -> bool {
use crate::engine::parquet_stats_skipping::{
ParquetStatsProvider, ParquetStatsSkippingFilter as _,
};
struct NoStats;
impl ParquetStatsProvider for NoStats {
fn get_parquet_min_stat(&self, _: &ColumnName, _: &DataType) -> Option<Scalar> {
Expand Down

0 comments on commit 37307f5

Please sign in to comment.