diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/DataSkippingUtils.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/DataSkippingUtils.java index 4ca3fd1221..7453f76284 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/DataSkippingUtils.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/DataSkippingUtils.java @@ -167,16 +167,39 @@ private static Optional constructDataSkippingFilter( // NOTE: AND is special -- we can safely skip the file if one leg does not evaluate to // TRUE, even if we cannot construct a skipping filter for the other leg. case "AND": - Optional e1Filter = constructDataSkippingFilter( + Optional e1AndFilter = constructDataSkippingFilter( asPredicate(getLeft(dataFilters)), schemaHelper); - Optional e2Filter = constructDataSkippingFilter( + Optional e2AndFilter = constructDataSkippingFilter( asPredicate(getRight(dataFilters)), schemaHelper); - if (e1Filter.isPresent() && e2Filter.isPresent()) { - return Optional.of(new And(e1Filter.get(), e2Filter.get())); - } else if (e1Filter.isPresent()) { - return e1Filter; + if (e1AndFilter.isPresent() && e2AndFilter.isPresent()) { + return Optional.of(new And(e1AndFilter.get(), e2AndFilter.get())); + } else if (e1AndFilter.isPresent()) { + return e1AndFilter; } else { - return e2Filter; // possibly none + return e2AndFilter; // possibly none + } + + // Push skipping predicate generation through OR (similar to AND case). + // + // constructDataFilters(OR(a, b)) + // ==> OR(constructDataFilters(a), constructDataFilters(b)) + // + // Similar to AND case, if the rewritten predicate does not evaluate to TRUE, then it means + // that neither `constructDataFilters(a)` nor `constructDataFilters(b)` evaluated to TRUE, + // which in turn means that neither `a` nor `b` could evaluate to TRUE for any row the file + // might contain, which proves we have a valid data skipping predicate. 
+ // + // Unlike AND, a single leg of an OR expression provides no filtering power -- we can only + // reject a file if both legs evaluate to false. + case "OR": + Optional e1OrFilter = constructDataSkippingFilter( + asPredicate(getLeft(dataFilters)), schemaHelper); + Optional e2OrFilter = constructDataSkippingFilter( + asPredicate(getRight(dataFilters)), schemaHelper); + if (e1OrFilter.isPresent() && e2OrFilter.isPresent()) { + return Optional.of(new Or(e1OrFilter.get(), e2OrFilter.get())); + } else { + return Optional.empty(); } case "=": case "<": case "<=": case ">": case ">=": @@ -299,6 +322,16 @@ private static Optional constructNotDataSkippingFilters( schemaHelper ); + // Similar to AND, we can (and want to) push the NOT past the OR using De Morgan's law. + case "OR": + return constructDataSkippingFilter( + new And( + new Predicate("NOT", asPredicate(getLeft(childPredicate))), + new Predicate("NOT", asPredicate(getRight(childPredicate))) + ), + schemaHelper + ); + case "=": Expression left = getLeft(childPredicate); Expression right = getRight(childPredicate); diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ScanSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ScanSuite.scala index fdb01bd82d..5a23ea7b9c 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ScanSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ScanSuite.scala @@ -372,11 +372,76 @@ class ScanSuite extends AnyFunSuite with TestUtils with ExpressionTestUtils with ) ) - // Test: 'or statements - simple' Expression: OR - // Test: 'or statements - two fields' Expression: OR - // Test: 'or statements - one side supported' Expression: OR - // Test: 'NOT statements - or' Expression: NOT, OR + testSkipping( + "data skipping - or statements - simple", + """ + {"a": 1} + {"a": 2} + """, + hits = Seq( + // a > 0 or a < -3 + new Or(greaterThan(col("a"), ofInt(0)), lessThan(col("a"), ofInt(-3))), + // 
a >= 2 or a < -1 + new Or(greaterThanOrEqual(col("a"), ofInt(2)), lessThan(col("a"), ofInt(-1))) + ), + misses = Seq( + // a > 5 or a < -2 + new Or(greaterThan(col("a"), ofInt(5)), lessThan(col("a"), ofInt(-2))) + ) + ) + + testSkipping( + "data skipping - or statements - two fields", + """ + {"a": 1, "b": "2017-09-01"} + {"a": 2, "b": "2017-08-31"} + """, + hits = Seq( + new Or( + lessThan(col("a"), ofInt(0)), + equals(col("b"), ofString("2017-09-01")) + ), + new Or( + equals(col("a"), ofInt(2)), + lessThan(col("b"), ofString("2017-08-30")) + ), + // note startsWith is not supported yet but these should still be hits once supported + new Or( // a < 2 or b like '2017-08-%' + lessThan(col("a"), ofInt(2)), + startsWith(col("b"), ofString("2017-08-")) + ), + new Or( // a >= 2 or b like '2016-08-%' + greaterThanOrEqual(col("a"), ofInt(2)), + startsWith(col("b"), ofString("2016-08-")) + ), + // MOVE BELOW EXPRESSION TO MISSES ONCE SUPPORTED BY DATA SKIPPING + new Or( // a < 0 or b like '2016-%' + lessThan(col("a"), ofInt(0)), + startsWith(col("b"), ofString("2016-")) + ) + ), + misses = Seq() + ) + // One side of OR by itself isn't powerful enough to prune any files. 
+ testSkipping( + "data skipping - or statements - one side unsupported", + """ + {"a": 10, "b": 10} + {"a": 20, "b": 20} + """, + hits = Seq( + // a % 100 < 10 OR b > 20 + new Or(lessThan(aRem100, ofInt(10)), greaterThan(col("b"), ofInt(20))), + // a < 10 OR b % 100 > 20 + new Or(lessThan(col("a"), ofInt(10)), greaterThan(bRem100, ofInt(20))) + ), + misses = Seq( + // a < 10 OR b > 20 + new Or(lessThan(col("a"), ofInt(10)), greaterThan(col("b"), ofInt(20))) + ) + ) + testSkipping( "data skipping - not statements - simple", """ @@ -421,16 +486,45 @@ class ScanSuite extends AnyFunSuite with TestUtils with ExpressionTestUtils with greaterThanOrEqual(aRem100, ofInt(10)), lessThanOrEqual(col("b"), ofInt(20)) ) - ), - // MOVE BELOW EXPRESSION TO MISSES ONCE OR IS SUPPORTED BY DATA SKIPPING + ) + ), + misses = Seq( not( new And( greaterThanOrEqual(col("a"), ofInt(10)), lessThanOrEqual(col("b"), ofInt(20)) ) ) + ) + ) + + // NOT(OR(a, b)) === AND(NOT(a), NOT(b)) => One side by itself is enough to prune. 
+ testSkipping( + "data skipping - not statements - or", + """ + {"a": 1, "b": 10} + {"a": 2, "b": 20} + """, + hits = Seq( + // NOT(a < 1 OR b > 20), + not(new Or(lessThan(col("a"), ofInt(1)), greaterThan(col("b"), ofInt(20)))), + // NOT(a % 100 >= 1 OR b % 100 <= 20) + not(new Or(greaterThanOrEqual(aRem100, ofInt(1)), lessThanOrEqual(bRem100, ofInt(20)))) ), - misses = Seq() + misses = Seq( + // NOT(a >= 1 OR b <= 20) + not( + new Or(greaterThanOrEqual(col("a"), ofInt(1)), lessThanOrEqual(col("b"), ofInt(20))) + ), + // NOT(a % 100 >= 1 OR b <= 20), + not( + new Or(greaterThanOrEqual(aRem100, ofInt(1)), lessThanOrEqual(col("b"), ofInt(20))) + ), + // NOT(a >= 1 OR b % 100 <= 20) + not( + new Or(greaterThanOrEqual(col("a"), ofInt(1)), lessThanOrEqual(bRem100, ofInt(20))) + ) + ) ) // If a column does not have stats, it does not participate in data skipping, which disqualifies @@ -450,16 +544,15 @@ class ScanSuite extends AnyFunSuite with TestUtils with ExpressionTestUtils with new Or( // a < 1 OR (a >= 1 AND b < 10): ==> a < 1 OR a >=1 ==> TRUE lessThan(col("a"), ofInt(1)), new And(greaterThanOrEqual(col("a"), ofInt(1)), lessThan(col("b"), ofInt(10))) - ), - // MOVE BELOW EXPRESSION TO MISSES ONCE SUPPORTED BY DATA SKIPPING - new Or( // a < 1 OR (a > 10 AND b < 10): ==> a < 1 OR a > 10 ==> FALSE - lessThan(col("a"), ofInt(1)), - new And(greaterThan(col("a"), ofInt(10)), lessThan(col("b"), ofInt(10))) ) ), misses = Seq( new And( // a < 1 AND b < 10: ==> a < 1 ==> FALSE - lessThan(col("a"), ofInt(1)), lessThan(col("b"), ofInt(10))) + lessThan(col("a"), ofInt(1)), lessThan(col("b"), ofInt(10))), + new Or( // a < 1 OR (a > 10 AND b < 10): ==> a < 1 OR a > 10 ==> FALSE + lessThan(col("a"), ofInt(1)), + new And(greaterThan(col("a"), ofInt(10)), lessThan(col("b"), ofInt(10))) + ) ) )