Skip to content

Commit

Permalink
impl + tests for OR and NOT(OR)
Browse files Browse the repository at this point in the history
  • Loading branch information
allisonport-db committed Jan 11, 2024
1 parent 69e0416 commit 5267152
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -167,16 +167,39 @@ private static Optional<Predicate> constructDataSkippingFilter(
// NOTE: AND is special -- we can safely skip the file if one leg does not evaluate to
// TRUE, even if we cannot construct a skipping filter for the other leg.
case "AND":
Optional<Predicate> e1Filter = constructDataSkippingFilter(
Optional<Predicate> e1AndFilter = constructDataSkippingFilter(
asPredicate(getLeft(dataFilters)), schemaHelper);
Optional<Predicate> e2Filter = constructDataSkippingFilter(
Optional<Predicate> e2AndFilter = constructDataSkippingFilter(
asPredicate(getRight(dataFilters)), schemaHelper);
if (e1Filter.isPresent() && e2Filter.isPresent()) {
return Optional.of(new And(e1Filter.get(), e2Filter.get()));
} else if (e1Filter.isPresent()) {
return e1Filter;
if (e1AndFilter.isPresent() && e2AndFilter.isPresent()) {
return Optional.of(new And(e1AndFilter.get(), e2AndFilter.get()));
} else if (e1AndFilter.isPresent()) {
return e1AndFilter;
} else {
return e2Filter; // possibly none
return e2AndFilter; // possibly none
}

// Push skipping predicate generation through OR (similar to AND case).
//
// constructDataSkippingFilter(OR(a, b))
// ==> OR(constructDataSkippingFilter(a), constructDataSkippingFilter(b))
//
// Similar to AND case, if the rewritten predicate does not evaluate to TRUE, then it means
// that neither `constructDataSkippingFilter(a)` nor `constructDataSkippingFilter(b)`
// evaluated to TRUE, which in turn means that neither `a` nor `b` could evaluate to TRUE for
// any row the file might contain, which proves we have a valid data skipping predicate.
//
// Unlike AND, a single leg of an OR expression provides no filtering power -- we can only
// reject a file if both legs evaluate to false.
case "OR":
Optional<Predicate> e1OrFilter = constructDataSkippingFilter(
asPredicate(getLeft(dataFilters)), schemaHelper);
Optional<Predicate> e2OrFilter = constructDataSkippingFilter(
asPredicate(getRight(dataFilters)), schemaHelper);
if (e1OrFilter.isPresent() && e2OrFilter.isPresent()) {
return Optional.of(new Or(e1OrFilter.get(), e2OrFilter.get()));
} else {
return Optional.empty();
}

case "=": case "<": case "<=": case ">": case ">=":
Expand Down Expand Up @@ -299,6 +322,16 @@ private static Optional<Predicate> constructNotDataSkippingFilters(
schemaHelper
);

// Similar to AND, we can (and want to) push the NOT past the OR using De Morgan's law.
case "OR":
return constructDataSkippingFilter(
new And(
new Predicate("NOT", asPredicate(getLeft(childPredicate))),
new Predicate("NOT", asPredicate(getRight(childPredicate)))
),
schemaHelper
);

case "=":
Expression left = getLeft(childPredicate);
Expression right = getRight(childPredicate);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -372,11 +372,76 @@ class ScanSuite extends AnyFunSuite with TestUtils with ExpressionTestUtils with
)
)

// Test: 'or statements - simple' Expression: OR
// Test: 'or statements - two fields' Expression: OR
// Test: 'or statements - one side unsupported' Expression: OR
// Test: 'NOT statements - or' Expression: NOT, OR
// OR over a single column. The rows below give `a` the value range [1, 2]; a predicate is
// expected under `hits` when at least one leg can match a value in that range, and under
// `misses` when neither leg can.
testSkipping(
"data skipping - or statements - simple",
"""
{"a": 1}
{"a": 2}
""",
hits = Seq(
// a > 0 or a < -3  (first leg can match: a reaches 2)
new Or(greaterThan(col("a"), ofInt(0)), lessThan(col("a"), ofInt(-3))),
// a >= 2 or a < -1  (first leg can match: a reaches 2)
new Or(greaterThanOrEqual(col("a"), ofInt(2)), lessThan(col("a"), ofInt(-1)))
),
misses = Seq(
// a > 5 or a < -2  (neither leg can match any a in [1, 2])
new Or(greaterThan(col("a"), ofInt(5)), lessThan(col("a"), ofInt(-2)))
)
)

// OR across two different columns. The rows below give `a` the range [1, 2] and `b` the range
// ["2017-08-31", "2017-09-01"]. `misses` is empty for now: the only predicate that could be
// pruned relies on startsWith, which the comments below mark as not yet supported.
testSkipping(
"data skipping - or statements - two fields",
"""
{"a": 1, "b": "2017-09-01"}
{"a": 2, "b": "2017-08-31"}
""",
hits = Seq(
new Or( // a < 0 or b = '2017-09-01'
lessThan(col("a"), ofInt(0)),
equals(col("b"), ofString("2017-09-01"))
),
new Or( // a = 2 or b < '2017-08-30'
equals(col("a"), ofInt(2)),
lessThan(col("b"), ofString("2017-08-30"))
),
// note startsWith is not supported yet but these should still be hits once supported
new Or( // a < 2 or b like '2017-08-%'
lessThan(col("a"), ofInt(2)),
startsWith(col("b"), ofString("2017-08-"))
),
new Or( // a >= 2 or b like '2016-08-%'
greaterThanOrEqual(col("a"), ofInt(2)),
startsWith(col("b"), ofString("2016-08-"))
),
// MOVE BELOW EXPRESSION TO MISSES ONCE SUPPORTED BY DATA SKIPPING
new Or( // a < 0 or b like '2016-%'
lessThan(col("a"), ofInt(0)),
startsWith(col("b"), ofString("2016-"))
)
),
misses = Seq()
)

// One side of OR by itself isn't powerful enough to prune any files.
//
// The rows below give both `a` and `b` the range [10, 20]. Each predicate under `hits` has one
// leg built on a `% 100` expression (the unsupported side named in the test title), so the
// file is expected to be kept even though the other leg alone could never match. The
// predicate under `misses` uses only plain column comparisons, neither of which can match.
testSkipping(
"data skipping - or statements - one side unsupported",
"""
{"a": 10, "b": 10}
{"a": 20, "b": 20}
""",
hits = Seq(
// a % 100 < 10 OR b > 20
new Or(lessThan(aRem100, ofInt(10)), greaterThan(col("b"), ofInt(20))),
// a < 10 OR b % 100 > 20
new Or(lessThan(col("a"), ofInt(10)), greaterThan(bRem100, ofInt(20)))
),
misses = Seq(
// a < 10 OR b > 20
new Or(lessThan(col("a"), ofInt(10)), greaterThan(col("b"), ofInt(20)))
)
)

testSkipping(
"data skipping - not statements - simple",
"""
Expand Down Expand Up @@ -421,16 +486,45 @@ class ScanSuite extends AnyFunSuite with TestUtils with ExpressionTestUtils with
greaterThanOrEqual(aRem100, ofInt(10)),
lessThanOrEqual(col("b"), ofInt(20))
)
),
// MOVE BELOW EXPRESSION TO MISSES ONCE OR IS SUPPORTED BY DATA SKIPPING
)
),
misses = Seq(
not(
new And(
greaterThanOrEqual(col("a"), ofInt(10)),
lessThanOrEqual(col("b"), ofInt(20))
)
)
)
)

// NOT(OR(a, b)) === AND(NOT(a), NOT(b)) => One side by itself is enough to prune.
testSkipping(
"data skipping - not statements - or",
"""
{"a": 1, "b": 10}
{"a": 2, "b": 20}
""",
hits = Seq(
// NOT(a < 1 OR b > 20),
not(new Or(lessThan(col("a"), ofInt(1)), greaterThan(col("b"), ofInt(20)))),
// NOT(a % 100 >= 1 OR b % 100 <= 20)
not(new Or(greaterThanOrEqual(aRem100, ofInt(1)), lessThanOrEqual(bRem100, ofInt(20))))
),
misses = Seq()
misses = Seq(
// NOT(a >= 1 OR b <= 20)
not(
new Or(greaterThanOrEqual(col("a"), ofInt(1)), lessThanOrEqual(col("b"), ofInt(20)))
),
// NOT(a % 100 >= 1 OR b <= 20),
not(
new Or(greaterThanOrEqual(aRem100, ofInt(1)), lessThanOrEqual(col("b"), ofInt(20)))
),
// NOT(a >= 1 OR b % 100 <= 20)
not(
new Or(greaterThanOrEqual(col("a"), ofInt(1)), lessThanOrEqual(bRem100, ofInt(20)))
)
)
)

// If a column does not have stats, it does not participate in data skipping, which disqualifies
Expand All @@ -450,16 +544,15 @@ class ScanSuite extends AnyFunSuite with TestUtils with ExpressionTestUtils with
new Or( // a < 1 OR (a >= 1 AND b < 10): ==> a < 1 OR a >=1 ==> TRUE
lessThan(col("a"), ofInt(1)),
new And(greaterThanOrEqual(col("a"), ofInt(1)), lessThan(col("b"), ofInt(10)))
),
// MOVE BELOW EXPRESSION TO MISSES ONCE SUPPORTED BY DATA SKIPPING
new Or( // a < 1 OR (a > 10 AND b < 10): ==> a < 1 OR a > 10 ==> FALSE
lessThan(col("a"), ofInt(1)),
new And(greaterThan(col("a"), ofInt(10)), lessThan(col("b"), ofInt(10)))
)
),
misses = Seq(
new And( // a < 1 AND b < 10: ==> a < 1 ==> FALSE
lessThan(col("a"), ofInt(1)), lessThan(col("b"), ofInt(10)))
lessThan(col("a"), ofInt(1)), lessThan(col("b"), ofInt(10))),
new Or( // a < 1 OR (a > 10 AND b < 10): ==> a < 1 OR a > 10 ==> FALSE
lessThan(col("a"), ofInt(1)),
new And(greaterThan(col("a"), ofInt(10)), lessThan(col("b"), ofInt(10)))
)
)
)

Expand Down

0 comments on commit 5267152

Please sign in to comment.