Skip to content

Commit

Permalink
API: add isNaN and notNaN predicates (apache#1747)
Browse files Browse the repository at this point in the history
  • Loading branch information
yyanyy authored and Peter Vary committed Dec 7, 2020
1 parent c0de0fc commit 7c05292
Show file tree
Hide file tree
Showing 28 changed files with 937 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

package org.apache.iceberg.expressions;

import org.apache.iceberg.util.NaNUtil;

public class BoundUnaryPredicate<T> extends BoundPredicate<T> {
BoundUnaryPredicate(Operation op, BoundTerm<T> term) {
super(op, term);
Expand Down Expand Up @@ -46,6 +48,10 @@ public boolean test(T value) {
return value == null;
case NOT_NULL:
return value != null;
case IS_NAN:
return NaNUtil.isNaN(value);
case NOT_NAN:
return !NaNUtil.isNaN(value);
default:
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + op());
}
Expand All @@ -58,6 +64,10 @@ public String toString() {
return "is_null(" + term() + ")";
case NOT_NULL:
return "not_null(" + term() + ")";
case IS_NAN:
return "is_nan(" + term() + ")";
case NOT_NAN:
return "not_nan(" + term() + ")";
default:
return "Invalid unary predicate: operation = " + op();
}
Expand Down
11 changes: 11 additions & 0 deletions api/src/main/java/org/apache/iceberg/expressions/Evaluator.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.iceberg.StructLike;
import org.apache.iceberg.expressions.ExpressionVisitors.BoundVisitor;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.NaNUtil;

/**
* Evaluates an {@link Expression} for data described by a {@link StructType}.
Expand Down Expand Up @@ -91,6 +92,16 @@ public <T> Boolean notNull(Bound<T> valueExpr) {
return valueExpr.eval(struct) != null;
}

@Override
public <T> Boolean isNaN(Bound<T> valueExpr) {
return NaNUtil.isNaN(valueExpr.eval(struct));
}

@Override
public <T> Boolean notNaN(Bound<T> valueExpr) {
return !NaNUtil.isNaN(valueExpr.eval(struct));
}

@Override
public <T> Boolean lt(Bound<T> valueExpr, Literal<T> lit) {
Comparator<T> cmp = lit.comparator();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ enum Operation {
FALSE,
IS_NULL,
NOT_NULL,
IS_NAN,
NOT_NAN,
LT,
LT_EQ,
GT,
Expand All @@ -52,6 +54,10 @@ public Operation negate() {
return Operation.NOT_NULL;
case NOT_NULL:
return Operation.IS_NULL;
case IS_NAN:
return Operation.NOT_NAN;
case NOT_NAN:
return Operation.IS_NAN;
case LT:
return Operation.GT_EQ;
case LT_EQ:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ public <T> R notNull(BoundReference<T> ref) {
return null;
}

public <T> R isNaN(BoundReference<T> ref) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement isNaN");
}

public <T> R notNaN(BoundReference<T> ref) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement notNaN");
}

public <T> R lt(BoundReference<T> ref, Literal<T> lit) {
return null;
}
Expand Down Expand Up @@ -143,6 +151,10 @@ public <T> R predicate(BoundPredicate<T> pred) {
return isNull((BoundReference<T>) pred.term());
case NOT_NULL:
return notNull((BoundReference<T>) pred.term());
case IS_NAN:
return isNaN((BoundReference<T>) pred.term());
case NOT_NAN:
return notNaN((BoundReference<T>) pred.term());
default:
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op());
}
Expand Down Expand Up @@ -176,6 +188,14 @@ public <T> R notNull(Bound<T> expr) {
return null;
}

public <T> R isNaN(Bound<T> expr) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement isNaN");
}

public <T> R notNaN(Bound<T> expr) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement notNaN");
}

public <T> R lt(Bound<T> expr, Literal<T> lit) {
return null;
}
Expand Down Expand Up @@ -241,6 +261,10 @@ public <T> R predicate(BoundPredicate<T> pred) {
return isNull(pred.term());
case NOT_NULL:
return notNull(pred.term());
case IS_NAN:
return isNaN(pred.term());
case NOT_NAN:
return notNaN(pred.term());
default:
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op());
}
Expand Down
47 changes: 45 additions & 2 deletions api/src/main/java/org/apache/iceberg/expressions/Expressions.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.NaNUtil;

/**
* Factory methods for creating {@link Expression expressions}.
Expand Down Expand Up @@ -123,51 +124,79 @@ public static <T> UnboundPredicate<T> notNull(UnboundTerm<T> expr) {
return new UnboundPredicate<>(Expression.Operation.NOT_NULL, expr);
}

public static <T> UnboundPredicate<T> isNaN(String name) {
return new UnboundPredicate<>(Expression.Operation.IS_NAN, ref(name));
}

public static <T> UnboundPredicate<T> isNaN(UnboundTerm<T> expr) {
return new UnboundPredicate<>(Expression.Operation.IS_NAN, expr);
}

public static <T> UnboundPredicate<T> notNaN(String name) {
return new UnboundPredicate<>(Expression.Operation.NOT_NAN, ref(name));
}

public static <T> UnboundPredicate<T> notNaN(UnboundTerm<T> expr) {
return new UnboundPredicate<>(Expression.Operation.NOT_NAN, expr);
}

public static <T> UnboundPredicate<T> lessThan(String name, T value) {
validateInput("lessThan", value);
return new UnboundPredicate<>(Expression.Operation.LT, ref(name), value);
}

public static <T> UnboundPredicate<T> lessThan(UnboundTerm<T> expr, T value) {
validateInput("lessThan", value);
return new UnboundPredicate<>(Expression.Operation.LT, expr, value);
}

public static <T> UnboundPredicate<T> lessThanOrEqual(String name, T value) {
validateInput("lessThanOrEqual", value);
return new UnboundPredicate<>(Expression.Operation.LT_EQ, ref(name), value);
}

public static <T> UnboundPredicate<T> lessThanOrEqual(UnboundTerm<T> expr, T value) {
validateInput("lessThanOrEqual", value);
return new UnboundPredicate<>(Expression.Operation.LT_EQ, expr, value);
}

public static <T> UnboundPredicate<T> greaterThan(String name, T value) {
validateInput("greaterThan", value);
return new UnboundPredicate<>(Expression.Operation.GT, ref(name), value);
}

public static <T> UnboundPredicate<T> greaterThan(UnboundTerm<T> expr, T value) {
validateInput("greaterThan", value);
return new UnboundPredicate<>(Expression.Operation.GT, expr, value);
}

public static <T> UnboundPredicate<T> greaterThanOrEqual(String name, T value) {
validateInput("greaterThanOrEqual", value);
return new UnboundPredicate<>(Expression.Operation.GT_EQ, ref(name), value);
}

public static <T> UnboundPredicate<T> greaterThanOrEqual(UnboundTerm<T> expr, T value) {
validateInput("greaterThanOrEqual", value);
return new UnboundPredicate<>(Expression.Operation.GT_EQ, expr, value);
}

public static <T> UnboundPredicate<T> equal(String name, T value) {
validateInput("equal", value);
return new UnboundPredicate<>(Expression.Operation.EQ, ref(name), value);
}

public static <T> UnboundPredicate<T> equal(UnboundTerm<T> expr, T value) {
validateInput("equal", value);
return new UnboundPredicate<>(Expression.Operation.EQ, expr, value);
}

public static <T> UnboundPredicate<T> notEqual(String name, T value) {
validateInput("notEqual", value);
return new UnboundPredicate<>(Expression.Operation.NOT_EQ, ref(name), value);
}

public static <T> UnboundPredicate<T> notEqual(UnboundTerm<T> expr, T value) {
validateInput("notEqual", value);
return new UnboundPredicate<>(Expression.Operation.NOT_EQ, expr, value);
}

Expand Down Expand Up @@ -216,29 +245,43 @@ public static <T> UnboundPredicate<T> notIn(UnboundTerm<T> expr, Iterable<T> val
}

public static <T> UnboundPredicate<T> predicate(Operation op, String name, T value) {
validateInput(op.toString(), value);
return predicate(op, name, Literals.from(value));
}

public static <T> UnboundPredicate<T> predicate(Operation op, String name, Literal<T> lit) {
Preconditions.checkArgument(op != Operation.IS_NULL && op != Operation.NOT_NULL,
Preconditions.checkArgument(
op != Operation.IS_NULL && op != Operation.NOT_NULL && op != Operation.IS_NAN && op != Operation.NOT_NAN,
"Cannot create %s predicate inclusive a value", op);
return new UnboundPredicate<T>(op, ref(name), lit);
}

public static <T> UnboundPredicate<T> predicate(Operation op, String name, Iterable<T> values) {
validateInput(op.toString(), values);
return predicate(op, ref(name), values);
}

public static <T> UnboundPredicate<T> predicate(Operation op, String name) {
Preconditions.checkArgument(op == Operation.IS_NULL || op == Operation.NOT_NULL,
Preconditions.checkArgument(
op == Operation.IS_NULL || op == Operation.NOT_NULL || op == Operation.IS_NAN || op == Operation.NOT_NAN,
"Cannot create %s predicate without a value", op);
return new UnboundPredicate<>(op, ref(name));
}

private static <T> UnboundPredicate<T> predicate(Operation op, UnboundTerm<T> expr, Iterable<T> values) {
validateInput(op.toString(), values);
return new UnboundPredicate<>(op, expr, values);
}

private static <T> void validateInput(String op, T value) {
Preconditions.checkArgument(!NaNUtil.isNaN(value), String.format("Cannot create %s predicate with NaN", op));
}

private static <T> void validateInput(String op, Iterable<T> values) {
Preconditions.checkArgument(Lists.newArrayList(values).stream().noneMatch(NaNUtil::isNaN),
String.format("Cannot create %s predicate with NaN", op));
}

public static True alwaysTrue() {
return True.INSTANCE;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ public boolean eval(ContentFile<?> file) {
private class MetricsEvalVisitor extends BoundExpressionVisitor<Boolean> {
private Map<Integer, Long> valueCounts = null;
private Map<Integer, Long> nullCounts = null;
private Map<Integer, Long> nanCounts = null;
private Map<Integer, ByteBuffer> lowerBounds = null;
private Map<Integer, ByteBuffer> upperBounds = null;

Expand All @@ -93,6 +94,7 @@ private boolean eval(ContentFile<?> file) {

this.valueCounts = file.valueCounts();
this.nullCounts = file.nullValueCounts();
this.nanCounts = file.nanValueCounts();
this.lowerBounds = file.lowerBounds();
this.upperBounds = file.upperBounds();

Expand Down Expand Up @@ -150,6 +152,34 @@ public <T> Boolean notNull(BoundReference<T> ref) {
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean isNaN(BoundReference<T> ref) {
Integer id = ref.fieldId();

if (nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) == 0) {
return ROWS_CANNOT_MATCH;
}

// when there's no nanCounts information, but we already know the column only contains null,
// it's guaranteed that there's no NaN value
if (containsNullsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notNaN(BoundReference<T> ref) {
Integer id = ref.fieldId();

if (containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
Expand Down Expand Up @@ -347,5 +377,10 @@ private boolean containsNullsOnly(Integer id) {
nullCounts != null && nullCounts.containsKey(id) &&
valueCounts.get(id) - nullCounts.get(id) == 0;
}

private boolean containsNaNsOnly(Integer id) {
return nanCounts != null && nanCounts.containsKey(id) &&
valueCounts != null && nanCounts.get(id).equals(valueCounts.get(id));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,31 @@ public <T> Boolean notNull(BoundReference<T> ref) {
int pos = Accessors.toPosition(ref.accessor());
// containsNull encodes whether at least one partition value is null, lowerBound is null if
// all partition values are null.
ByteBuffer lowerBound = stats.get(pos).lowerBound();
if (lowerBound == null) {
if (stats.get(pos).containsNull() && stats.get(pos).lowerBound() == null) {
return ROWS_CANNOT_MATCH; // all values are null
}

return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean isNaN(BoundReference<T> ref) {
int pos = Accessors.toPosition(ref.accessor());
// containsNull encodes whether at least one partition value is null, lowerBound is null if
// all partition values are null.
if (stats.get(pos).containsNull() && stats.get(pos).lowerBound() == null) {
return ROWS_CANNOT_MATCH; // all values are null
}

return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notNaN(BoundReference<T> ref) {
// we don't have enough information to tell if there is no NaN value
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
int pos = Accessors.toPosition(ref.accessor());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.iceberg.StructLike;
import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.util.NaNUtil;

/**
* Finds the residuals for an {@link Expression} the partitions in the given {@link PartitionSpec}.
Expand Down Expand Up @@ -152,6 +153,16 @@ public <T> Expression notNull(BoundReference<T> ref) {
return (ref.eval(struct) != null) ? alwaysTrue() : alwaysFalse();
}

@Override
public <T> Expression isNaN(BoundReference<T> ref) {
return NaNUtil.isNaN(ref.eval(struct)) ? alwaysTrue() : alwaysFalse();
}

@Override
public <T> Expression notNaN(BoundReference<T> ref) {
return NaNUtil.isNaN(ref.eval(struct)) ? alwaysFalse() : alwaysTrue();
}

@Override
public <T> Expression lt(BoundReference<T> ref, Literal<T> lit) {
Comparator<T> cmp = lit.comparator();
Expand Down
Loading

0 comments on commit 7c05292

Please sign in to comment.