Skip to content

Commit

Permalink
[SPARK-2395][SQL] Optimize common LIKE patterns.
Browse files Browse the repository at this point in the history
Author: Michael Armbrust <michael@databricks.com>

Closes #1325 from marmbrus/slowLike and squashes the following commits:

023c3eb [Michael Armbrust] add comment.
8b421c2 [Michael Armbrust] Handle the case where the final % is actually escaped.
d34d37e [Michael Armbrust] add periods.
3bbf35f [Michael Armbrust] Roll back changes to SparkBuild
53894b1 [Michael Armbrust] Fix grammar.
4094462 [Michael Armbrust] Fix grammar.
6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns.
  • Loading branch information
marmbrus committed Jul 8, 2014
1 parent 56e009d commit cc3e0a1
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,54 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE

override def toString() = s"Lower($child)"
}

/** A base class for functions that compare two strings, returning a boolean. */
abstract class StringComparison extends Expression {
self: Product =>

type EvaluatedType = Any

def left: Expression
def right: Expression

override def references = children.flatMap(_.references).toSet
override def children = left :: right :: Nil

override def nullable: Boolean = true
override def dataType: DataType = BooleanType

def compare(l: String, r: String): Boolean

override def eval(input: Row): Any = {
val leftEval = left.eval(input).asInstanceOf[String]
if(leftEval == null) {
null
} else {
val rightEval = right.eval(input).asInstanceOf[String]
if (rightEval == null) null else compare(leftEval, rightEval)
}
}

override def toString() = s"$nodeName($left, $right)"
}

/**
* A function that returns true if the string `left` contains the string `right`.
*/
case class Contains(left: Expression, right: Expression) extends StringComparison {
override def compare(l: String, r: String) = l.contains(r)
}

/**
* A function that returns true if the string `left` starts with the string `right`.
*/
case class StartsWith(left: Expression, right: Expression) extends StringComparison {
def compare(l: String, r: String) = l.startsWith(r)
}

/**
* A function that returns true if the string `left` ends with the string `right`.
*/
case class EndsWith(left: Expression, right: Expression) extends StringComparison {
def compare(l: String, r: String) = l.endsWith(r)
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] {
Batch("ConstantFolding", FixedPoint(100),
NullPropagation,
ConstantFolding,
LikeSimplification,
BooleanSimplification,
SimplifyFilters,
SimplifyCasts,
Expand Down Expand Up @@ -111,6 +112,28 @@ object ColumnPruning extends Rule[LogicalPlan] {
}
}

/**
* Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition.
* For example, when the expression is just checking to see if a string starts with a given
* pattern.
*/
object LikeSimplification extends Rule[LogicalPlan] {
// if guards below protect from escapes on trailing %.
// Cases like "something\%" are not optimized, but this does not affect correctness.
val startsWith = "([^_%]+)%".r
val endsWith = "%([^_%]+)".r
val contains = "%([^_%]+)%".r

def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
case Like(l, Literal(startsWith(pattern), StringType)) if !pattern.endsWith("\\") =>
StartsWith(l, Literal(pattern))
case Like(l, Literal(endsWith(pattern), StringType)) =>
EndsWith(l, Literal(pattern))
case Like(l, Literal(contains(pattern), StringType)) if !pattern.endsWith("\\") =>
Contains(l, Literal(pattern))
}
}

/**
* Replaces [[Expression Expressions]] that can be statically evaluated with
* equivalent [[Literal]] values. This rule is more specific with
Expand Down

0 comments on commit cc3e0a1

Please sign in to comment.