-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-26078][SQL] Dedup self-join attributes on IN subqueries #23057
Changes from 2 commits
2af656a
a71b1c6
86106fa
3d010fd
65fca4f
1beb40c
0312558
6528582
6172f52
ec710d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,10 +54,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { | |
val aliasMap = AttributeMap(duplicates.map { dup => | ||
dup -> Alias(dup, dup.toString)() | ||
}.toSeq) | ||
val aliasedExpressions = right.output.map { ref => | ||
aliasMap.getOrElse(ref, ref) | ||
} | ||
val newRight = Project(aliasedExpressions, right) | ||
val newRight = rewriteDedupPlan(right, aliasMap) | ||
val newJoinCond = joinCond.map { condExpr => | ||
condExpr transform { | ||
case a: Attribute => aliasMap.getOrElse(a, a).toAttribute | ||
|
@@ -70,6 +67,27 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { | |
case _ => joinPlan | ||
} | ||
|
||
private def rewriteDedupPlan(plan: LogicalPlan, rewrites: AttributeMap[Alias]): LogicalPlan = { | ||
val aliasedExpressions = plan.output.map { ref => | ||
rewrites.getOrElse(ref, ref) | ||
} | ||
Project(aliasedExpressions, plan) | ||
} | ||
|
||
private def dedupSubqueryOnSelfJoin(values: Seq[Expression], sub: LogicalPlan): LogicalPlan = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a simple code comment for this method? |
||
val leftRefs = AttributeSet.fromAttributeSets(values.map(_.references)) | ||
val rightRefs = AttributeSet(sub.output) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just |
||
val duplicates = leftRefs.intersect(rightRefs) | ||
if (duplicates.isEmpty) { | ||
sub | ||
} else { | ||
val aliasMap = AttributeMap(duplicates.map { dup => | ||
dup -> Alias(dup, dup.toString)() | ||
}.toSeq) | ||
rewriteDedupPlan(sub, aliasMap) | ||
} | ||
} | ||
|
||
def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
case Filter(condition, child) => | ||
val (withSubquery, withoutSubquery) = | ||
|
@@ -92,18 +110,20 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { | |
// Deduplicate conflicting attributes if any. | ||
dedupJoin(Join(outerPlan, sub, LeftAnti, joinCond)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like we don't need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it makes sense to dedup the subquery only when the join condition has not been created yet (so in the case of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
What I'd like to do is to unify
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the main problem is that in the other cases, so when exists is there, the condition is already created. So we would need to complicate quite a lot the method in order to handle the 2 cases and I am not sure wether it is worth. For instance, the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah thanks for the explanation! Makes sense to me. |
||
case (p, InSubquery(values, ListQuery(sub, conditions, _, _))) => | ||
val inConditions = values.zip(sub.output).map(EqualTo.tupled) | ||
val newSub = dedupSubqueryOnSelfJoin(values, sub) | ||
val inConditions = values.zip(newSub.output).map(EqualTo.tupled) | ||
val (joinCond, outerPlan) = rewriteExistentialExpr(inConditions ++ conditions, p) | ||
// Deduplicate conflicting attributes if any. | ||
dedupJoin(Join(outerPlan, sub, LeftSemi, joinCond)) | ||
dedupJoin(Join(outerPlan, newSub, LeftSemi, joinCond)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we still need to dedup here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we don't, let me remove it. |
||
case (p, Not(InSubquery(values, ListQuery(sub, conditions, _, _)))) => | ||
// This is a NULL-aware (left) anti join (NAAJ) e.g. col NOT IN expr | ||
// Construct the condition. A NULL in one of the conditions is regarded as a positive | ||
// result; such a row will be filtered out by the Anti-Join operator. | ||
|
||
// Note that will almost certainly be planned as a Broadcast Nested Loop join. | ||
// Use EXISTS if performance matters to you. | ||
val inConditions = values.zip(sub.output).map(EqualTo.tupled) | ||
val newSub = dedupSubqueryOnSelfJoin(values, sub) | ||
val inConditions = values.zip(newSub.output).map(EqualTo.tupled) | ||
val (joinCond, outerPlan) = rewriteExistentialExpr(inConditions, p) | ||
// Expand the NOT IN expression with the NULL-aware semantic | ||
// to its full form. That is from: | ||
|
@@ -119,7 +139,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { | |
// (A.A1 = B.B1 OR ISNULL(A.A1 = B.B1)) AND (B.B2 = A.A2) AND B.B3 > 1 | ||
val finalJoinCond = (nullAwareJoinConds ++ conditions).reduceLeft(And) | ||
// Deduplicate conflicting attributes if any. | ||
dedupJoin(Join(outerPlan, sub, LeftAnti, Option(finalJoinCond))) | ||
dedupJoin(Join(outerPlan, newSub, LeftAnti, Option(finalJoinCond))) | ||
case (p, predicate) => | ||
val (newCond, inputPlan) = rewriteExistentialExpr(Seq(predicate), p) | ||
Project(p.output, Filter(newCond.get, inputPlan)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mmmh... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you try this test case? val df1 = spark.sql(
"""
|SELECT id,num,source FROM (
| SELECT id, num, 'a' as source FROM a
| UNION ALL
| SELECT id, num, 'b' as source FROM b
|) AS c WHERE c.id IN (SELECT id FROM b WHERE num = 2) OR
|c.id IN (SELECT id FROM b WHERE num = 3)
""".stripMargin) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this fails indeed. I'll investigate it, thanks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for your help here @viirya. I added the check also to |
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import scala.collection.mutable.ArrayBuffer | |
import org.apache.spark.sql.catalyst.expressions.SubqueryExpression | ||
import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, Sort} | ||
import org.apache.spark.sql.test.SharedSQLContext | ||
import org.apache.spark.sql.types._ | ||
|
||
class SubquerySuite extends QueryTest with SharedSQLContext { | ||
import testImplicits._ | ||
|
@@ -1280,4 +1281,34 @@ class SubquerySuite extends QueryTest with SharedSQLContext { | |
assert(subqueries.length == 1) | ||
} | ||
} | ||
|
||
test("SPARK-26078: deduplicate fake self joins for IN subqueries") { | ||
withTempView("a", "b") { | ||
val a = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row("a", 2), Row("b", 1))), | ||
StructType(Seq(StructField("id", StringType), StructField("num", IntegerType)))) | ||
val b = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row("a", 2), Row("b", 1))), | ||
StructType(Seq(StructField("id", StringType), StructField("num", IntegerType)))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The two schemas are the same. Can we define the schema just once? |
||
a.createOrReplaceTempView("a") | ||
b.createOrReplaceTempView("b") | ||
|
||
val df1 = spark.sql( | ||
""" | ||
|SELECT id,num,source FROM ( | ||
| SELECT id, num, 'a' as source FROM a | ||
| UNION ALL | ||
| SELECT id, num, 'b' as source FROM b | ||
|) AS c WHERE c.id IN (SELECT id FROM b WHERE num = 2) | ||
""".stripMargin) | ||
checkAnswer(df1, Seq(Row("a", 2, "a"), Row("a", 2, "b"))) | ||
val df2 = spark.sql( | ||
""" | ||
|SELECT id,num,source FROM ( | ||
| SELECT id, num, 'a' as source FROM a | ||
| UNION ALL | ||
| SELECT id, num, 'b' as source FROM b | ||
|) AS c WHERE c.id NOT IN (SELECT id FROM b WHERE num = 2) | ||
""".stripMargin) | ||
checkAnswer(df2, Seq(Row("b", 1, "a"), Row("b", 1, "b"))) | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not related to your PR, but is this correct? The duplicated attributes in the join condition may refer to either the left or the right child, so how can we blindly replace them with new attributes from the right side?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, I actually think this is useless. Let me try and remove it.