[SPARK-30036][SQL] Fix: REPARTITION hint does not work with order by #26946

Closed · wants to merge 8 commits
@@ -55,6 +55,8 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
         child
       case (child, BroadcastDistribution(mode)) =>
         BroadcastExchangeExec(mode, child)
+      case (ShuffleExchangeExec(partitioning, child, _), distribution: OrderedDistribution) =>
+        ShuffleExchangeExec(distribution.createPartitioning(partitioning.numPartitions), child)
Comment on lines +58 to +59

Member (reviewer):
This considers a special case for OrderedDistribution. Generally, if a ShuffleExchangeExec is followed by any distribution it does not satisfy, we should always trim the ShuffleExchangeExec and apply that distribution's partitioning. Shouldn't we?

Contributor (author):
Sounds reasonable. Any suitable cases?

@viirya (Member), Dec 23, 2019:
I tried a few possible cases but could not come up with a concrete one like this. This may be the only such case, so I think this should be fine.

       case (child, distribution) =>
         val numPartitions = distribution.requiredNumPartitions
           .getOrElse(defaultNumPreShufflePartitions)
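The new case above reuses an existing shuffle instead of stacking a second one on top of it. The idea can be sketched stand-alone with a toy plan ADT (all names here — `Plan`, `Shuffle`, `RoundRobin`, `RangePart`, `ensureOrdered` — are illustrative, not Spark classes):

```scala
// Toy model of the rule: when a shuffle's partitioning cannot satisfy a
// required ordered distribution, rewrite that shuffle in place to use range
// partitioning (keeping its partition count), rather than adding a second
// shuffle on top. All names are hypothetical stand-ins, not Spark APIs.
sealed trait Partitioning { def numPartitions: Int }
case class RoundRobin(numPartitions: Int) extends Partitioning
case class RangePart(numPartitions: Int) extends Partitioning

sealed trait Plan
case class Leaf(name: String) extends Plan
case class Shuffle(partitioning: Partitioning, child: Plan) extends Plan

val defaultNumPartitions = 5

def ensureOrdered(plan: Plan): Plan = plan match {
  // Already range-partitioned: nothing to do.
  case Shuffle(_: RangePart, _) => plan
  // Existing shuffle with an unsatisfying partitioning: swap the
  // partitioning in place, preserving numPartitions (this mirrors
  // createPartitioning(partitioning.numPartitions) in the diff above).
  case Shuffle(p, child)        => Shuffle(RangePart(p.numPartitions), child)
  // No shuffle yet: add one with the default partition count.
  case other                    => Shuffle(RangePart(defaultNumPartitions), other)
}
```

With this shape, `ensureOrdered(Shuffle(RoundRobin(10), Leaf("scan")))` produces a single range shuffle with 10 partitions — the toy analogue of `repartition(10).sort(...)` planning one shuffle instead of two.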
@@ -39,9 +39,7 @@ class ConfigBehaviorSuite extends QueryTest with SharedSparkSession {
     def computeChiSquareTest(): Double = {
       val n = 10000
       // Trigger a sort
-      // Range has range partitioning in its output now. To have a range shuffle, we
-      // need to run a repartition first.
-      val data = spark.range(0, n, 1, 1).repartition(10).sort($"id".desc)
+      val data = spark.range(0, n, 1, 10).sort($"id".desc)
         .selectExpr("SPARK_PARTITION_ID() pid", "id").as[(Int, Long)].collect()

(cloud-fan marked this conversation as resolved.)

// Compute histogram for the number of records per partition post sort
@@ -55,12 +53,12 @@ class ConfigBehaviorSuite extends QueryTest with SharedSparkSession {

     withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString) {
       // The default chi-sq value should be low
-      assert(computeChiSquareTest() < 100)
+      assert(computeChiSquareTest() < 10)
Contributor (reviewer):
The physical plan is the same as before; what caused this change?

Contributor (author):
They are not the same: we had two shuffles before, one with RoundRobinPartitioning and the other with RangePartitioning.

Contributor (reviewer):
Ah, I see.


     withSQLConf(SQLConf.RANGE_EXCHANGE_SAMPLE_SIZE_PER_PARTITION.key -> "1") {
       // If we only sample one point, the range boundaries will be pretty bad and the
       // chi-sq value would be very high.
-      assert(computeChiSquareTest() > 300)
+      assert(computeChiSquareTest() > 100)
}
}
}
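For context on the thresholds in this hunk, the suite's chi-square statistic measures how evenly rows land across partitions after the sort. A plain-Scala sketch of that statistic, assuming a uniform expected count per partition (this is a stand-in for illustration, not the suite's actual implementation):

```scala
// Pearson chi-square statistic over per-partition record counts, against a
// uniform expectation. Perfectly balanced partitions give 0; skew drives the
// value up, which is why sampling only one point per partition (producing bad
// range boundaries) makes the asserted value jump.
def chiSquare(observedCounts: Seq[Long]): Double = {
  val expected = observedCounts.sum.toDouble / observedCounts.size
  observedCounts.map { o => val d = o - expected; d * d / expected }.sum
}

val balanced = Seq.fill(10)(1000L)         // 10 partitions x 1000 rows
val skewed   = 9100L +: Seq.fill(9)(100L)  // one hot partition, same total

chiSquare(balanced)  // 0.0
chiSquare(skewed)    // 72900.0 -- far above either threshold in the test
```

This makes the tightened bounds plausible: with a single range shuffle, the partition sizes track the sampled range boundaries directly, so the balanced case sits near 0 and the one-sample case blows up.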
@@ -421,6 +421,52 @@ class PlannerSuite extends SharedSparkSession {
     }
   }

test("SPARK-30036: Remove unnecessary RoundRobinPartitioning " +
"if SortExec is followed by RoundRobinPartitioning") {
val distribution = OrderedDistribution(SortOrder(Literal(1), Ascending) :: Nil)
val partitioning = RoundRobinPartitioning(5)
assert(!partitioning.satisfies(distribution))

val inputPlan = SortExec(SortOrder(Literal(1), Ascending) :: Nil,
global = true,
child = ShuffleExchangeExec(
partitioning,
DummySparkPlan(outputPartitioning = partitioning)))
val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan)
assert(outputPlan.find {
case ShuffleExchangeExec(_: RoundRobinPartitioning, _, _) => true
case _ => false
}.isEmpty,
"RoundRobinPartitioning should be changed to RangePartitioning")

val query = testData.select('key, 'value).repartition(2).sort('key.asc)
assert(query.rdd.getNumPartitions == 2)
assert(query.rdd.collectPartitions()(0).map(_.get(0)).toSeq == (1 to 50))
}

test("SPARK-30036: Remove unnecessary HashPartitioning " +
"if SortExec is followed by HashPartitioning") {
val distribution = OrderedDistribution(SortOrder(Literal(1), Ascending) :: Nil)
val partitioning = HashPartitioning(Literal(1) :: Nil, 5)
assert(!partitioning.satisfies(distribution))

val inputPlan = SortExec(SortOrder(Literal(1), Ascending) :: Nil,
global = true,
child = ShuffleExchangeExec(
partitioning,
DummySparkPlan(outputPartitioning = partitioning)))
val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan)
assert(outputPlan.find {
case ShuffleExchangeExec(_: HashPartitioning, _, _) => true
case _ => false
}.isEmpty,
"HashPartitioning should be changed to RangePartitioning")

val query = testData.select('key, 'value).repartition(5, 'key).sort('key.asc)
(cloud-fan marked this conversation as resolved.)
assert(query.rdd.getNumPartitions == 5)
assert(query.rdd.collectPartitions()(0).map(_.get(0)).toSeq == (1 to 20))
}
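The tail assertions in these two tests check the user-visible outcome: `repartition(n)` followed by `sort` keeps n partitions, and each resulting partition holds a contiguous key range. A plain-Scala stand-in for that behavior (`rangePartition` is an illustrative helper, not a Spark API):

```scala
// Splits keys into numPartitions contiguous, ordered ranges -- the shape the
// tests assert on the RDD (e.g. partition 0 of repartition(2).sort over keys
// 1..100 holds exactly 1..50). Assumes evenly divisible input for simplicity.
def rangePartition(keys: Seq[Int], numPartitions: Int): Seq[Seq[Int]] = {
  val sorted = keys.sorted
  val perPartition = math.ceil(sorted.size.toDouble / numPartitions).toInt
  sorted.grouped(perPartition).toSeq
}

val parts = rangePartition(1 to 100, 2)
// parts.size == 2; parts.head == (1 to 50)
```

This mirrors `query.rdd.getNumPartitions == 2` and the `collectPartitions()(0)` check above: the fix preserves the requested partition count while replacing the partitioning scheme with a range one.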

test("EnsureRequirements does not eliminate Exchange with different partitioning") {
val distribution = ClusteredDistribution(Literal(1) :: Nil)
val partitioning = HashPartitioning(Literal(2) :: Nil, 5)