Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow returning an EmptyHashedRelation when a broadcast result is empty [databricks] #4256

Merged
merged 27 commits into from
Dec 9, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
26a8d37
Allow returning an EmptyHashedRelation when a broadcast result is empty
abellina Nov 17, 2021
6508550
Address review comments
abellina Dec 3, 2021
3f31a67
Revert change in integration tests
abellina Dec 6, 2021
b9684c2
Remove unwanted change
abellina Dec 6, 2021
0dffe9f
Minor cleanup
abellina Dec 6, 2021
5df6437
identity.length == 0 to identity.isEmpty
abellina Dec 6, 2021
d048334
Apply more suggested cleanup
abellina Dec 6, 2021
e021a00
Remove incRefCount
abellina Dec 6, 2021
01bd86f
EmptyHashedRelation was introduced in 3.1.x, so this fixes the shims
abellina Dec 7, 2021
282f402
Cache build schema outside of mapPartitions
abellina Dec 7, 2021
30beaf1
Fix bug with the broadcast helper + make a new test in join_test
abellina Dec 7, 2021
6c0d593
Adds a test that forces a broadcast for the EmptyHashedRelation scenario
abellina Dec 7, 2021
a3fc8ab
Fix typo
abellina Dec 7, 2021
4bba02f
Upmerge to 22.02
abellina Dec 7, 2021
2e83fa8
Fix typo
abellina Dec 7, 2021
f62167c
Adding isFoldableNonLitAllowed to UnaryExprMeta
abellina Dec 7, 2021
ee80225
Fix Spark 3.0.x build
abellina Dec 8, 2021
71b91f3
Also need to fix 30Xdb
abellina Dec 8, 2021
5ac0143
Move isEmptyRelation override to Spark31xdb
abellina Dec 8, 2021
012bdf0
Upmerge
abellina Dec 8, 2021
a41f97d
Disable in databricks and do some cleanup
abellina Dec 8, 2021
7fd4a20
Parametrize databricks so we don't request AQE when that is an invalid…
abellina Dec 8, 2021
6f9b75f
Cleanup
abellina Dec 8, 2021
a0704e2
Extra comment in RapidsMeta, take care of other review comments
abellina Dec 9, 2021
5c527eb
Fix import spacing
abellina Dec 9, 2021
438ff4c
Call the non-capturing assert
abellina Dec 9, 2021
34f2b59
Apply suggestion in GpuBroadcastHelper
abellina Dec 9, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.adaptive.BroadcastQueryStageExec
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuHashJoin, JoinTypeChecks, SerializeConcatHostBuffersDeserializeBatch}
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuBroadcastHelper, GpuHashJoin, JoinTypeChecks}
import org.apache.spark.sql.vectorized.ColumnarBatch

class GpuBroadcastHashJoinMeta(
Expand Down Expand Up @@ -148,16 +148,16 @@ case class GpuBroadcastHashJoinExec(

val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

val broadcastRelation = broadcastExchange
.executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
val broadcastRelation = broadcastExchange.executeColumnarBroadcast[Any]()

val rdd = streamedPlan.executeColumnar()
rdd.mapPartitions { it =>
val stIt = new CollectTimeIterator("broadcast join stream", it, streamTime)
val builtBatch = broadcastRelation.value.batch
GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
withResource(
GpuBroadcastHelper.getBroadcastBatch(broadcastRelation, buildPlan)) { builtBatch =>
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.BroadcastQueryStageExec
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuHashJoin, JoinTypeChecks, SerializeConcatHostBuffersDeserializeBatch}
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuBroadcastHelper, GpuHashJoin, JoinTypeChecks}
import org.apache.spark.sql.vectorized.ColumnarBatch

class GpuBroadcastHashJoinMeta(
Expand Down Expand Up @@ -147,16 +147,16 @@ case class GpuBroadcastHashJoinExec(

val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

val broadcastRelation = broadcastExchange
.executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
val broadcastRelation = broadcastExchange.executeColumnarBroadcast[Any]()

val rdd = streamedPlan.executeColumnar()
rdd.mapPartitions { it =>
val stIt = new CollectTimeIterator("broadcast join stream", it, streamTime)
val builtBatch = broadcastRelation.value.batch
GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
withResource(
GpuBroadcastHelper.getBroadcastBatch(broadcastRelation, buildPlan)) { builtBatch =>
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.BroadcastQueryStageExec
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuHashJoin, JoinTypeChecks, SerializeConcatHostBuffersDeserializeBatch}
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuBroadcastHelper, GpuHashJoin, JoinTypeChecks}
import org.apache.spark.sql.vectorized.ColumnarBatch

class GpuBroadcastHashJoinMeta(
Expand Down Expand Up @@ -149,16 +149,16 @@ case class GpuBroadcastHashJoinExec(

val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

val broadcastRelation = broadcastExchange
.executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
val broadcastRelation = broadcastExchange.executeColumnarBroadcast[Any]()

val rdd = streamedPlan.executeColumnar()
rdd.mapPartitions { it =>
val stIt = new CollectTimeIterator("broadcast join stream", it, streamTime)
val builtBatch = broadcastRelation.value.batch
GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
withResource(
GpuBroadcastHelper.getBroadcastBatch(broadcastRelation, buildPlan)) { builtBatch =>
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package com.nvidia.spark.rapids.shims.v2

import com.nvidia.spark.rapids._

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
Expand All @@ -28,7 +27,7 @@ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.adaptive.BroadcastQueryStageExec
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuHashJoin, JoinTypeChecks, SerializeConcatHostBuffersDeserializeBatch}
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExec, GpuBroadcastHelper, GpuHashJoin, JoinTypeChecks}
import org.apache.spark.sql.vectorized.ColumnarBatch

class GpuBroadcastHashJoinMeta(
Expand Down Expand Up @@ -148,16 +147,16 @@ case class GpuBroadcastHashJoinExec(

val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

val broadcastRelation = broadcastExchange
.executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
val broadcastRelation = broadcastExchange.executeColumnarBroadcast[Any]()

val rdd = streamedPlan.executeColumnar()
rdd.mapPartitions { it =>
val stIt = new CollectTimeIterator("broadcast join stream", it, streamTime)
val builtBatch = broadcastRelation.value.batch
GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
withResource(
GpuBroadcastHelper.getBroadcastBatch(broadcastRelation, buildPlan)) { builtBatch =>
doJoin(builtBatch, stIt, targetSize, spillCallback,
numOutputRows, joinOutputRows, numOutputBatches, opTime, joinTime)
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -303,28 +303,36 @@ abstract class GpuBroadcastExchangeExecBase(
// Setup a job group here so later it may get cancelled by groupId if necessary.
sparkContext.setJobGroup(_runId.toString, s"broadcast exchange (runId ${_runId})",
interruptOnCancel = true)
val batch = withResource(new NvtxWithMetrics("broadcast collect", NvtxColor.GREEN,
collectTime)) { _ =>
val data = child.executeColumnar().map(cb => try {
new SerializeBatchDeserializeHostBuffer(cb)
} finally {
cb.close()
})
val d = data.collect()
new SerializeConcatHostBuffersDeserializeBatch(d, output)
}

val numRows = batch.numRows
checkRowLimit(numRows)
numOutputBatches += 1
numOutputRows += numRows
var dataSize = 0L
val broadcastResult =
withResource(new NvtxWithMetrics("broadcast collect", NvtxColor.GREEN,
collectTime)) { _ =>
val childRdd = child.executeColumnar()
val data = childRdd.map(cb => try {
new SerializeBatchDeserializeHostBuffer(cb)
} finally {
cb.close()
})
val d = data.collect()
if (d.length == 0) {
// This call for `HashedRelationBroadcastMode` produces
// `EmptyHashedRelation` allowing the AQE rule `EliminateJoinToEmptyRelation` to
// optimize out our parent join given that this is an empty broadcast result.
mode.transform(Iterator.empty, None)
} else {
val batch = new SerializeConcatHostBuffersDeserializeBatch(d, output)
val numRows = batch.numRows
checkRowLimit(numRows)
numOutputBatches += 1
numOutputRows += numRows
batch
}
}

withResource(new NvtxWithMetrics("broadcast build", NvtxColor.DARK_GREEN,
buildTime)) { _ =>
// we only support hashjoin so this is a noop
// val relation = mode.transform(input, Some(numRows))
val dataSize = batch.dataSize

gpuLongMetric("dataSize") += dataSize
if (dataSize >= MAX_BROADCAST_TABLE_BYTES) {
throw new SparkException(
Expand All @@ -335,7 +343,7 @@ abstract class GpuBroadcastExchangeExecBase(
val broadcasted = withResource(new NvtxWithMetrics("broadcast", NvtxColor.CYAN,
broadcastTime)) { _ =>
// Broadcast the relation
sparkContext.broadcast(batch.asInstanceOf[Any])
sparkContext.broadcast(broadcastResult.asInstanceOf[Any])
}

SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.rapids.execution

import com.nvidia.spark.rapids.GpuColumnVector

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.joins.EmptyHashedRelation
import org.apache.spark.sql.vectorized.ColumnarBatch

object GpuBroadcastHelper {
  /**
   * Given a broadcast relation, get a `ColumnarBatch` that can be used on the GPU.
   *
   * The broadcast relation may or may not contain any data, so the empty cases
   * are special-cased: `EmptyHashedRelation` (produced by
   * `HashedRelationBroadcastMode` for hash joins) and an empty `Array`
   * (produced by `IdentityBroadcastMode`, as used by broadcast nested loop
   * joins) both yield an empty batch built from the broadcast plan's schema.
   *
   * If a broadcast result of any other type is received we throw. No other
   * cases are known at this moment, so this is a defensive measure.
   *
   * @param broadcastRelation the broadcast as produced by a broadcast exchange
   * @param broadcastPlan the SparkPlan used to obtain the schema for the
   *                      empty-batch cases
   * @return a `ColumnarBatch`, or throw if the broadcast can't be handled
   */
  def getBroadcastBatch(broadcastRelation: Broadcast[Any],
      broadcastPlan: SparkPlan): ColumnarBatch = {
    broadcastRelation.value match {
      case broadcastBatch: SerializeConcatHostBuffersDeserializeBatch =>
        val builtBatch = broadcastBatch.batch
        // The caller takes ownership of the returned batch, so bump the
        // reference counts; the broadcast keeps its own reference to the
        // underlying columns.
        GpuColumnVector.incRefCounts(builtBatch)
        builtBatch
      case EmptyHashedRelation =>
        // Empty build side of a hash join: return an empty batch with the
        // build-side schema so downstream join code can proceed uniformly.
        GpuColumnVector.emptyBatch(broadcastPlan.schema)
      case identity: Array[_] if identity.isEmpty =>
        // A broadcast nested loop join uses `IdentityBroadcastMode`, which when
        // transformed can produce an Array[InternalRow]. Only the empty result
        // is handled here; non-empty results are expected to arrive as
        // `SerializeConcatHostBuffersDeserializeBatch`.
        // NOTE: `Array[_]` (rather than `Array[Any]`) avoids relying on the
        // erased element type of the array.
        GpuColumnVector.emptyBatch(broadcastPlan.schema)
      case t =>
        throw new IllegalStateException(s"Invalid broadcast batch received $t")
    }
  }

  /**
   * Given a broadcast relation, get the number of rows that the received batch
   * contains.
   *
   * The broadcast relation may or may not contain any data, so the empty
   * relation cases (hash or identity, depending on the type of join) report
   * zero rows without materializing a batch.
   *
   * If a broadcast result of any other type is received we throw. No other
   * cases are known at this moment, so this is a defensive measure.
   *
   * @param broadcastRelation the broadcast as produced by a broadcast exchange
   * @return number of rows for the batch received, or 0 for an empty relation
   */
  def getBroadcastBatchNumRows(broadcastRelation: Broadcast[Any]): Int = {
    broadcastRelation.value match {
      case broadcastBatch: SerializeConcatHostBuffersDeserializeBatch =>
        broadcastBatch.batch.numRows()
      case EmptyHashedRelation => 0
      case identity: Array[_] if identity.isEmpty => 0
      case t =>
        throw new IllegalStateException(s"Invalid broadcast batch received $t")
    }
  }
}

Original file line number Diff line number Diff line change
Expand Up @@ -435,23 +435,26 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
}

private[this] def makeBuiltBatch(
broadcastRelation: Broadcast[SerializeConcatHostBuffersDeserializeBatch],
broadcastRelation: Broadcast[Any],
buildTime: GpuMetric,
buildDataSize: GpuMetric): ColumnarBatch = {
withResource(new NvtxWithMetrics("build join table", NvtxColor.GREEN, buildTime)) { _ =>
val ret = broadcastRelation.value.batch
buildDataSize += GpuColumnVector.getTotalDeviceMemoryUsed(ret)
GpuColumnVector.incRefCounts(ret)
withResource(GpuBroadcastHelper.getBroadcastBatch(
broadcastRelation, broadcast)) { builtBatch =>
GpuColumnVector.incRefCounts(builtBatch)
buildDataSize += GpuColumnVector.getTotalDeviceMemoryUsed(builtBatch)
builtBatch
}
}
}

private[this] def computeBuildRowCount(
broadcastRelation: Broadcast[SerializeConcatHostBuffersDeserializeBatch],
broadcastRelation: Broadcast[Any],
buildTime: GpuMetric,
buildDataSize: GpuMetric): Int = {
withResource(new NvtxWithMetrics("build join table", NvtxColor.GREEN, buildTime)) { _ =>
buildDataSize += 0
broadcastRelation.value.batch.numRows()
GpuBroadcastHelper.getBroadcastBatchNumRows(broadcastRelation)
}
}

Expand All @@ -468,7 +471,7 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
}

val broadcastRelation =
broadcastExchange.executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
broadcastExchange.executeColumnarBroadcast[Any]()

val joinCondition = boundCondition.orElse {
// For outer joins use a true condition if there are any columns in the build side
Expand All @@ -489,7 +492,7 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
}

private def leftExistenceJoin(
broadcastRelation: Broadcast[SerializeConcatHostBuffersDeserializeBatch],
broadcastRelation: Broadcast[Any],
exists: Boolean,
buildTime: GpuMetric,
buildDataSize: GpuMetric): RDD[ColumnarBatch] = {
Expand All @@ -504,9 +507,7 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
}
}

private def doUnconditionalJoin(
broadcastRelation: Broadcast[SerializeConcatHostBuffersDeserializeBatch]
): RDD[ColumnarBatch] = {
private def doUnconditionalJoin(broadcastRelation: Broadcast[Any]): RDD[ColumnarBatch] = {
if (output.isEmpty) {
doUnconditionalJoinRowCount(broadcastRelation)
} else {
Expand Down Expand Up @@ -565,9 +566,7 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
}

/** Special-case handling of an unconditional join that just needs to output a row count. */
private def doUnconditionalJoinRowCount(
broadcastRelation: Broadcast[SerializeConcatHostBuffersDeserializeBatch]
): RDD[ColumnarBatch] = {
private def doUnconditionalJoinRowCount(broadcastRelation: Broadcast[Any]): RDD[ColumnarBatch] = {
if (joinType == LeftAnti) {
// degenerate case, no rows are returned.
left.executeColumnar().mapPartitions { _ =>
Expand Down Expand Up @@ -604,13 +603,13 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
}

private def doConditionalJoin(
broadcastRelation: Broadcast[SerializeConcatHostBuffersDeserializeBatch],
broadcastRelation: Broadcast[Any],
boundCondition: Option[GpuExpression],
numFirstTableColumns: Int): RDD[ColumnarBatch] = {
val buildTime = gpuLongMetric(BUILD_TIME)
val buildDataSize = gpuLongMetric(BUILD_DATA_SIZE)
val spillCallback = GpuMetric.makeSpillCallback(allMetrics)
lazy val builtBatch = makeBuiltBatch(broadcastRelation, buildTime, buildDataSize)

val streamAttributes = streamed.output
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
Expand All @@ -619,7 +618,6 @@ abstract class GpuBroadcastNestedLoopJoinExecBase(
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)
val nestedLoopJoinType = joinType
val buildSide = getGpuBuildSide
val spillCallback = GpuMetric.makeSpillCallback(allMetrics)
streamed.executeColumnar().mapPartitions { streamedIter =>
val lazyStream = streamedIter.map { cb =>
withResource(cb) { cb =>
Expand Down