apache · hvanhovell · Mar 30, 2023 · Jul 11, 2023 · Jul 11, 2023 · Jul 11, 2023
diff --git a/.../src/main/scala-2.12/org/apache/spark/sql/connect/client/arrow/ScalaCollectionUtils.scala b/.../src/main/scala-2.12/org/apache/spark/sql/connect/client/arrow/ScalaCollectionUtils.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.connect.client.arrow
+
+import scala.collection.generic.{GenericCompanion, GenMapFactory}
+import scala.collection.mutable
+import scala.reflect.ClassTag
+
+import org.apache.spark.sql.connect.client.arrow.ArrowDeserializers.resolveCompanion
+
+/**
+ * A couple of scala version specific collection utility functions.
+ */
+private[arrow] object ScalaCollectionUtils {
+  def getIterableCompanion(tag: ClassTag[_]): GenericCompanion[Iterable] = {
+    ArrowDeserializers.resolveCompanion[GenericCompanion[Iterable]](tag)
+  }
+  def getMapCompanion(tag: ClassTag[_]): GenMapFactory[Map] = {
+    resolveCompanion[GenMapFactory[Map]](tag)
+  }
+  def wrap[T](array: AnyRef): mutable.WrappedArray[T] = {
+    mutable.WrappedArray.make(array)
+  }
+}
diff --git a/.../src/main/scala-2.13/org/apache/spark/sql/connect/client/arrow/ScalaCollectionUtils.scala b/.../src/main/scala-2.13/org/apache/spark/sql/connect/client/arrow/ScalaCollectionUtils.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.connect.client.arrow
+
+import scala.collection.{mutable, IterableFactory, MapFactory}
+import scala.reflect.ClassTag
+
+import org.apache.spark.sql.connect.client.arrow.ArrowDeserializers.resolveCompanion
+
+/**
+ * A couple of scala version specific collection utility functions.
+ */
+private[arrow] object ScalaCollectionUtils {
+  def getIterableCompanion(tag: ClassTag[_]): IterableFactory[Iterable] = {
+    ArrowDeserializers.resolveCompanion[IterableFactory[Iterable]](tag)
+  }
+  def getMapCompanion(tag: ClassTag[_]): MapFactory[Map] = {
+    resolveCompanion[MapFactory[Map]](tag)
+  }
+  def wrap[T](array: AnyRef): mutable.WrappedArray[T] = {
+    mutable.WrappedArray.make(array.asInstanceOf[Array[T]])
+  }
+}
diff --git a/...r/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/...r/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala
@@ -16,53 +16,48 @@
  */
 package org.apache.spark.sql.connect.client
 
-import java.util.Collections
+import java.util.Objects
 
-import scala.collection.JavaConverters._
 import scala.collection.mutable
 
 import org.apache.arrow.memory.BufferAllocator
-import org.apache.arrow.vector.FieldVector
-import org.apache.arrow.vector.ipc.ArrowStreamReader
+import org.apache.arrow.vector.ipc.message.{ArrowMessage, ArrowRecordBatch}
+import org.apache.arrow.vector.types.pojo
 
 import org.apache.spark.connect.proto
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder}
 import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ProductEncoder, UnboundRowEncoder}
-import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.Deserializer
-import org.apache.spark.sql.catalyst.types.DataTypeUtils
-import org.apache.spark.sql.connect.client.util.{AutoCloseables, Cleanable}
+import org.apache.spark.sql.connect.client.arrow.{AbstractMessageIterator, ArrowDeserializingIterator, CloseableIterator, ConcatenatingArrowStreamReader, MessageIterator}
+import org.apache.spark.sql.connect.client.util.Cleanable
 import org.apache.spark.sql.connect.common.DataTypeProtoConverter
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.util.ArrowUtils
-import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector}
 
 private[sql] class SparkResult[T](
     responses: java.util.Iterator[proto.ExecutePlanResponse],
     allocator: BufferAllocator,
     encoder: AgnosticEncoder[T])
     extends AutoCloseable
-    with Cleanable {
+    with Cleanable { self =>
 
   private[this] var numRecords: Int = 0
   private[this] var structType: StructType = _
-  private[this] var boundEncoder: ExpressionEncoder[T] = _
-  private[this] var nextBatchIndex: Int = 0
-  private val idxToBatches = mutable.Map.empty[Int, ColumnarBatch]
-
-  private def createEncoder(schema: StructType): ExpressionEncoder[T] = {
-    val agnosticEncoder = createEncoder(encoder, schema).asInstanceOf[AgnosticEncoder[T]]
-    ExpressionEncoder(agnosticEncoder)
-  }
+  private[this] var arrowSchema: pojo.Schema = _
+  private[this] var nextResultIndex: Int = 0
+  private val resultMap = mutable.Map.empty[Int, (Long, Seq[ArrowMessage])]
 
   /**
    * Update RowEncoder and recursively update the fields of the ProductEncoder if found.
    */
-  private def createEncoder(enc: AgnosticEncoder[_], dataType: DataType): AgnosticEncoder[_] = {
+  private def createEncoder[E](
+      enc: AgnosticEncoder[E],
+      dataType: DataType): AgnosticEncoder[E] = {
     enc match {
       case UnboundRowEncoder =>
         // Replace the row encoder with the encoder inferred from the schema.
-        RowEncoder.encoderFor(dataType.asInstanceOf[StructType])
+        RowEncoder
+          .encoderFor(dataType.asInstanceOf[StructType])
+          .asInstanceOf[AgnosticEncoder[E]]
       case ProductEncoder(clsTag, fields) if ProductEncoder.isTuple(clsTag) =>
         // Recursively continue updating the tuple product encoder
         val schema = dataType.asInstanceOf[StructType]
@@ -76,61 +71,63 @@ private[sql] class SparkResult[T](
     }
   }
 
-  private def processResponses(stopOnFirstNonEmptyResponse: Boolean): Boolean = {
-    while (responses.hasNext) {
+  private def processResponses(
+      stopOnSchema: Boolean = false,
+      stopOnArrowSchema: Boolean = false,
+      stopOnFirstNonEmptyResponse: Boolean = false): Boolean = {
+    var nonEmpty = false
+    var stop = false
+    while (!stop && responses.hasNext) {
       val response = responses.next()
       if (response.hasSchema) {
         // The original schema should arrive before ArrowBatches.
         structType =
           DataTypeProtoConverter.toCatalystType(response.getSchema).asInstanceOf[StructType]
-      } else if (response.hasArrowBatch) {
+        stop |= stopOnSchema
+      }
+      if (response.hasArrowBatch) {
         val ipcStreamBytes = response.getArrowBatch.getData
-        val reader = new ArrowStreamReader(ipcStreamBytes.newInput(), allocator)
-        try {
-          val root = reader.getVectorSchemaRoot
-          if (structType == null) {
-            // If the schema is not available yet, fallback to the schema from Arrow.
-            structType = ArrowUtils.fromArrowSchema(root.getSchema)
-          }
-          // TODO: create encoders that directly operate on arrow vectors.
-          if (boundEncoder == null) {
-            boundEncoder = createEncoder(structType)
-              .resolveAndBind(DataTypeUtils.toAttributes(structType))
-          }
-          while (reader.loadNextBatch()) {
-            val rowCount = root.getRowCount
-            if (rowCount > 0) {
-              val vectors = root.getFieldVectors.asScala
-                .map(v => new ArrowColumnVector(transferToNewVector(v)))
-                .toArray[ColumnVector]
-              idxToBatches.put(nextBatchIndex, new ColumnarBatch(vectors, rowCount))
-              nextBatchIndex += 1
-              numRecords += rowCount
-              if (stopOnFirstNonEmptyResponse) {
-                return true
-              }
-            }
+        val reader = new MessageIterator(ipcStreamBytes.newInput(), allocator)
+        if (arrowSchema == null) {
+          arrowSchema = reader.schema
+          stop |= stopOnArrowSchema
+        } else if (arrowSchema != reader.schema) {
+          // Uh oh...
+        }
+        if (structType == null) {
+          // If the schema is not available yet, fallback to the arrow schema.
+          structType = ArrowUtils.fromArrowSchema(reader.schema)
+        }
+        var numRecordsInBatch = 0
+        val messages = Seq.newBuilder[ArrowMessage]
+        while (reader.hasNext) {
+          val message = reader.next()
+          message match {
+            case batch: ArrowRecordBatch =>
+              numRecordsInBatch += batch.getLength
+            case _ =>
           }
-        } finally {
-          reader.close()
+          messages += message
+        }
+        // Skip the entire result if it is empty.
+        if (numRecordsInBatch > 0) {
+          numRecords += numRecordsInBatch
+          resultMap.put(nextResultIndex, (reader.bytesRead, messages.result()))
+          nextResultIndex += 1
+          nonEmpty |= true
+          stop |= stopOnFirstNonEmptyResponse
         }
       }
     }
-    false
-  }
-
-  private def transferToNewVector(in: FieldVector): FieldVector = {
-    val pair = in.getTransferPair(allocator)
-    pair.transfer()
-    pair.getTo.asInstanceOf[FieldVector]
+    nonEmpty
   }
 
   /**
    * Returns the number of elements in the result.
    */
   def length: Int = {
     // We need to process all responses to make sure numRecords is correct.
-    processResponses(stopOnFirstNonEmptyResponse = false)
+    processResponses()
     numRecords
   }
 
@@ -139,7 +136,9 @@ private[sql] class SparkResult[T](
    *   the schema of the result.
    */
   def schema: StructType = {
-    processResponses(stopOnFirstNonEmptyResponse = true)
+    if (structType == null) {
+      processResponses(stopOnSchema = true)
+    }
     structType
   }
 
@@ -172,52 +171,93 @@ private[sql] class SparkResult[T](
 
   private def buildIterator(destructive: Boolean): java.util.Iterator[T] with AutoCloseable = {
     new java.util.Iterator[T] with AutoCloseable {
-      private[this] var batchIndex: Int = -1
-      private[this] var iterator: java.util.Iterator[InternalRow] = Collections.emptyIterator()
-      private[this] var deserializer: Deserializer[T] = _
+      private[this] var iterator: CloseableIterator[T] = _
 
-      override def hasNext: Boolean = {
-        if (iterator.hasNext) {
-          return true
-        }
-
-        val nextBatchIndex = batchIndex + 1
-        if (destructive) {
-          idxToBatches.remove(batchIndex).foreach(_.close())
+      private def initialize(): Unit = {
+        if (iterator == null) {
+          iterator = new ArrowDeserializingIterator(
+            createEncoder(encoder, schema),
+            new ConcatenatingArrowStreamReader(
+              allocator,
+              Iterator.single(new ResultMessageIterator(destructive)),
+              destructive))
         }
+      }
 
-        val hasNextBatch = if (!idxToBatches.contains(nextBatchIndex)) {
-          processResponses(stopOnFirstNonEmptyResponse = true)
-        } else {
-          true
-        }
-        if (hasNextBatch) {
-          batchIndex = nextBatchIndex
-          iterator = idxToBatches(nextBatchIndex).rowIterator()
-          if (deserializer == null) {
-            deserializer = boundEncoder.createDeserializer()
-          }
-        }
-        hasNextBatch
+      override def hasNext: Boolean = {
+        initialize()
+        iterator.hasNext
       }
 
       override def next(): T = {
-        if (!hasNext) {
-          throw new NoSuchElementException
-        }
-        deserializer(iterator.next())
+        initialize()
+        iterator.next()
       }
 
-      override def close(): Unit = SparkResult.this.close()
+      override def close(): Unit = {
+        if (iterator != null) {
+          iterator.close()
+        }
+      }
     }
   }
 
   /**
    * Close this result, freeing any underlying resources.
    */
-  override def close(): Unit = {
-    idxToBatches.values.foreach(_.close())
+  override def close(): Unit = cleaner.close()
+
+  override val cleaner: AutoCloseable = new SparkResultCloseable(resultMap)
+
+  private class ResultMessageIterator(destructive: Boolean) extends AbstractMessageIterator {
+    private[this] var totalBytesRead = 0L
+    private[this] var nextResultIndex = 0
+    private[this] var current: Iterator[ArrowMessage] = Iterator.empty
+
+    override def bytesRead: Long = totalBytesRead
+
+    override def schema: pojo.Schema = {
+      if (arrowSchema == null) {
+        // We need a schema to proceed. Spark Connect will always
+        // return a result (with a schema) even if the result is empty.
+        processResponses(stopOnArrowSchema = true)
+        Objects.requireNonNull(arrowSchema)
+      }
+      arrowSchema
+    }
+
+    override def hasNext: Boolean = {
+      if (current.hasNext) {
+        return true
+      }
+      val hasNextResult = if (!resultMap.contains(nextResultIndex)) {
+        self.processResponses(stopOnFirstNonEmptyResponse = true)
+      } else {
+        true
+      }
+      if (hasNextResult) {
+        val Some((sizeInBytes, messages)) = if (destructive) {
+          resultMap.remove(nextResultIndex)
+        } else {
+          resultMap.get(nextResultIndex)
+        }
+        totalBytesRead += sizeInBytes
+        current = messages.iterator
+        nextResultIndex += 1
+      }
+      hasNextResult
+    }
+
+    override def next(): ArrowMessage = {
+      if (!hasNext) {
+        throw new NoSuchElementException()
+      }
+      current.next()
+    }
   }
+}
 
-  override def cleaner: AutoCloseable = AutoCloseables(idxToBatches.values.toSeq)
+private[client] class SparkResultCloseable(resultMap: mutable.Map[Int, (Long, Seq[ArrowMessage])])
+    extends AutoCloseable {
+  override def close(): Unit = resultMap.values.foreach(_._2.foreach(_.close()))
 }