From ccc81fd85cd873bccb83a8baeb6c00070fe66e46 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Thu, 30 Mar 2023 13:27:33 -0400 Subject: [PATCH 01/16] Add direct arrow serialization --- connector/connect/client/jvm/pom.xml | 13 + .../org/apache/spark/sql/SparkSession.scala | 5 +- .../sql/connect/client/SparkResult.scala | 1 - .../client/arrow/ArrowEncoderUtils.scala | 53 ++ .../client/arrow/ArrowSerializer.scala | 529 +++++++++++ .../client/arrow/ArrowEncoderSuite.scala | 837 ++++++++++++++++++ .../sql/catalyst/JavaTypeInferenceSuite.scala | 9 +- 7 files changed, 1443 insertions(+), 4 deletions(-) create mode 100644 connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderUtils.scala create mode 100644 connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala create mode 100644 connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index 8543057d0c0d1..1be149803c9b0 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -120,6 +120,19 @@ + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + com.google.guava + guava + + + org.scalacheck scalacheck_${scala.binary.version} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala index 548545b969d5a..da8852a97d125 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -33,7 +33,8 @@ import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BoxedLongEncoder, UnboundRowEncoder} import org.apache.spark.sql.connect.client.{SparkConnectClient, SparkResult} -import org.apache.spark.sql.connect.client.util.{Cleaner, ConvertToArrow} +import org.apache.spark.sql.connect.client.arrow.ArrowSerializer +import org.apache.spark.sql.connect.client.util.Cleaner import org.apache.spark.sql.types.StructType /** @@ -118,7 +119,7 @@ class SparkSession private[sql] ( .setSchema(encoder.schema.json) if (data.nonEmpty) { val timeZoneId = conf.get("spark.sql.session.timeZone") - val arrowData = ConvertToArrow(encoder, data, timeZoneId, allocator) + val arrowData = ArrowSerializer.serialize(data, encoder, allocator, timeZoneId) localRelationBuilder.setData(arrowData) } } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala index 80db558918bba..39aed614e3f48 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala @@ -72,7 +72,6 @@ private[sql] class SparkResult[T]( } while (reader.loadNextBatch()) { val rowCount = root.getRowCount - assert(root.getRowCount == response.getArrowBatch.getRowCount) // HUH! 
if (rowCount > 0) { val vectors = root.getFieldVectors.asScala .map(v => new ArrowColumnVector(transferToNewVector(v))) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderUtils.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderUtils.scala new file mode 100644 index 0000000000000..d022d3005b5ff --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderUtils.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client.arrow + +import scala.collection.JavaConverters._ +import scala.reflect.ClassTag + +import org.apache.arrow.vector.{FieldVector, VectorSchemaRoot} +import org.apache.arrow.vector.complex.StructVector + +private[arrow] object ArrowEncoderUtils { + object Classes { + val WRAPPED_ARRAY: Class[_] = classOf[scala.collection.mutable.WrappedArray[_]] + val ITERABLE: Class[_] = classOf[scala.collection.Iterable[_]] + val SEQ: Class[_] = classOf[scala.collection.Seq[_]] + val SET: Class[_] = classOf[scala.collection.Set[_]] + val MAP: Class[_] = classOf[scala.collection.Map[_, _]] + val JLIST: Class[_] = classOf[java.util.List[_]] + val JMAP: Class[_] = classOf[java.util.Map[_, _]] + } + + def isSubClass(cls: Class[_], tag: ClassTag[_]): Boolean = { + cls.isAssignableFrom(tag.runtimeClass) + } + + def unsupportedCollectionType(cls: Class[_]): Nothing = { + throw new RuntimeException(s"Unsupported collection type: $cls") + } +} + +trait CloseableIterator[E] extends Iterator[E] with AutoCloseable + +private[arrow] object StructVectors { + def unapply(v: AnyRef): Option[(StructVector, Seq[FieldVector])] = v match { + case root: VectorSchemaRoot => Option((null, root.getFieldVectors.asScala)) + case struct: StructVector => Option((struct, struct.getChildrenFromFields.asScala)) + case _ => None + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala new file mode 100644 index 0000000000000..038e6a49516cd --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala @@ -0,0 +1,529 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client.arrow + +import java.io.{ByteArrayOutputStream, OutputStream} +import java.lang.invoke.{MethodHandles, MethodType} +import java.math.{BigDecimal => JBigDecimal, BigInteger => JBigInteger} +import java.nio.channels.Channels +import java.time.{Duration, Instant, LocalDate, LocalDateTime, Period} +import java.util.{Map => JMap} + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import com.google.protobuf.ByteString +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, DurationVector, FieldVector, Float4Vector, Float8Vector, IntervalYearVector, IntVector, NullVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, VarBinaryVector, VarCharVector, VectorSchemaRoot, VectorUnloader} +import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector} +import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel} +import org.apache.arrow.vector.ipc.message.{IpcOption, MessageSerializer} +import org.apache.arrow.vector.util.Text + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.DefinedByConstructorParams +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders._ +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.types.Decimal +import org.apache.spark.sql.util.ArrowUtils + +/** + * Helper class for converting user objects into arrow batches. + */ +class ArrowSerializer[T]( + private[this] val enc: AgnosticEncoder[T], + private[this] val allocator: BufferAllocator, + private[this] val timeZoneId: String) { + private val (root, serializer) = ArrowSerializer.serializerFor(enc, allocator, timeZoneId) + private val vectors = root.getFieldVectors.asScala + private val unloader = new VectorUnloader(root) + private val schemaBytes = { + // Only serialize the schema once. + val bytes = new ByteArrayOutputStream() + MessageSerializer.serialize(newChannel(bytes), root.getSchema) + bytes.toByteArray + } + private var i: Int = 0 + + private def newChannel(output: OutputStream): WriteChannel = { + new WriteChannel(Channels.newChannel(output)) + } + + /** + * The size of the current batch. + * + * The size computed consist of the size of the schema and the size of the arrow buffers. The + * actual batch will be larger than that because of alignment, written IPC tokens, and the + * written record batch metadata. The size of the record batch metadata is proportional to the + * complexity of the schema. + */ + def sizeInBytes: Long = { + // We need to set the row count for getBufferSize to return the actual value. + root.setRowCount(i) + schemaBytes.length + vectors.map(_.getBufferSize).sum + } + + /** + * Append a record to the current batch. 
+ */ + def append(record: T): Unit = { + serializer.write(i, record) + i += 1 + } + + /** + * Write the schema and the current batch in Arrow IPC stream format to the [[OutputStream]]. + */ + def writeIpcStream(output: OutputStream): Unit = { + val channel = newChannel(output) + root.setRowCount(i) + val batch = unloader.getRecordBatch + try { + channel.write(schemaBytes) + MessageSerializer.serialize(channel, batch) + ArrowStreamWriter.writeEndOfStream(channel, IpcOption.DEFAULT) + } finally { + batch.close() + } + } + + /** + * Reset the serializer. + */ + def reset(): Unit = { + i = 0 + vectors.foreach(_.reset()) + } + + /** + * Close the serializer. + */ + def close(): Unit = root.close() +} + +object ArrowSerializer { + import ArrowEncoderUtils._ + + /** + * Create an [[Iterator]] that converts the input [[Iterator]] of type `T` into an [[Iterator]] + * of Arrow IPC Streams. + */ + def serialize[T]( + input: Iterator[T], + enc: AgnosticEncoder[T], + allocator: BufferAllocator, + maxRecordsPerBatch: Int, + maxBatchSize: Long, + timeZoneId: String, + batchSizeCheckInterval: Int = 128): CloseableIterator[Array[Byte]] = { + assert(maxRecordsPerBatch > 0) + assert(maxBatchSize > 0) + assert(batchSizeCheckInterval > 0) + new CloseableIterator[Array[Byte]] { + private val serializer = new ArrowSerializer[T](enc, allocator, timeZoneId) + private val bytes = new ByteArrayOutputStream + private var hasWrittenFirstBatch = false + + /** + * Periodical check to make sure we don't go over the size threshold by too much. + */ + private def sizeOk(i: Int): Boolean = { + if (i > 0 && i % batchSizeCheckInterval == 0) { + return serializer.sizeInBytes < maxBatchSize + } + true + } + + override def hasNext: Boolean = input.hasNext || !hasWrittenFirstBatch + + override def next(): Array[Byte] = { + if (!hasNext) { + throw new NoSuchElementException() + } + serializer.reset() + bytes.reset() + var i = 0 + while (i < maxRecordsPerBatch && input.hasNext && sizeOk(i)) { + serializer.append(input.next()) + i += 1 + } + serializer.writeIpcStream(bytes) + hasWrittenFirstBatch = true + bytes.toByteArray + } + + override def close(): Unit = serializer.close() + } + } + + def serialize[T]( + input: Iterator[T], + enc: AgnosticEncoder[T], + allocator: BufferAllocator, + timeZoneId: String): ByteString = { + val serializer = new ArrowSerializer[T](enc, allocator, timeZoneId) + serializer.reset() + input.foreach(serializer.append) + val output = ByteString.newOutput() + serializer.writeIpcStream(output) + output.toByteString + } + + /** + * Create a (root) [[Serializer]] for [[AgnosticEncoder]] `encoder`. + * + * The serializer returned by this method is NOT thread-safe. + */ + def serializerFor[T]( + encoder: AgnosticEncoder[T], + allocator: BufferAllocator, + timeZoneId: String): (VectorSchemaRoot, Serializer) = { + val arrowSchema = ArrowUtils.toArrowSchema(encoder.schema, timeZoneId) + val root = VectorSchemaRoot.create(arrowSchema, allocator) + val serializer = if (encoder.schema != encoder.dataType) { + assert(root.getSchema.getFields.size() == 1) + serializerFor(encoder, root.getVector(0)) + } else { + serializerFor(encoder, root) + } + root -> serializer + } + + // TODO throw better errors on class cast exceptions. 
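  // Editor's usage sketch (not part of this patch), placed before the per-encoder
  // serializerFor below: it shows how the chunked serialize overload above could be
  // driven from an AgnosticEncoder. The names `exampleUsage` and the (Long, String)
  // tuple encoder are illustrative assumptions, not code introduced by this change.
  private def exampleUsage(): Unit = {
    val allocator = new org.apache.arrow.memory.RootAllocator()
    val encoder =
      org.apache.spark.sql.catalyst.ScalaReflection.encoderFor[(Long, String)]
    // Each element produced by the iterator is a complete Arrow IPC stream.
    val batches = ArrowSerializer.serialize(
      input = Iterator.tabulate(10)(i => (i.toLong, "name-" + i)),
      enc = encoder,
      allocator = allocator,
      maxRecordsPerBatch = 1024,
      maxBatchSize = 64 * 1024,
      timeZoneId = "UTC")
    try {
      batches.foreach(batch => println(s"wrote an IPC stream of ${batch.length} bytes"))
    } finally {
      batches.close()
      allocator.close()
    }
  }
  // The ByteString overload above does the same work in a single batch; it is what
  // SparkSession.createDataset uses to populate a LocalRelation in this patch.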
+ private[arrow] def serializerFor[E](encoder: AgnosticEncoder[E], v: AnyRef): Serializer = { + (encoder, v) match { + case (PrimitiveBooleanEncoder | BoxedBooleanEncoder, v: BitVector) => + new FieldSerializer[Boolean, BitVector](v) { + override def set(index: Int, value: Boolean): Unit = + vector.setSafe(index, if (value) 1 else 0) + } + case (PrimitiveByteEncoder | BoxedByteEncoder, v: TinyIntVector) => + new FieldSerializer[Byte, TinyIntVector](v) { + override def set(index: Int, value: Byte): Unit = vector.setSafe(index, value) + } + case (PrimitiveShortEncoder | BoxedShortEncoder, v: SmallIntVector) => + new FieldSerializer[Short, SmallIntVector](v) { + override def set(index: Int, value: Short): Unit = vector.setSafe(index, value) + } + case (PrimitiveIntEncoder | BoxedIntEncoder, v: IntVector) => + new FieldSerializer[Int, IntVector](v) { + override def set(index: Int, value: Int): Unit = vector.setSafe(index, value) + } + case (PrimitiveLongEncoder | BoxedLongEncoder, v: BigIntVector) => + new FieldSerializer[Long, BigIntVector](v) { + override def set(index: Int, value: Long): Unit = vector.setSafe(index, value) + } + case (PrimitiveFloatEncoder | BoxedFloatEncoder, v: Float4Vector) => + new FieldSerializer[Float, Float4Vector](v) { + override def set(index: Int, value: Float): Unit = vector.setSafe(index, value) + } + case (PrimitiveDoubleEncoder | BoxedDoubleEncoder, v: Float8Vector) => + new FieldSerializer[Double, Float8Vector](v) { + override def set(index: Int, value: Double): Unit = vector.setSafe(index, value) + } + case (NullEncoder, v: NullVector) => + new FieldSerializer[Unit, NullVector](v) { + override def set(index: Int, value: Unit): Unit = vector.setNull(index) + } + case (StringEncoder, v: VarCharVector) => + new FieldSerializer[String, VarCharVector](v) { + override def set(index: Int, value: String): Unit = setString(v, index, value) + } + case (JavaEnumEncoder(_), v: VarCharVector) => + new FieldSerializer[Enum[_], VarCharVector](v) { + override def set(index: Int, value: Enum[_]): Unit = setString(v, index, value.name()) + } + case (ScalaEnumEncoder(_, _), v: VarCharVector) => + new FieldSerializer[Enumeration#Value, VarCharVector](v) { + override def set(index: Int, value: Enumeration#Value): Unit = + setString(v, index, value.toString) + } + case (BinaryEncoder, v: VarBinaryVector) => + new FieldSerializer[Array[Byte], VarBinaryVector](v) { + override def set(index: Int, value: Array[Byte]): Unit = vector.setSafe(index, value) + } + case (SparkDecimalEncoder(_), v: DecimalVector) => + new FieldSerializer[Decimal, DecimalVector](v) { + override def set(index: Int, value: Decimal): Unit = + setDecimal(vector, index, value.toJavaBigDecimal) + } + case (ScalaDecimalEncoder(_), v: DecimalVector) => + new FieldSerializer[BigDecimal, DecimalVector](v) { + override def set(index: Int, value: BigDecimal): Unit = + setDecimal(vector, index, value.bigDecimal) + } + case (JavaDecimalEncoder(_, false), v: DecimalVector) => + new FieldSerializer[JBigDecimal, DecimalVector](v) { + override def set(index: Int, value: JBigDecimal): Unit = + setDecimal(vector, index, value) + } + case (JavaDecimalEncoder(_, true), v: DecimalVector) => + new FieldSerializer[Any, DecimalVector](v) { + override def set(index: Int, value: Any): Unit = { + val decimal = value match { + case j: JBigDecimal => j + case d: BigDecimal => d.bigDecimal + case k: BigInt => new JBigDecimal(k.bigInteger) + case l: JBigInteger => new JBigDecimal(l) + case d: Decimal => d.toJavaBigDecimal + } + 
setDecimal(vector, index, decimal) + } + } + case (ScalaBigIntEncoder, v: DecimalVector) => + new FieldSerializer[BigInt, DecimalVector](v) { + override def set(index: Int, value: BigInt): Unit = + setDecimal(vector, index, new JBigDecimal(value.bigInteger)) + } + case (JavaBigIntEncoder, v: DecimalVector) => + new FieldSerializer[JBigInteger, DecimalVector](v) { + override def set(index: Int, value: JBigInteger): Unit = + setDecimal(vector, index, new JBigDecimal(value)) + } + case (DayTimeIntervalEncoder, v: DurationVector) => + new FieldSerializer[Duration, DurationVector](v) { + override def set(index: Int, value: Duration): Unit = + vector.setSafe(index, IntervalUtils.durationToMicros(value)) + } + case (YearMonthIntervalEncoder, v: IntervalYearVector) => + new FieldSerializer[Period, IntervalYearVector](v) { + override def set(index: Int, value: Period): Unit = + vector.setSafe(index, IntervalUtils.periodToMonths(value)) + } + case (DateEncoder(true) | LocalDateEncoder(true), v: DateDayVector) => + new FieldSerializer[Any, DateDayVector](v) { + override def set(index: Int, value: Any): Unit = + vector.setSafe(index, DateTimeUtils.anyToDays(value)) + } + case (DateEncoder(false), v: DateDayVector) => + new FieldSerializer[java.sql.Date, DateDayVector](v) { + override def set(index: Int, value: java.sql.Date): Unit = + vector.setSafe(index, DateTimeUtils.fromJavaDate(value)) + } + case (LocalDateEncoder(false), v: DateDayVector) => + new FieldSerializer[LocalDate, DateDayVector](v) { + override def set(index: Int, value: LocalDate): Unit = + vector.setSafe(index, DateTimeUtils.localDateToDays(value)) + } + case (TimestampEncoder(true) | InstantEncoder(true), v: TimeStampMicroTZVector) => + new FieldSerializer[Any, TimeStampMicroTZVector](v) { + override def set(index: Int, value: Any): Unit = + vector.setSafe(index, DateTimeUtils.anyToMicros(value)) + } + case (TimestampEncoder(false), v: TimeStampMicroTZVector) => + new FieldSerializer[java.sql.Timestamp, TimeStampMicroTZVector](v) { + override def set(index: Int, value: java.sql.Timestamp): Unit = + vector.setSafe(index, DateTimeUtils.fromJavaTimestamp(value)) + } + case (InstantEncoder(false), v: TimeStampMicroTZVector) => + new FieldSerializer[Instant, TimeStampMicroTZVector](v) { + override def set(index: Int, value: Instant): Unit = + vector.setSafe(index, DateTimeUtils.instantToMicros(value)) + } + case (LocalDateTimeEncoder, v: TimeStampMicroVector) => + new FieldSerializer[LocalDateTime, TimeStampMicroVector](v) { + override def set(index: Int, value: LocalDateTime): Unit = + vector.setSafe(index, DateTimeUtils.localDateTimeToMicros(value)) + } + + case (OptionEncoder(value), v) => + new Serializer { + private[this] val delegate: Serializer = serializerFor(value, v) + override def write(index: Int, value: Any): Unit = value match { + case Some(value) => delegate.write(index, value) + case _ => delegate.write(index, null) + } + } + + case (ArrayEncoder(element, _), v: ListVector) => + val elementSerializer = serializerFor(element, v.getDataVector) + val toIterator = { array: Any => + mutable.WrappedArray.make(array.asInstanceOf[AnyRef]).iterator + } + new ArraySerializer(v, toIterator, elementSerializer) + + case (IterableEncoder(tag, element, _, lenient), v: ListVector) => + val elementSerializer = serializerFor(element, v.getDataVector) + val toIterator: Any => Iterator[_] = if (lenient) { + { + case i: scala.collection.Iterable[_] => i.toIterator + case l: java.util.List[_] => l.iterator().asScala + case a: Array[_] => 
a.iterator + case o => unsupportedCollectionType(o.getClass) + } + } else if (isSubClass(Classes.ITERABLE, tag)) { v => + v.asInstanceOf[scala.collection.Iterable[_]].toIterator + } else if (isSubClass(Classes.JLIST, tag)) { v => + v.asInstanceOf[java.util.List[_]].iterator().asScala + } else { + unsupportedCollectionType(tag.runtimeClass) + } + new ArraySerializer(v, toIterator, elementSerializer) + + case (MapEncoder(tag, key, value, _), v: MapVector) => + val structVector = v.getDataVector.asInstanceOf[StructVector] + val extractor = if (isSubClass(classOf[scala.collection.Map[_, _]], tag)) { (v: Any) => + v.asInstanceOf[scala.collection.Map[_, _]].iterator + } else if (isSubClass(classOf[JMap[_, _]], tag)) { (v: Any) => + v.asInstanceOf[JMap[Any, Any]].asScala.iterator + } else { + unsupportedCollectionType(tag.runtimeClass) + } + val structSerializer = new StructSerializer( + structVector, + new StructFieldSerializer( + (v: Any) => v.asInstanceOf[(Any, Any)]._1, + serializerFor(key, structVector.getChild(MapVector.KEY_NAME))) :: + new StructFieldSerializer( + (v: Any) => v.asInstanceOf[(Any, Any)]._2, + serializerFor(value, structVector.getChild(MapVector.VALUE_NAME))) :: Nil) + new ArraySerializer(v, extractor, structSerializer) + + case (ProductEncoder(tag, fields), StructVectors(struct, vectors)) => + if (isSubClass(classOf[Product], tag)) { + structSerializerFor(fields, struct, vectors) { (_, i) => p => + p.asInstanceOf[Product].productElement(i) + } + } else if (isSubClass(classOf[DefinedByConstructorParams], tag)) { + structSerializerFor(fields, struct, vectors) { (field, _) => + val getter = methodLookup.findVirtual( + tag.runtimeClass, + field.name, + MethodType.methodType(field.enc.clsTag.runtimeClass)) + o => getter.invoke(o) + } + } else { + unsupportedCollectionType(tag.runtimeClass) + } + + case (RowEncoder(fields), StructVectors(struct, vectors)) => + structSerializerFor(fields, struct, vectors) { (_, i) => r => r.asInstanceOf[Row].get(i) } + + case (JavaBeanEncoder(tag, fields), StructVectors(struct, vectors)) => + structSerializerFor(fields, struct, vectors) { (field, _) => + val getter = methodLookup.findVirtual( + tag.runtimeClass, + field.readMethod.get, + MethodType.methodType(field.enc.clsTag.runtimeClass)) + o => getter.invoke(o) + } + + case (CalendarIntervalEncoder | _: UDTEncoder[_], _) => + throw QueryExecutionErrors.unsupportedDataTypeError(encoder.dataType) + + case _ => + throw new RuntimeException(s"Unsupported Encoder($encoder)/Vector($v) combination.") + } + } + + private val methodLookup = MethodHandles.lookup() + + private def setString(vector: VarCharVector, index: Int, string: String): Unit = { + val bytes = Text.encode(string) + vector.setSafe(index, bytes, 0, bytes.limit()) + } + + private def setDecimal(vector: DecimalVector, index: Int, decimal: JBigDecimal): Unit = { + val scaledDecimal = if (vector.getScale != decimal.scale()) { + decimal.setScale(vector.getScale) + } else { + decimal + } + vector.setSafe(index, scaledDecimal) + } + + private def structSerializerFor( + fields: Seq[EncoderField], + struct: StructVector, + vectors: Seq[FieldVector])( + createGetter: (EncoderField, Int) => Any => Any): StructSerializer = { + require(fields.size == vectors.size) + val serializers = fields.zip(vectors).zipWithIndex.map { case ((field, vector), i) => + val serializer = serializerFor(field.enc, vector) + new StructFieldSerializer(createGetter(field, i), serializer) + } + new StructSerializer(struct, serializers) + } + + abstract class Serializer { + 
def write(index: Int, value: Any): Unit + } + + private abstract class FieldSerializer[E, V <: FieldVector](val vector: V) extends Serializer { + private[this] val nullable = vector.getField.isNullable + def set(index: Int, value: E): Unit + + override def write(index: Int, raw: Any): Unit = { + val value = raw.asInstanceOf[E] + if (value != null) { + set(index, value) + } else if (nullable) { + vector.setNull(index) + } else { + throw new NullPointerException() + } + } + } + + private class ArraySerializer( + v: ListVector, + toIterator: Any => Iterator[Any], + elementSerializer: Serializer) + extends FieldSerializer[Any, ListVector](v) { + override def set(index: Int, value: Any): Unit = { + val elementStartIndex = vector.startNewValue(index) + var elementIndex = elementStartIndex + val iterator = toIterator(value) + while (iterator.hasNext) { + elementSerializer.write(elementIndex, iterator.next()) + elementIndex += 1 + } + vector.endValue(index, elementIndex - elementStartIndex) + } + } + + private class StructFieldSerializer(val extractor: Any => Any, val serializer: Serializer) { + def write(index: Int, value: Any): Unit = serializer.write(index, extractor(value)) + def writeNull(index: Int): Unit = serializer.write(index, null) + } + + private class StructSerializer( + struct: StructVector, + fieldSerializers: Seq[StructFieldSerializer]) + extends Serializer { + private[this] val nullable = struct != null && struct.getField.isNullable + + override def write(index: Int, value: Any): Unit = { + if (value == null) { + if (!nullable) { + throw new NullPointerException() + } + if (struct != null) { + struct.setNull(index) + } + fieldSerializers.foreach(_.writeNull(index)) + } else { + if (struct != null) { + struct.setIndexDefined(index) + } + fieldSerializers.foreach(_.write(index, value)) + } + } + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala new file mode 100644 index 0000000000000..cf4affa1db49d --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala @@ -0,0 +1,837 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.connect.client.arrow + +import java.util +import java.util.{Collections, Objects} + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.reflect.classTag +import scala.util.control.NonFatal + +import com.google.protobuf.ByteString +import org.apache.arrow.memory.{BufferAllocator, RootAllocator} +import org.apache.arrow.vector.VarBinaryVector +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.connect.proto +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.{DefinedByConstructorParams, DummyBean, FooEnum, JavaTypeInference, PrimitiveData, ScalaReflection} +import org.apache.spark.sql.catalyst.FooEnum.FooEnum +import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, BoxedData, UDTForCaseClass} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BoxedIntEncoder, CalendarIntervalEncoder, DateEncoder, EncoderField, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, PrimitiveDoubleEncoder, PrimitiveFloatEncoder, RowEncoder, StringEncoder, TimestampEncoder, UDTEncoder} +import org.apache.spark.sql.catalyst.encoders.RowEncoder.{encoderFor => toRowEncoder} +import org.apache.spark.sql.connect.client.SparkResult +import org.apache.spark.sql.connect.client.util.ConnectFunSuite +import org.apache.spark.sql.types.{ArrayType, Decimal, DecimalType, Metadata, StructType} + +/** + * Tests for encoding external data to and from arrow. + */ +class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { + private val allocator = new RootAllocator() + + private def newAllocator(name: String): BufferAllocator = { + allocator.newChildAllocator(name, 0, allocator.getLimit) + } + + protected override def afterAll(): Unit = { + super.afterAll() + allocator.close() + } + + private def withAllocator[T](f: BufferAllocator => T): T = { + val allocator = newAllocator("allocator") + try f(allocator) + finally { + allocator.close() + } + } + + private def roundTrip[T]( + encoder: AgnosticEncoder[T], + iterator: Iterator[T], + maxRecordsPerBatch: Int = 4 * 1024, + maxBatchSize: Long = 16 * 1024, + batchSizeCheckInterval: Int = 128, + inspectBatch: Array[Byte] => Unit = null): CloseableIterator[T] = { + // Use different allocators so we can pinpoint memory leaks better. + val serializerAllocator = newAllocator("serialization") + val deserializerAllocator = newAllocator("deserialization") + + val arrowIterator = ArrowSerializer.serialize( + input = iterator, + enc = encoder, + allocator = serializerAllocator, + maxRecordsPerBatch = maxRecordsPerBatch, + maxBatchSize = maxBatchSize, + batchSizeCheckInterval = batchSizeCheckInterval, + timeZoneId = "UTC") + + val inspectedIterator = if (inspectBatch != null) { + arrowIterator.map { batch => + inspectBatch(batch) + batch + } + } else { + arrowIterator + } + + val resultIterator = + try { + deserializeFromArrow(inspectedIterator, encoder, deserializerAllocator) + } catch { + case NonFatal(e) => + arrowIterator.close() + serializerAllocator.close() + deserializerAllocator.close() + throw e + } + new CloseableIterator[T] { + override def close(): Unit = { + arrowIterator.close() + resultIterator.close() + serializerAllocator.close() + deserializerAllocator.close() + } + override def hasNext: Boolean = resultIterator.hasNext + override def next(): T = resultIterator.next() + } + } + + // Temporary hack until we merge the deserializer. 
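  // Editor's note (not part of this patch): the temporary helper below routes batches
  // through SparkResult. Each Array[Byte] emitted by ArrowSerializer.serialize is also a
  // self-contained Arrow IPC stream (schema, one record batch, end-of-stream marker), so a
  // hypothetical helper like `countRows` could inspect it directly with Arrow's reader.
  private def countRows(batch: Array[Byte], allocator: BufferAllocator): Long = {
    val reader = new org.apache.arrow.vector.ipc.ArrowStreamReader(
      new java.io.ByteArrayInputStream(batch), allocator)
    try {
      // The schema is read eagerly; record batches are loaded one at a time.
      val root = reader.getVectorSchemaRoot
      var rows = 0L
      while (reader.loadNextBatch()) {
        rows += root.getRowCount
      }
      rows
    } finally {
      reader.close()
    }
  }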
+ private def deserializeFromArrow[E]( + batches: Iterator[Array[Byte]], + encoder: AgnosticEncoder[E], + allocator: BufferAllocator): CloseableIterator[E] = { + val responses = batches.map { batch => + val builder = proto.ExecutePlanResponse.newBuilder() + builder.getArrowBatchBuilder.setData(ByteString.copyFrom(batch)) + builder.build() + } + val result = new SparkResult[E](responses.asJava, allocator, encoder) + new CloseableIterator[E] { + private val iterator = result.iterator + override def close(): Unit = iterator.close() + override def hasNext: Boolean = iterator.hasNext + override def next(): E = iterator.next() + } + } + + private def roundTripAndCheck[T]( + encoder: AgnosticEncoder[T], + toInputIterator: () => Iterator[Any], + toOutputIterator: () => Iterator[T], + maxRecordsPerBatch: Int = 4 * 1024, + maxBatchSize: Long = 16 * 1024, + batchSizeCheckInterval: Int = 128, + inspectBatch: Array[Byte] => Unit = null): Unit = { + val iterator = roundTrip( + encoder, + toInputIterator().asInstanceOf[Iterator[T]], // Erasure hack :) + maxRecordsPerBatch, + maxBatchSize, + batchSizeCheckInterval, + inspectBatch) + try { + compareIterators(toOutputIterator(), iterator) + } finally { + iterator.close() + } + } + + private def roundTripAndCheckIdentical[T]( + encoder: AgnosticEncoder[T], + maxRecordsPerBatch: Int = 4 * 1024, + maxBatchSize: Long = 16 * 1024, + batchSizeCheckInterval: Int = 128, + inspectBatch: Array[Byte] => Unit = null)(toIterator: () => Iterator[T]): Unit = { + roundTripAndCheck( + encoder, + toIterator, + toIterator, + maxRecordsPerBatch, + maxBatchSize, + batchSizeCheckInterval, + inspectBatch) + } + + private def serializeToArrow[T]( + input: Iterator[T], + encoder: AgnosticEncoder[T], + allocator: BufferAllocator): CloseableIterator[Array[Byte]] = { + ArrowSerializer.serialize( + input, + encoder, + allocator, + maxRecordsPerBatch = 1024, + maxBatchSize = 8 * 1024, + timeZoneId = "UTC") + } + + private def compareIterators[T](expected: Iterator[T], actual: Iterator[T]): Unit = { + expected.zipAll(actual, null, null).foreach { case (expected, actual) => + assert(expected != null) + assert(actual != null) + assert(actual == expected) + } + } + + private class CountingBatchInspector extends (Array[Byte] => Unit) { + private var _numBatches: Int = 0 + private var _sizeInBytes: Long = 0 + def numBatches: Int = _numBatches + def sizeInBytes: Long = _sizeInBytes + def sizeInBytesPerBatch: Long = sizeInBytes / numBatches + override def apply(batch: Array[Byte]): Unit = { + _numBatches += 1 + _sizeInBytes += batch.length + } + } + + private case class MaybeNull(interval: Int) { + assert(interval > 1) + private var invocations = 0 + def apply[T](value: T): T = { + val result = if (invocations % interval == 0) { + null.asInstanceOf[T] + } else { + value + } + invocations += 1 + result + } + } + + private def javaBigDecimal(i: Int): java.math.BigDecimal = { + javaBigDecimal(i, DecimalType.DEFAULT_SCALE) + } + + private def javaBigDecimal(i: Int, scale: Int): java.math.BigDecimal = { + java.math.BigDecimal.valueOf(i).setScale(scale) + } + + private val singleIntEncoder = RowEncoder( + EncoderField("i", BoxedIntEncoder, nullable = false, Metadata.empty) :: Nil) + + /* ******************************************************************** * + * Iterator behavior tests. 
+ * ******************************************************************** */ + + test("empty") { + val inspector = new CountingBatchInspector + roundTripAndCheckIdentical(singleIntEncoder, inspectBatch = inspector) { () => + Iterator.empty + } + // We always write a batch with a schema. + assert(inspector.numBatches == 1) + assert(inspector.sizeInBytes > 0) + } + + test("single batch") { + val inspector = new CountingBatchInspector + roundTripAndCheckIdentical(singleIntEncoder, inspectBatch = inspector) { () => + Iterator.tabulate(10)(i => Row(i)) + } + assert(inspector.numBatches == 1) + } + + test("multiple batches - split by record count") { + val inspector = new CountingBatchInspector + roundTripAndCheckIdentical( + singleIntEncoder, + inspectBatch = inspector, + maxBatchSize = 32 * 1024) { () => + Iterator.tabulate(1024 * 1024)(i => Row(i)) + } + assert(inspector.numBatches == 256) + } + + test("multiple batches - split by size") { + val dataGen = { () => + Iterator.tabulate(4 * 1024)(i => Row(i)) + } + + // Normal interval + val inspector1 = new CountingBatchInspector + roundTripAndCheckIdentical(singleIntEncoder, maxBatchSize = 1024, inspectBatch = inspector1)( + dataGen) + assert(inspector1.numBatches == 16) + assert(inspector1.sizeInBytesPerBatch >= 1024) + assert(inspector1.sizeInBytesPerBatch <= 1024 + 128 * 5) + + // Lowest possible interval + val inspector2 = new CountingBatchInspector + roundTripAndCheckIdentical( + singleIntEncoder, + maxBatchSize = 1024, + batchSizeCheckInterval = 1, + inspectBatch = inspector2)(dataGen) + assert(inspector2.numBatches == 20) + assert(inspector2.sizeInBytesPerBatch >= 1024) + assert(inspector2.sizeInBytesPerBatch <= 1024 + 128 * 2) + assert(inspector2.sizeInBytesPerBatch < inspector1.sizeInBytesPerBatch) + } + + /* ******************************************************************** * + * Encoder specification tests + * ******************************************************************** */ + // Lenient mode + // Errors + + test("primitive fields") { + val encoder = ScalaReflection.encoderFor[PrimitiveData] + roundTripAndCheckIdentical(encoder) { () => + Iterator.tabulate(10) { i => + PrimitiveData(i, i, i.toDouble, i.toFloat, i.toShort, i.toByte, i < 4) + } + } + } + + test("boxed primitive fields") { + val encoder = ScalaReflection.encoderFor[BoxedData] + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(3) + Iterator.tabulate(100) { i => + BoxedData( + intField = maybeNull(i), + longField = maybeNull(i), + doubleField = maybeNull(i.toDouble), + floatField = maybeNull(i.toFloat), + shortField = maybeNull(i.toShort), + byteField = maybeNull(i.toByte), + booleanField = maybeNull(i > 4)) + } + } + } + + test("special floating point numbers") { + val floatIterator = roundTrip( + PrimitiveFloatEncoder, + Iterator[Float](Float.NaN, Float.NegativeInfinity, Float.PositiveInfinity)) + assert(java.lang.Float.isNaN(floatIterator.next())) + assert(floatIterator.next() == Float.NegativeInfinity) + assert(floatIterator.next() == Float.PositiveInfinity) + assert(!floatIterator.hasNext) + floatIterator.close() + + val doubleIterator = roundTrip( + PrimitiveDoubleEncoder, + Iterator[Double](Double.NaN, Double.NegativeInfinity, Double.PositiveInfinity)) + assert(java.lang.Double.isNaN(doubleIterator.next())) + assert(doubleIterator.next() == Double.NegativeInfinity) + assert(doubleIterator.next() == Double.PositiveInfinity) + assert(!doubleIterator.hasNext) + doubleIterator.close() + } + + test("nullable fields") { + val encoder = 
ScalaReflection.encoderFor[NullableData] + val instant = java.time.Instant.now() + val now = java.time.LocalDateTime.now() + val today = java.time.LocalDate.now() + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(3) + Iterator.tabulate(100) { i => + NullableData( + string = maybeNull(if (i % 7 == 0) "" else "s" + i), + month = maybeNull(java.time.Month.of(1 + (i % 12))), + foo = maybeNull(FooEnum(i % FooEnum.maxId)), + decimal = maybeNull(Decimal(i)), + scalaBigDecimal = maybeNull(BigDecimal(javaBigDecimal(i + 1))), + javaBigDecimal = maybeNull(javaBigDecimal(i + 2)), + scalaBigInt = maybeNull(BigInt(i + 3)), + javaBigInteger = maybeNull(java.math.BigInteger.valueOf(i + 4)), + duration = maybeNull(java.time.Duration.ofDays(i)), + period = maybeNull(java.time.Period.ofMonths(i)), + date = maybeNull(java.sql.Date.valueOf(today.plusDays(i))), + localDate = maybeNull(today.minusDays(i)), + timestamp = maybeNull(java.sql.Timestamp.valueOf(now.plusSeconds(i))), + instant = maybeNull(instant.plusSeconds(i * 100)), + localDateTime = maybeNull(now.minusHours(i))) + } + } + } + + test("binary field") { + val encoder = ScalaReflection.encoderFor[BinaryData] + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(3) + Iterator.tabulate(100) { i => + BinaryData(maybeNull(Array.tabulate(i % 100)(_.toByte))) + } + } + } + + // Row and Scala class are already covered in other tests + test("javabean") { + val encoder = JavaTypeInference.encoderFor[DummyBean](classOf[DummyBean]) + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(6) + Iterator.tabulate(100) { i => + val bean = new DummyBean() + bean.setBigInteger(maybeNull(java.math.BigInteger.valueOf(i))) + bean + } + } + } + + test("defined by constructor parameters") { + val encoder = ScalaReflection.encoderFor[NonProduct] + roundTripAndCheckIdentical(encoder) { () => + Iterator.tabulate(100) { i => + new NonProduct("k" + i, i.toDouble) + } + } + } + + test("option") { + val encoder = ScalaReflection.encoderFor[Option[String]] + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(6) + Iterator.tabulate(100) { i => + Option(maybeNull("v" + i)) + } + } + } + + test("arrays") { + val encoder = ScalaReflection.encoderFor[ArrayData] + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(5) + Iterator.tabulate(100) { i => + ArrayData( + maybeNull(Array.tabulate[Double](i % 9)(_.toDouble)), + maybeNull(Array.tabulate[String](i % 21)(i => maybeNull("s" + i))), + maybeNull(Array.tabulate[Array[Int]](i % 13) { i => + maybeNull { + Array.fill(i % 29)(i) + } + })) + } + } + } + + test("scala iterables") { + val encoder = ScalaReflection.encoderFor[ListData] + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(5) + Iterator.tabulate(100) { i => + ListData( + maybeNull(Seq.tabulate[String](i % 9)(i => maybeNull("s" + i))), + maybeNull(Seq.tabulate[Int](i % 10)(identity)), + maybeNull(Set(i.toLong, i.toLong - 1, i.toLong - 33)), + maybeNull(mutable.Queue.tabulate(5 + i % 6) { i => + Option(maybeNull(BigInt(i))) + })) + } + } + } + + test("java lists") { + def genJavaData[E](n: Int, collection: util.Collection[E])(f: Int => E): Unit = { + Iterator.tabulate(n)(f).foreach(collection.add) + } + val encoder = JavaTypeInference.encoderFor(classOf[JavaListData]) + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(7) + Iterator.tabulate(1) { i => + val bean = new JavaListData + bean.setListOfDecimal(maybeNull { + val list = new 
util.ArrayList[java.math.BigDecimal] + genJavaData(i % 7, list) { i => maybeNull(java.math.BigDecimal.valueOf(i * 33)) } + list + }) + bean.setListOfBigInt(maybeNull { + val list = new util.LinkedList[java.math.BigInteger] + genJavaData(10, list) { i => maybeNull(java.math.BigInteger.valueOf(i * 50)) } + list + }) + bean.setListOfStrings(maybeNull { + val list = new util.ArrayList[String] + genJavaData((i + 5) % 50, list) { i => maybeNull("v" + (i * 2)) } + list + }) + bean.setListOfBytes(maybeNull(Collections.singletonList(i.toByte))) + bean + } + } + } + + test("wrapped array") { + val encoder = ScalaReflection.encoderFor[mutable.WrappedArray[Int]] + val input = mutable.WrappedArray.make[Int](Array(1, 98, 7, 6)) + val iterator = roundTrip(encoder, Iterator.single(input)) + val Seq(result) = iterator.toSeq + assert(result == input) + assert(result.array.getClass == classOf[Array[Int]]) + iterator.close() + } + + test("wrapped array - empty") { + val schema = new StructType().add("names", "array") + val encoder = toRowEncoder(schema) + val iterator = roundTrip(encoder, Iterator.single(Row(Seq()))) + val Seq(Row(raw)) = iterator.toSeq + val seq = raw.asInstanceOf[mutable.WrappedArray[String]] + assert(seq.isEmpty) + assert(seq.array.getClass == classOf[Array[String]]) + iterator.close() + } + + test("maps") { + val encoder = ScalaReflection.encoderFor[MapData] + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(5) + Iterator.tabulate(100) { i => + MapData( + maybeNull( + Iterator + .tabulate(i % 9) { i => + i -> maybeNull("s" + i) + } + .toMap), + maybeNull( + Iterator + .tabulate(i % 10) { i => + ("s" + 1) -> maybeNull(Array.tabulate[Long]((i + 5) % 20)(_.toLong)) + } + .toMap)) + } + } + } + + test("java maps") { + val encoder = JavaTypeInference.encoderFor(classOf[JavaMapData]) + roundTripAndCheckIdentical(encoder) { () => + val maybeNull = MaybeNull(11) + Iterator.tabulate(100) { i => + val bean = new JavaMapData + bean.setDummyToDoubleListMap(maybeNull { + val map = new util.HashMap[DummyBean, java.util.List[java.lang.Double]] + (0 until (i % 5)).foreach { j => + val dummy = new DummyBean + dummy.setBigInteger(maybeNull(java.math.BigInteger.valueOf(i * j))) + val values = Array.tabulate(i % 40) { j => + Double.box(j.toDouble) + } + map.put(dummy, maybeNull(util.Arrays.asList(values: _*))) + } + map + }) + bean + } + } + } + + test("map with null key") { + val encoder = ScalaReflection.encoderFor[Map[String, String]] + withAllocator { allocator => + val iterator = ArrowSerializer.serialize( + Iterator(Map((null.asInstanceOf[String], "kaboom?"))), + encoder, + allocator, + maxRecordsPerBatch = 128, + maxBatchSize = 1024, + timeZoneId = "UTC") + intercept[NullPointerException] { + iterator.next() + } + iterator.close() + } + } + + // TODO follow-up with more null tests here: + // - Null primitive + // - Non-nullable map value + // - Non-nullable structfield + // - Non-nullable array element. 
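  // Editor's sketch of one such follow-up null test (hedged, not part of this patch): a
  // null value for the non-nullable field of singleIntEncoder is expected to surface as a
  // NullPointerException, mirroring the "map with null key" test above.
  test("null value for non-nullable int field (illustrative sketch)") {
    withAllocator { allocator =>
      val iterator = ArrowSerializer.serialize(
        Iterator(Row(null)),
        singleIntEncoder,
        allocator,
        maxRecordsPerBatch = 128,
        maxBatchSize = 1024,
        timeZoneId = "UTC")
      intercept[NullPointerException] {
        iterator.next()
      }
      iterator.close()
    }
  }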
+ + test("lenient field serialization - date/localdate") { + val base = java.time.LocalDate.now() + val localDates = () => Iterator.tabulate(10)(i => base.plusDays(i * i * 60)) + val dates = () => localDates().map(java.sql.Date.valueOf) + val combo = () => localDates() ++ dates() + roundTripAndCheck(DateEncoder(true), dates, dates) + roundTripAndCheck(DateEncoder(true), localDates, dates) + roundTripAndCheck(DateEncoder(true), combo, () => dates() ++ dates()) + roundTripAndCheck(LocalDateEncoder(true), dates, localDates) + roundTripAndCheck(LocalDateEncoder(true), localDates, localDates) + roundTripAndCheck(LocalDateEncoder(true), combo, () => localDates() ++ localDates()) + } + + test("lenient field serialization - timestamp/instant") { + val base = java.time.Instant.now() + val instants = () => Iterator.tabulate(10)(i => base.plusSeconds(i * i * 60)) + val timestamps = () => instants().map(java.sql.Timestamp.from) + val combo = () => instants() ++ timestamps() + roundTripAndCheck(InstantEncoder(true), instants, instants) + roundTripAndCheck(InstantEncoder(true), timestamps, instants) + roundTripAndCheck(InstantEncoder(true), combo, () => instants() ++ instants()) + roundTripAndCheck(TimestampEncoder(true), instants, timestamps) + roundTripAndCheck(TimestampEncoder(true), timestamps, timestamps) + roundTripAndCheck(TimestampEncoder(true), combo, () => timestamps() ++ timestamps()) + } + + test("lenient field serialization - decimal") { + val base = javaBigDecimal(137, DecimalType.DEFAULT_SCALE) + val bigDecimals = () => + Iterator.tabulate(100) { i => + base.multiply(javaBigDecimal(i)).setScale(DecimalType.DEFAULT_SCALE) + } + val bigInts = () => bigDecimals().map(_.toBigInteger) + val scalaBigDecimals = () => bigDecimals().map(BigDecimal.apply) + val scalaBigInts = () => bigDecimals().map(v => BigInt(v.toBigInteger)) + val sparkDecimals = () => bigDecimals().map(Decimal.apply) + val encoder = JavaDecimalEncoder(DecimalType.SYSTEM_DEFAULT, lenientSerialization = true) + roundTripAndCheck(encoder, bigDecimals, bigDecimals) + roundTripAndCheck(encoder, bigInts, bigDecimals) + roundTripAndCheck(encoder, scalaBigDecimals, bigDecimals) + roundTripAndCheck(encoder, scalaBigInts, bigDecimals) + roundTripAndCheck(encoder, sparkDecimals, bigDecimals) + roundTripAndCheck( + encoder, + () => bigDecimals() ++ bigInts() ++ scalaBigDecimals() ++ scalaBigInts() ++ sparkDecimals(), + () => Iterator.fill(5)(bigDecimals()).flatten) + } + + test("lenient field serialization - iterables") { + val encoder = IterableEncoder( + classTag[Seq[Int]], + BoxedIntEncoder, + containsNull = true, + lenientSerialization = true) + val elements = Seq(Array(1, 7, 8), Array.emptyIntArray, Array(88)) + val primitiveArrays = () => elements.iterator + val genericArrays = () => elements.iterator.map(v => v.map(Int.box)) + val lists = () => elements.iterator.map(v => java.util.Arrays.asList(v.map(Int.box): _*)) + val seqs = () => elements.iterator.map(_.toSeq) + roundTripAndCheck(encoder, seqs, seqs) + roundTripAndCheck(encoder, primitiveArrays, seqs) + roundTripAndCheck(encoder, genericArrays, seqs) + roundTripAndCheck(encoder, lists, seqs) + roundTripAndCheck( + encoder, + () => lists() ++ seqs() ++ genericArrays() ++ primitiveArrays(), + () => Iterator.fill(4)(seqs()).flatten) + } + + private val wideSchemaEncoder = toRowEncoder( + new StructType() + .add("a", "int") + .add("b", "string") + .add( + "c", + new StructType() + .add("ca", "array") + .add("cb", "binary") + .add("cc", "float")) + .add( + "d", + ArrayType( + new 
StructType() + .add("da", "decimal(20, 10)") + .add("db", "string") + .add("dc", "boolean")))) + + private val narrowSchemaEncoder = toRowEncoder( + new StructType() + .add("b", "string") + .add( + "d", + ArrayType( + new StructType() + .add("da", "decimal(20, 10)") + .add("dc", "boolean"))) + .add( + "C", + new StructType() + .add("Ca", "array") + .add("Cb", "binary"))) + + /* ******************************************************************** * + * Arrow serialization/deserialization specific errors + * ******************************************************************** */ + test("unsupported encoders") { + // CalendarIntervalEncoder + val data = null.asInstanceOf[AnyRef] + intercept[SparkUnsupportedOperationException]( + ArrowSerializer.serializerFor(CalendarIntervalEncoder, data)) + + // UDT + val udtEncoder = UDTEncoder(new UDTForCaseClass, classOf[UDTForCaseClass]) + intercept[SparkUnsupportedOperationException](ArrowSerializer.serializerFor(udtEncoder, data)) + } + + test("unsupported encoder/vector combinations") { + // Also add a test for the serializer... + withAllocator { allocator => + intercept[RuntimeException] { + ArrowSerializer.serializerFor(StringEncoder, new VarBinaryVector("bytes", allocator)) + } + } + } +} + +// TODO fix actual Null fields, e.g.: nullable: Null +case class NullableData( + string: String, + month: java.time.Month, + foo: FooEnum, + decimal: Decimal, + scalaBigDecimal: BigDecimal, + javaBigDecimal: java.math.BigDecimal, + scalaBigInt: BigInt, + javaBigInteger: java.math.BigInteger, + duration: java.time.Duration, + period: java.time.Period, + date: java.sql.Date, + localDate: java.time.LocalDate, + timestamp: java.sql.Timestamp, + instant: java.time.Instant, + localDateTime: java.time.LocalDateTime) + +case class BinaryData(binary: Array[Byte]) { + def canEqual(other: Any): Boolean = other.isInstanceOf[BinaryData] + + override def equals(other: Any): Boolean = other match { + case that: BinaryData if that.canEqual(this) => + java.util.Arrays.equals(binary, that.binary) + case _ => false + } + + override def hashCode(): Int = java.util.Arrays.hashCode(binary) +} + +class NonProduct(val name: String, val value: Double) extends DefinedByConstructorParams { + + def canEqual(other: Any): Boolean = other.isInstanceOf[NonProduct] + + override def equals(other: Any): Boolean = other match { + case that: NonProduct => + (that canEqual this) && + name == that.name && + value == that.value + case _ => false + } + + override def hashCode(): Int = { + val state = Seq(name, value) + state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) + } +} + +case class ArrayData(doubles: Array[Double], strings: Array[String], nested: Array[Array[Int]]) { + def canEqual(other: Any): Boolean = other.isInstanceOf[ArrayData] + + override def equals(other: Any): Boolean = other match { + case that: ArrayData if that.canEqual(this) => + Objects.deepEquals(that.doubles, doubles) && + Objects.deepEquals(that.strings, strings) && + Objects.deepEquals(that.nested, nested) + case _ => false + } + + override def hashCode(): Int = { + val state = Seq(doubles, strings, nested) + state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) + } +} + +case class ListData( + seqOfStrings: Seq[String], + seqOfInts: Seq[Int], + setOfLongs: Set[Long], + queueOfBigIntOptions: mutable.Queue[Option[BigInt]]) + +class JavaListData { + @scala.beans.BeanProperty + var listOfDecimal: java.util.ArrayList[java.math.BigDecimal] = _ + @scala.beans.BeanProperty + var listOfBigInt: 
java.util.LinkedList[java.math.BigInteger] = _ + @scala.beans.BeanProperty + var listOfStrings: java.util.AbstractList[String] = _ + @scala.beans.BeanProperty + var listOfBytes: java.util.List[java.lang.Byte] = _ + + def canEqual(other: Any): Boolean = other.isInstanceOf[JavaListData] + + override def equals(other: Any): Boolean = other match { + case that: JavaListData if that canEqual this => + Objects.equals(listOfDecimal, that.listOfDecimal) && + Objects.equals(listOfBigInt, that.listOfBigInt) && + Objects.equals(listOfStrings, that.listOfStrings) && + Objects.equals(listOfBytes, that.listOfBytes) + case _ => false + } + + override def hashCode(): Int = { + val state = Seq(listOfDecimal, listOfBigInt, listOfStrings, listOfBytes) + state.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b) + } + + override def toString: String = { + s"JavaListData(listOfDecimal=$listOfDecimal, " + + s"listOfBigInt=$listOfBigInt, " + + s"listOfStrings=$listOfStrings, " + + s"listOfBytes=$listOfBytes)" + } +} + +case class MapData(intStringMap: Map[Int, String], metricMap: Map[String, Array[Long]]) { + def canEqual(other: Any): Boolean = other.isInstanceOf[MapData] + + private def sameMetricMap(other: Map[String, Array[Long]]): Boolean = { + if (metricMap == null && other == null) { + true + } else if (metricMap == null || other == null || metricMap.keySet != other.keySet) { + false + } else { + metricMap.forall { case (key, values) => + java.util.Arrays.equals(values, other(key)) + } + } + } + + override def equals(other: Any): Boolean = other match { + case that: MapData if that canEqual this => + Objects.deepEquals(intStringMap, that.intStringMap) && + sameMetricMap(that.metricMap) + case _ => false + } + + override def hashCode(): Int = { + java.util.Arrays.deepHashCode(Array(intStringMap, metricMap)) + } +} + +class JavaMapData { + @scala.beans.BeanProperty + var dummyToDoubleListMap: java.util.Map[DummyBean, java.util.List[java.lang.Double]] = _ + + def canEqual(other: Any): Boolean = other.isInstanceOf[JavaMapData] + + override def equals(other: Any): Boolean = other match { + case that: JavaMapData if that canEqual this => + dummyToDoubleListMap == that.dummyToDoubleListMap + case _ => false + } + + override def hashCode(): Int = Objects.hashCode(dummyToDoubleListMap) +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/JavaTypeInferenceSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/JavaTypeInferenceSuite.scala index 35f5bf739bfce..90c61c402306e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/JavaTypeInferenceSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/JavaTypeInferenceSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst import java.math.BigInteger -import java.util.{LinkedList, List => JList, Map => JMap} +import java.util.{LinkedList, List => JList, Map => JMap, Objects} import scala.beans.{BeanProperty, BooleanBeanProperty} import scala.reflect.{classTag, ClassTag} @@ -30,6 +30,13 @@ import org.apache.spark.sql.types.{DecimalType, MapType, Metadata, StringType, S class DummyBean { @BeanProperty var bigInteger: BigInteger = _ + + override def hashCode(): Int = Objects.hashCode(bigInteger) + + override def equals(obj: Any): Boolean = obj match { + case bean: DummyBean => Objects.equals(bigInteger, bean.bigInteger) + case _ => false + } } class GenericCollectionBean { From 30cc7b3236a5caf45b323c67f1fecf3f631a7d1f Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Tue, 11 Jul 
2023 18:23:50 -0400 Subject: [PATCH 02/16] Undo Scala version change --- assembly/pom.xml | 4 ++-- common/kvstore/pom.xml | 4 ++-- common/network-common/pom.xml | 4 ++-- common/network-shuffle/pom.xml | 4 ++-- common/network-yarn/pom.xml | 4 ++-- common/sketch/pom.xml | 4 ++-- common/tags/pom.xml | 4 ++-- common/unsafe/pom.xml | 4 ++-- common/utils/pom.xml | 4 ++-- connector/avro/pom.xml | 8 ++++---- connector/connect/client/jvm/pom.xml | 4 ++-- connector/connect/common/pom.xml | 4 ++-- connector/connect/server/pom.xml | 8 ++++---- connector/docker-integration-tests/pom.xml | 4 ++-- connector/kafka-0-10-assembly/pom.xml | 4 ++-- connector/kafka-0-10-sql/pom.xml | 8 ++++---- connector/kafka-0-10-token-provider/pom.xml | 4 ++-- connector/kafka-0-10/pom.xml | 8 ++++---- connector/kinesis-asl-assembly/pom.xml | 4 ++-- connector/kinesis-asl/pom.xml | 4 ++-- connector/protobuf/pom.xml | 8 ++++---- connector/spark-ganglia-lgpl/pom.xml | 4 ++-- core/pom.xml | 8 ++++---- dev/mima | 8 ++++---- docs/_plugins/copy_api_dirs.rb | 14 +++++++------- examples/pom.xml | 4 ++-- graphx/pom.xml | 4 ++-- hadoop-cloud/pom.xml | 4 ++-- launcher/pom.xml | 4 ++-- mllib-local/pom.xml | 4 ++-- mllib/pom.xml | 8 ++++---- pom.xml | 12 ++++++------ repl/pom.xml | 4 ++-- resource-managers/kubernetes/core/pom.xml | 4 ++-- .../kubernetes/integration-tests/pom.xml | 4 ++-- resource-managers/mesos/pom.xml | 4 ++-- resource-managers/yarn/pom.xml | 4 ++-- sql/api/pom.xml | 4 ++-- sql/catalyst/pom.xml | 8 ++++---- sql/core/pom.xml | 8 ++++---- sql/hive-thriftserver/pom.xml | 8 ++++---- sql/hive/pom.xml | 8 ++++---- streaming/pom.xml | 8 ++++---- tools/pom.xml | 4 ++-- 44 files changed, 123 insertions(+), 123 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index d4d7a1db4a29e..09d6bd8a33f79 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-assembly_2.13 + spark-assembly_2.12 Spark Project Assembly https://spark.apache.org/ pom diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 69f9f186e0889..bef8303874b20 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-kvstore_2.13 + spark-kvstore_2.12 jar Spark Project Local DB https://spark.apache.org/ diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 9f90d12216e69..8a63e999c53cd 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-network-common_2.13 + spark-network-common_2.12 jar Spark Project Networking https://spark.apache.org/ diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 864f1cc2d3715..a8bde14a259f0 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-network-shuffle_2.13 + spark-network-shuffle_2.12 jar Spark Project Shuffle Streaming Service https://spark.apache.org/ diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index c19ac33afa5cd..671d5cb7e0178 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT 
../../pom.xml - spark-network-yarn_2.13 + spark-network-yarn_2.12 jar Spark Project YARN Shuffle Service https://spark.apache.org/ diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 6cf1a4fb83e56..4cc597519c3dd 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-sketch_2.13 + spark-sketch_2.12 jar Spark Project Sketch https://spark.apache.org/ diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 1eb8352e32df3..9a44c847d8a03 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-tags_2.13 + spark-tags_2.12 jar Spark Project Tags https://spark.apache.org/ diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 84e7b61553483..bdf82d9285e06 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-unsafe_2.13 + spark-unsafe_2.12 jar Spark Project Unsafe https://spark.apache.org/ diff --git a/common/utils/pom.xml b/common/utils/pom.xml index ee10a60618297..36cfceed931e0 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-common-utils_2.13 + spark-common-utils_2.12 jar Spark Project Common Utils https://spark.apache.org/ diff --git a/connector/avro/pom.xml b/connector/avro/pom.xml index 7087fdbccd04d..597e3c2235f7a 100644 --- a/connector/avro/pom.xml +++ b/connector/avro/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-avro_2.13 + spark-avro_2.12 avro @@ -70,12 +70,12 @@ org.apache.spark spark-tags_${scala.binary.version} - + + --> org.tukaani xz diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml index cef6f6d214e74..60e4ae78147ee 100644 --- a/connector/connect/client/jvm/pom.xml +++ b/connector/connect/client/jvm/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../../../pom.xml - spark-connect-client-jvm_2.13 + spark-connect-client-jvm_2.12 jar Spark Project Connect Client https://spark.apache.org/ diff --git a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml index 9b28aca5d0726..1890384b51db5 100644 --- a/connector/connect/common/pom.xml +++ b/connector/connect/common/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../../pom.xml - spark-connect-common_2.13 + spark-connect-common_2.12 jar Spark Project Connect Common https://spark.apache.org/ diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml index 01d6d86c54292..95b70c6b0f41d 100644 --- a/connector/connect/server/pom.xml +++ b/connector/connect/server/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../../pom.xml - spark-connect_2.13 + spark-connect_2.12 jar Spark Project Connect Server https://spark.apache.org/ @@ -152,12 +152,12 @@ - + + --> com.google.guava guava diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index e40269838522d..cc549487a8b57 100644 --- a/connector/docker-integration-tests/pom.xml +++ 
b/connector/docker-integration-tests/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-docker-integration-tests_2.13 + spark-docker-integration-tests_2.12 jar Spark Project Docker Integration Tests https://spark.apache.org/ diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml index f339e8c2e4fd2..340974cc789bd 100644 --- a/connector/kafka-0-10-assembly/pom.xml +++ b/connector/kafka-0-10-assembly/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-streaming-kafka-0-10-assembly_2.13 + spark-streaming-kafka-0-10-assembly_2.12 jar Spark Integration for Kafka 0.10 Assembly https://spark.apache.org/ diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml index f5a12b61c2bea..fdd1196cd446a 100644 --- a/connector/kafka-0-10-sql/pom.xml +++ b/connector/kafka-0-10-sql/pom.xml @@ -20,13 +20,13 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml org.apache.spark - spark-sql-kafka-0-10_2.13 + spark-sql-kafka-0-10_2.12 sql-kafka-0-10 @@ -74,12 +74,12 @@ test-jar test - + + --> org.apache.kafka kafka-clients diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml index b3c0889d9475a..3256130c50f3b 100644 --- a/connector/kafka-0-10-token-provider/pom.xml +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -20,13 +20,13 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml org.apache.spark - spark-token-provider-kafka-0-10_2.13 + spark-token-provider-kafka-0-10_2.12 token-provider-kafka-0-10 diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml index f1820bb595a2d..706eb2dd2c399 100644 --- a/connector/kafka-0-10/pom.xml +++ b/connector/kafka-0-10/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-streaming-kafka-0-10_2.13 + spark-streaming-kafka-0-10_2.12 streaming-kafka-0-10 @@ -59,12 +59,12 @@ test-jar test - + + --> org.apache.kafka kafka-clients diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml index 2cba2668f049a..cd5c0393f6f84 100644 --- a/connector/kinesis-asl-assembly/pom.xml +++ b/connector/kinesis-asl-assembly/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-streaming-kinesis-asl-assembly_2.13 + spark-streaming-kinesis-asl-assembly_2.12 jar Spark Project Kinesis Assembly https://spark.apache.org/ diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml index af9cd4b7ec96e..c70a073e73407 100644 --- a/connector/kinesis-asl/pom.xml +++ b/connector/kinesis-asl/pom.xml @@ -19,13 +19,13 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-streaming-kinesis-asl_2.13 + spark-streaming-kinesis-asl_2.12 jar Spark Kinesis Integration diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index db92af75a5728..3d6bbea7d41c5 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-protobuf_2.13 + spark-protobuf_2.12 protobuf @@ -70,12 +70,12 @@ org.apache.spark spark-tags_${scala.binary.version} - + + --> com.google.protobuf protobuf-java 
diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml index 00f4769fd60ad..c0dcde1355849 100644 --- a/connector/spark-ganglia-lgpl/pom.xml +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -19,13 +19,13 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-ganglia-lgpl_2.13 + spark-ganglia-lgpl_2.12 jar Spark Ganglia Integration diff --git a/core/pom.xml b/core/pom.xml index 79bf8a2163554..6519b46d96e31 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-core_2.13 + spark-core_2.12 jar Spark Project Core https://spark.apache.org/ @@ -35,12 +35,12 @@ - + + --> org.apache.avro avro diff --git a/dev/mima b/dev/mima index 32c3718e4ccca..4a9e343b0a78f 100755 --- a/dev/mima +++ b/dev/mima @@ -24,9 +24,9 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES=${1:-"-Pscala-2.13 -Pmesos -Pkubernetes -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive"} -TOOLS_CLASSPATH="$(build/sbt -Pscala-2.13 -DcopyDependencies=false "export tools/fullClasspath" | grep jar | tail -n1)" -OLD_DEPS_CLASSPATH="$(build/sbt -Pscala-2.13 -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | grep jar | tail -n1)" +SPARK_PROFILES=${1:-"-Pmesos -Pkubernetes -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive"} +TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | grep jar | tail -n1)" +OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | grep jar | tail -n1)" rm -f .generated-mima* @@ -42,7 +42,7 @@ $JAVA_CMD \ -cp "$TOOLS_CLASSPATH:$OLD_DEPS_CLASSPATH" \ org.apache.spark.tools.GenerateMIMAIgnore -echo -e "q\n" | build/sbt -Pscala-2.13 -mem 5120 -DcopyDependencies=false "$@" mimaReportBinaryIssues | grep -v -e "info.*Resolving" +echo -e "q\n" | build/sbt -mem 5120 -DcopyDependencies=false "$@" mimaReportBinaryIssues | grep -v -e "info.*Resolving" ret_val=$? if [ $ret_val != 0 ]; then diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index 9cb073ef1e00c..28d5e0d82c93a 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -26,8 +26,8 @@ curr_dir = pwd cd("..") - puts "Running 'build/sbt -Pscala-2.13 -Pkinesis-asl clean compile unidoc' from " + pwd + "; this may take a few minutes..." - system("build/sbt -Pscala-2.13 -Pkinesis-asl clean compile unidoc") || raise("Unidoc generation failed") + puts "Running 'build/sbt -Pkinesis-asl clean compile unidoc' from " + pwd + "; this may take a few minutes..." + system("build/sbt -Pkinesis-asl clean compile unidoc") || raise("Unidoc generation failed") puts "Moving back into docs dir." cd("docs") @@ -37,7 +37,7 @@ # Copy over the unified ScalaDoc for all projects to api/scala. # This directory will be copied over to _site when `jekyll` command is run. - source = "../target/scala-2.13/unidoc" + source = "../target/scala-2.12/unidoc" dest = "api/scala" puts "Making directory " + dest @@ -119,8 +119,8 @@ puts "Moving to project root and building API docs." cd("..") - puts "Running 'build/sbt -Pscala-2.13 clean package -Phive' from " + pwd + "; this may take a few minutes..." - system("build/sbt -Pscala-2.13 clean package -Phive") || raise("PySpark doc generation failed") + puts "Running 'build/sbt clean package -Phive' from " + pwd + "; this may take a few minutes..." 
+ system("build/sbt clean package -Phive") || raise("PySpark doc generation failed") puts "Moving back into docs dir." cd("docs") @@ -165,8 +165,8 @@ puts "Moving to project root and building API docs." cd("..") - puts "Running 'build/sbt -Pscala-2.13 clean package -Phive' from " + pwd + "; this may take a few minutes..." - system("build/sbt -Pscala-2.13 clean package -Phive") || raise("SQL doc generation failed") + puts "Running 'build/sbt clean package -Phive' from " + pwd + "; this may take a few minutes..." + system("build/sbt clean package -Phive") || raise("SQL doc generation failed") puts "Moving back into docs dir." cd("docs") diff --git a/examples/pom.xml b/examples/pom.xml index 57e41724bdca4..e8f22b995fded 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-examples_2.13 + spark-examples_2.12 jar Spark Project Examples https://spark.apache.org/ diff --git a/graphx/pom.xml b/graphx/pom.xml index 5d01dd06c0ecb..48baeb9a87560 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-graphx_2.13 + spark-graphx_2.12 graphx diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 21c1e0fee1ddf..02e7675df286c 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-hadoop-cloud_2.13 + spark-hadoop-cloud_2.12 jar Spark Project Hadoop Cloud Integration diff --git a/launcher/pom.xml b/launcher/pom.xml index 0bc3ae20ee183..aba7ee82d53cf 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-launcher_2.13 + spark-launcher_2.12 jar Spark Project Launcher https://spark.apache.org/ diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 83ca643f43bce..00c16a8b6a544 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-mllib-local_2.13 + spark-mllib-local_2.12 mllib-local diff --git a/mllib/pom.xml b/mllib/pom.xml index 07290124273f2..73af83c758688 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-mllib_2.13 + spark-mllib_2.12 mllib @@ -91,12 +91,12 @@ test-jar test - + + --> org.scalanlp breeze_${scala.binary.version} diff --git a/pom.xml b/pom.xml index 2a917b46d8520..96375ea904dd8 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ 18 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT pom Spark Project Parent POM @@ -168,8 +168,8 @@ 3.2.2 4.4 - 2.13.11 - 2.13 + 2.12.18 + 2.12 2.2.0 4.8.0 @@ -438,13 +438,13 @@ ${project.version} test-jar - + + --> com.twitter chill_${scala.binary.version} @@ -1089,7 +1089,7 @@ org.scala-lang.modules - scala-xml_2.13 + scala-xml_2.12 diff --git a/repl/pom.xml b/repl/pom.xml index 74ac775100cb8..8c0f9f989c170 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-repl_2.13 + spark-repl_2.12 jar Spark Project REPL https://spark.apache.org/ diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml 
index 72c7f1f12f42d..9dab5496184e2 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -19,12 +19,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../../pom.xml - spark-kubernetes_2.13 + spark-kubernetes_2.12 jar Spark Project Kubernetes diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 3e25e7053707a..02894f82eec9d 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -19,12 +19,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../../pom.xml - spark-kubernetes-integration-tests_2.13 + spark-kubernetes-integration-tests_2.12 kubernetes-integration-tests diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 267d6c7d84f21..7510ecac3e7fc 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -19,12 +19,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-mesos_2.13 + spark-mesos_2.12 jar Spark Project Mesos diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 2cda552a9c47c..dcc7bcdd1af38 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -19,12 +19,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-yarn_2.13 + spark-yarn_2.12 jar Spark Project YARN diff --git a/sql/api/pom.xml b/sql/api/pom.xml index db3bfaeeca0a9..41a5b85d4c670 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-sql-api_2.13 + spark-sql-api_2.12 jar Spark Project SQL API https://spark.apache.org/ diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 80b7c99ddc139..9dbc8d625d079 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-catalyst_2.13 + spark-catalyst_2.12 jar Spark Project Catalyst https://spark.apache.org/ @@ -92,12 +92,12 @@ spark-sketch_${scala.binary.version} ${project.version} - + + --> org.scalacheck scalacheck_${scala.binary.version} diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 5d4f7572d0022..7f4c2a4cfa54d 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-sql_2.13 + spark-sql_2.12 jar Spark Project SQL https://spark.apache.org/ @@ -89,12 +89,12 @@ test - + + --> org.apache.orc orc-core diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 4bbb92d1376a0..ad7fc0d2ac4bd 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - spark-hive-thriftserver_2.13 + spark-hive-thriftserver_2.12 jar Spark Project Hive Thrift Server https://spark.apache.org/ @@ -61,12 +61,12 @@ test-jar test - + + --> com.google.guava guava diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b267830a3ad5f..16d915c233ee4 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../../pom.xml - 
spark-hive_2.13 + spark-hive_2.12 jar Spark Project Hive https://spark.apache.org/ @@ -79,12 +79,12 @@ test-jar test - + + --> ${hive.group} hive-common diff --git a/streaming/pom.xml b/streaming/pom.xml index a36370a1e8b61..bebfd3abcce39 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark - spark-parent_2.13 + spark-parent_2.12 3.5.0-SNAPSHOT ../pom.xml - spark-streaming_2.13 + spark-streaming_2.12 streaming @@ -50,12 +50,12 @@ org.apache.spark spark-tags_${scala.binary.version} - + + --> @@ -224,6 +225,24 @@ + + org.codehaus.mojo + build-helper-maven-plugin + + + add-sources + generate-sources + + add-source + + + + src/main/scala-${scala.binary.version} + + + + + \ No newline at end of file diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala index c7e9d22682f86..1cdc2035de60b 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala @@ -92,7 +92,13 @@ private[sql] class SparkResult[T]( arrowSchema = reader.schema stop |= stopOnArrowSchema } else if (arrowSchema != reader.schema) { - // Uh oh... + throw new IllegalStateException( + s"""Schema Mismatch between expected and received schema: + |=== Expected Schema === + |$arrowSchema + |=== Received Schema === + |${reader.schema} + |""".stripMargin) } if (structType == null) { // If the schema is not available yet, fallback to the arrow schema. diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala index 91589e0945aed..154866d699a34 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowDeserializer.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders._ import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} -import org.apache.spark.sql.types.{Decimal, StructType} +import org.apache.spark.sql.types.Decimal /** * Helper class for converting arrow batches into user objects. 
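
The hunk that follows changes the public return type of ArrowDeserializers.deserializeFromArrow from the bespoke TypedDeserializingIterator[T] to the plain CloseableIterator[T]. For orientation, a minimal caller of that API could look as sketched below. This is illustrative only and not part of the patch: it assumes the signature shown in the hunk underneath, and PrimitiveIntEncoder is used purely as an example encoder.

    // Illustrative sketch, not part of this patch. Assumes the signature shown in the
    // following hunk: deserializeFromArrow(Iterator[Array[Byte]], AgnosticEncoder[T],
    // BufferAllocator): CloseableIterator[T].
    import org.apache.arrow.memory.RootAllocator
    import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.PrimitiveIntEncoder
    import org.apache.spark.sql.connect.client.arrow.ArrowDeserializers

    object DeserializeSketch {
      def main(args: Array[String]): Unit = {
        val allocator = new RootAllocator()
        // Batches would normally come from the serializer or a Spark Connect result;
        // an empty iterator keeps the sketch self-contained.
        val batches: Iterator[Array[Byte]] = Iterator.empty
        val rows = ArrowDeserializers.deserializeFromArrow(batches, PrimitiveIntEncoder, allocator)
        try {
          rows.foreach(println) // CloseableIterator[Int] behaves like a regular Iterator
        } finally {
          rows.close()          // releases the Arrow buffers backing the iterator
          allocator.close()
        }
      }
    }
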
@@ -54,7 +54,7 @@ object ArrowDeserializers { def deserializeFromArrow[T]( input: Iterator[Array[Byte]], encoder: AgnosticEncoder[T], - allocator: BufferAllocator): TypedDeserializingIterator[T] = { + allocator: BufferAllocator): CloseableIterator[T] = { try { val reader = new ConcatenatingArrowStreamReader( allocator, @@ -496,13 +496,8 @@ object ArrowDeserializers { } } -trait TypedDeserializingIterator[E] extends CloseableIterator[E] { - def encoder: AgnosticEncoder[E] - def schema: StructType = encoder.schema -} - -class EmptyDeserializingIterator[E](override val encoder: AgnosticEncoder[E]) - extends TypedDeserializingIterator[E] { +class EmptyDeserializingIterator[E](val encoder: AgnosticEncoder[E]) + extends CloseableIterator[E] { override def close(): Unit = () override def hasNext: Boolean = false override def next(): E = throw new NoSuchElementException() @@ -511,7 +506,7 @@ class EmptyDeserializingIterator[E](override val encoder: AgnosticEncoder[E]) class ArrowDeserializingIterator[E]( val encoder: AgnosticEncoder[E], private[this] val reader: ArrowReader) - extends TypedDeserializingIterator[E] { + extends CloseableIterator[E] { private[this] var index = 0 private[this] val root = reader.getVectorSchemaRoot private[this] val deserializer = ArrowDeserializers.deserializerFor(encoder, root) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala index 73c401c26cd5b..7e7eab036aed3 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala @@ -29,7 +29,7 @@ import org.apache.arrow.vector.VarBinaryVector import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkUnsupportedOperationException -import org.apache.spark.sql.Row +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.{DefinedByConstructorParams, JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BoxedIntEncoder, CalendarIntervalEncoder, DateEncoder, EncoderField, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, PrimitiveDoubleEncoder, PrimitiveFloatEncoder, RowEncoder, StringEncoder, TimestampEncoder, UDTEncoder} @@ -214,6 +214,17 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { assert(inspector.sizeInBytes > 0) } + test("deserializing empty iterator") { + withAllocator { allocator => + val iterator = ArrowDeserializers.deserializeFromArrow( + Iterator.empty, + singleIntEncoder, + allocator) + assert(iterator.isEmpty) + assert(allocator.getAllocatedMemory == 0) + } + } + test("single batch") { val inspector = new CountingBatchInspector roundTripAndCheckIdentical(singleIntEncoder, inspectBatch = inspector) { () => @@ -501,15 +512,22 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { val maybeNull = MaybeNull(11) Iterator.tabulate(100) { i => val bean = new JavaMapData - bean.setDummyToDoubleListMap(maybeNull { - val map = new util.HashMap[DummyBean, java.util.List[java.lang.Double]] - (0 until (i % 5)).foreach { j => - val dummy = new DummyBean - dummy.setBigInteger(maybeNull(java.math.BigInteger.valueOf(i * j))) + bean.setMetricMap(maybeNull { 
+ val map = new util.HashMap[String, util.List[java.lang.Double]] + (0 until (i % 20)).foreach { i => val values = Array.tabulate(i % 40) { j => Double.box(j.toDouble) } - map.put(dummy, maybeNull(util.Arrays.asList(values: _*))) + map.put("k" + i, maybeNull(util.Arrays.asList(values: _*))) + } + map + }) + bean.setDummyToStringMap(maybeNull { + val map = new util.HashMap[DummyBean, String] + (0 until (i % 5)).foreach { j => + val dummy = new DummyBean + dummy.setBigInteger(maybeNull(java.math.BigInteger.valueOf(i * j))) + map.put(dummy, maybeNull("s" + i + "v" + j)) } map }) @@ -643,6 +661,63 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { .add("Ca", "array") .add("Cb", "binary"))) + test("bind to schema") { + // Binds to a wider schema. The narrow schema has fewer (nested) fields, has a slightly + // different field order, and uses different cased names in a couple of places. + withAllocator { allocator => + val input = Row( + 887, + "foo", + Row(Seq(1, 7, 5), Array[Byte](8.toByte, 756.toByte), 5f), + Seq(Row(null, "a", false), Row(javaBigDecimal(57853, 10), "b", false))) + val expected = Row( + "foo", + Seq(Row(null, false), Row(javaBigDecimal(57853, 10), false)), + Row(Seq(1, 7, 5), Array[Byte](8.toByte, 756.toByte))) + val arrowBatches = serializeToArrow(Iterator.single(input), wideSchemaEncoder, allocator) + val result = ArrowDeserializers.deserializeFromArrow( + arrowBatches, + narrowSchemaEncoder, + allocator) + val actual = result.next() + assert(result.isEmpty) + assert(expected === actual) + result.close() + arrowBatches.close() + } + } + + test("unknown field") { + withAllocator { allocator => + val arrowBatches = serializeToArrow(Iterator.empty, narrowSchemaEncoder, allocator) + intercept[AnalysisException] { + ArrowDeserializers.deserializeFromArrow( + arrowBatches, + wideSchemaEncoder, + allocator) + } + arrowBatches.close() + } + } + + test("duplicate fields") { + val duplicateSchemaEncoder = toRowEncoder(new StructType() + .add("foO", "string") + .add("Foo", "string")) + val fooSchemaEncoder = toRowEncoder(new StructType() + .add("foo", "string")) + withAllocator { allocator => + val arrowBatches = serializeToArrow(Iterator.empty, duplicateSchemaEncoder, allocator) + intercept[AnalysisException] { + ArrowDeserializers.deserializeFromArrow( + arrowBatches, + fooSchemaEncoder, + allocator) + } + arrowBatches.close() + } + } + /* ******************************************************************** * * Arrow serialization/deserialization specific errors * ******************************************************************** */ @@ -801,17 +876,23 @@ case class MapData(intStringMap: Map[Int, String], metricMap: Map[String, Array[ class JavaMapData { @scala.beans.BeanProperty - var dummyToDoubleListMap: java.util.Map[DummyBean, java.util.List[java.lang.Double]] = _ + var dummyToStringMap: java.util.Map[DummyBean, String] = _ + + @scala.beans.BeanProperty + var metricMap: java.util.HashMap[String, java.util.List[java.lang.Double]] = _ def canEqual(other: Any): Boolean = other.isInstanceOf[JavaMapData] override def equals(other: Any): Boolean = other match { case that: JavaMapData if that canEqual this => - dummyToDoubleListMap == that.dummyToDoubleListMap + dummyToStringMap == that.dummyToStringMap && + metricMap == that.metricMap case _ => false } - override def hashCode(): Int = Objects.hashCode(dummyToDoubleListMap) + override def hashCode(): Int = { + java.util.Arrays.deepHashCode(Array(dummyToStringMap, metricMap)) + } } class DummyBean { From 
6d140a4f6c8d91678d382e726938a439069463a2 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Wed, 19 Jul 2023 03:45:37 -0400 Subject: [PATCH 16/16] Style --- .../client/arrow/ArrowEncoderSuite.scala | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala index 7e7eab036aed3..16eec3eee3110 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/arrow/ArrowEncoderSuite.scala @@ -216,10 +216,8 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { test("deserializing empty iterator") { withAllocator { allocator => - val iterator = ArrowDeserializers.deserializeFromArrow( - Iterator.empty, - singleIntEncoder, - allocator) + val iterator = + ArrowDeserializers.deserializeFromArrow(Iterator.empty, singleIntEncoder, allocator) assert(iterator.isEmpty) assert(allocator.getAllocatedMemory == 0) } @@ -675,10 +673,8 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { Seq(Row(null, false), Row(javaBigDecimal(57853, 10), false)), Row(Seq(1, 7, 5), Array[Byte](8.toByte, 756.toByte))) val arrowBatches = serializeToArrow(Iterator.single(input), wideSchemaEncoder, allocator) - val result = ArrowDeserializers.deserializeFromArrow( - arrowBatches, - narrowSchemaEncoder, - allocator) + val result = + ArrowDeserializers.deserializeFromArrow(arrowBatches, narrowSchemaEncoder, allocator) val actual = result.next() assert(result.isEmpty) assert(expected === actual) @@ -691,28 +687,24 @@ class ArrowEncoderSuite extends ConnectFunSuite with BeforeAndAfterAll { withAllocator { allocator => val arrowBatches = serializeToArrow(Iterator.empty, narrowSchemaEncoder, allocator) intercept[AnalysisException] { - ArrowDeserializers.deserializeFromArrow( - arrowBatches, - wideSchemaEncoder, - allocator) + ArrowDeserializers.deserializeFromArrow(arrowBatches, wideSchemaEncoder, allocator) } arrowBatches.close() } } test("duplicate fields") { - val duplicateSchemaEncoder = toRowEncoder(new StructType() - .add("foO", "string") - .add("Foo", "string")) - val fooSchemaEncoder = toRowEncoder(new StructType() - .add("foo", "string")) + val duplicateSchemaEncoder = toRowEncoder( + new StructType() + .add("foO", "string") + .add("Foo", "string")) + val fooSchemaEncoder = toRowEncoder( + new StructType() + .add("foo", "string")) withAllocator { allocator => val arrowBatches = serializeToArrow(Iterator.empty, duplicateSchemaEncoder, allocator) intercept[AnalysisException] { - ArrowDeserializers.deserializeFromArrow( - arrowBatches, - fooSchemaEncoder, - allocator) + ArrowDeserializers.deserializeFromArrow(arrowBatches, fooSchemaEncoder, allocator) } arrowBatches.close() } @@ -886,7 +878,7 @@ class JavaMapData { override def equals(other: Any): Boolean = other match { case that: JavaMapData if that canEqual this => dummyToStringMap == that.dummyToStringMap && - metricMap == that.metricMap + metricMap == that.metricMap case _ => false }
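
The "bind to schema", "unknown field" and "duplicate fields" tests restyled in this patch exercise the central binding rule: data serialized with a wider row encoder can be deserialized with a narrower one as long as every requested field exists, while missing or ambiguous fields fail with an AnalysisException. A condensed sketch of that round trip, written as it would sit inside this suite (it reuses the suite-local helpers toRowEncoder, serializeToArrow and withAllocator visible in the hunks above; the field names are invented and the test itself is not part of the patch), looks roughly like this:

    // Condensed, illustrative variant of the "bind to schema" test; not part of this patch.
    test("narrow read of a wide batch (sketch)") {
      val wideEncoder = toRowEncoder(new StructType().add("id", "int").add("name", "string"))
      val narrowEncoder = toRowEncoder(new StructType().add("name", "string"))
      withAllocator { allocator =>
        // Serialize a single row that carries both fields...
        val batches = serializeToArrow(Iterator.single(Row(1, "foo")), wideEncoder, allocator)
        // ...and read it back with an encoder that only knows about "name".
        val result = ArrowDeserializers.deserializeFromArrow(batches, narrowEncoder, allocator)
        assert(result.next() === Row("foo"))
        assert(result.isEmpty)
        result.close()
        batches.close()
      }
    }
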