From 52d96f7d71f3c6088e4a96f238cbe89082c436f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Helge=20Br=C3=BCgner?= <11437312+moredatapls@users.noreply.github.com> Date: Fri, 17 Feb 2023 22:53:30 +0100 Subject: [PATCH] Upgrade to Spark 3.3.0 (#197) * add jdbcutils class, upgrade to spark 3.3 * replace usages of createconnectionfactory with new createconnection * added integration test for the jdbc connection * refactored test dir * added read write integration test * bump package version * add version to the readme * add version to the other table in the readme * fix typo in readme * add newline * removed integration tests and readme changes, update profile name --- .github/workflows/scala.yml | 2 +- README.md | 25 ++++++++++--------- pom.xml | 6 ++--- .../sqlserver/jdbc/spark/DefaultSource.scala | 11 +++----- .../ReliableSingleInstanceStrategy.scala | 12 ++++----- .../jdbc/spark/utils/BulkCopyUtils.scala | 16 ++++++------ .../jdbc/spark/utils/DataPoolUtils.scala | 10 ++++---- .../jdbc/spark/utils/JdbcUtils.scala | 19 ++++++++++++++ .../{ => unit}/bulkwrite/DataSourceTest.scala | 8 +++--- test/scala_test/pom.xml | 4 +-- 10 files changed, 66 insertions(+), 47 deletions(-) create mode 100644 src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/JdbcUtils.scala rename src/test/scala/com/microsoft/sqlserver/jdbc/spark/{ => unit}/bulkwrite/DataSourceTest.scala (97%) diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index 3fc4d7ca..ebeca400 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macOS-latest] - profile: ['spark31'] + profile: ['spark33'] timeout-minutes: 15 steps: diff --git a/README.md b/README.md index f4f8d83a..d5496f34 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,12 @@ This library contains the source code for the Apache Spark Connector for SQL Ser There are three version sets of the connector available through Maven, a 2.4.x, a 3.0.x and a 3.1.x compatible version. 
All versions can be found [here](https://search.maven.org/search?q=spark-mssql-connector) and can be imported using the coordinates below: -| Connector | Maven Coordinate | Scala Version | -| --------- | ---------------- | ------------- | -| Spark 2.4.x compatible connnector | `com.microsoft.azure:spark-mssql-connector:1.0.2` | 2.11 | -| Spark 3.0.x compatible connnector | `com.microsoft.azure:spark-mssql-connector_2.12:1.1.0` | 2.12 | -| Spark 3.1.x compatible connnector | `com.microsoft.azure:spark-mssql-connector_2.12:1.2.0` | 2.12 | +| Connector | Maven Coordinate | Scala Version | +|----------------------------------|--------------------------------------------------------|---------------| +| Spark 2.4.x compatible connector | `com.microsoft.azure:spark-mssql-connector:1.0.2` | 2.11 | +| Spark 3.0.x compatible connector | `com.microsoft.azure:spark-mssql-connector_2.12:1.1.0` | 2.12 | +| Spark 3.1.x compatible connector | `com.microsoft.azure:spark-mssql-connector_2.12:1.2.0` | 2.12 | +| Spark 3.3.x compatible connector | `com.microsoft.azure:spark-mssql-connector_2.12:1.3.0` | 2.12 | ## Current Releases @@ -35,13 +36,13 @@ For main changes from previous releases and known issues please refer to [CHANGE * Reliable connector support for Sql Server Single Instance -| Component | Versions Supported | -| --------- | ------------------ | -| Apache Spark | 2.4.x, 3.0.x, 3.1.x | -| Scala | 2.11, 2.12 | -| Microsoft JDBC Driver for SQL Server | 8.4.1 | -| Microsoft SQL Server | SQL Server 2008 or later | -| Azure SQL Databases | Supported | +| Component | Versions Supported | +|--------------------------------------|----------------------------| +| Apache Spark | 2.4.x, 3.0.x, 3.1.x, 3.3.x | +| Scala | 2.11, 2.12 | +| Microsoft JDBC Driver for SQL Server | 8.4.1 | +| Microsoft SQL Server | SQL Server 2008 or later | +| Azure SQL Databases | Supported | *Note: Azure Synapse (Azure SQL DW) use is not tested with this connector. While it may work, there may be unintended consequences.* diff --git a/pom.xml b/pom.xml index 4611f55d..cfc55dee 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.microsoft.azure spark-mssql-connector jar - 1.2.0 + 1.3.0 ${project.groupId}:${project.artifactId} The Apache Spark Connector for SQL Server and Azure SQL is a high-performance connector that enables you to use transactional data in big data analytics and persists results for ad-hoc queries or reporting. 
https://github.com/microsoft/sql-spark-connector @@ -201,14 +201,14 @@ - spark31 + spark33 true 2.12 2.12.11 - 3.1.2 + 3.3.0 diff --git a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/DefaultSource.scala b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/DefaultSource.scala index ed29d1a8..ff3236ea 100644 --- a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/DefaultSource.scala +++ b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/DefaultSource.scala @@ -13,15 +13,12 @@ */ package com.microsoft.sqlserver.jdbc.spark -import java.sql.{Connection, ResultSet, SQLException} - +import com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils._ +import com.microsoft.sqlserver.jdbc.spark.utils.JdbcUtils.createConnection import org.apache.spark.internal.Logging -import org.apache.spark.sql.{SQLContext, DataFrame, SaveMode} -import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.createConnectionFactory import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.sources.BaseRelation - -import com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils._ +import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} /** * DefaultSource extends JDBCRelationProvider to provide a implmentation for MSSQLSpark connector. @@ -56,7 +53,7 @@ class DefaultSource extends JdbcRelationProvider with Logging { // if no user input app name provided, will use SparkMSSQLConnector:NotSpecified val applicationName = s"SparkMSSQLConnector:${parameters.getOrElse("applicationname", "NotSpecified")}" val options = new SQLServerBulkJdbcOptions(parameters + ("applicationname" -> applicationName)) - val conn = createConnectionFactory(options)() + val conn = createConnection(options) val df = repartitionDataFrame(rawDf, options) logInfo(s"JDBC Driver major/mior version " + diff --git a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/connectors/ReliableSingleInstanceStrategy.scala b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/connectors/ReliableSingleInstanceStrategy.scala index b4cb4e5b..fd20c5a6 100644 --- a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/connectors/ReliableSingleInstanceStrategy.scala +++ b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/connectors/ReliableSingleInstanceStrategy.scala @@ -13,14 +13,14 @@ */ package com.microsoft.sqlserver.jdbc.spark -import java.sql.{Connection, ResultSetMetaData, SQLException} - import com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils.{executeUpdate, savePartition} +import com.microsoft.sqlserver.jdbc.spark.utils.JdbcUtils.createConnection import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils -import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.createConnectionFactory import org.apache.spark.sql.{DataFrame, Row} +import java.sql.{Connection, SQLException} + /** * Implements the Reliable write strategy for Single Instances that's that's resilient to executor restart. * Write is implemented as a 2 phase commit and executor restarts do not result in duplicate inserts. 
@@ -49,7 +49,7 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging { appId: String): Unit = { logInfo("write : reliable write to single instance called") // Initialize - create connection and cleanup existing tables if any - val conn = createConnectionFactory(options)() + val conn = createConnection(options) val stagingTableList = getStagingTableNames(appId, options.dbtable, df.rdd.getNumPartitions) cleanupStagingTables(conn, stagingTableList, options) createStagingTables(conn, stagingTableList,options) @@ -125,7 +125,7 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging { dfColMetaData: Array[ColumnMetadata], options: SQLServerBulkJdbcOptions): Unit = { logDebug(s"idempotentInsertToTable : Started") - val conn = createConnectionFactory(options)() + val conn = createConnection(options) try { BulkCopyUtils.mssqlTruncateTable(conn, tableName) } catch { @@ -151,7 +151,7 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging { options: SQLServerBulkJdbcOptions): Unit = { logInfo("unionStagingTables: insert to final table") val insertStmt = stmtInsertWithUnion(stagingTableList, dfColMetadata, options) - val conn = createConnectionFactory(options)() + val conn = createConnection(options) executeUpdate(conn,insertStmt) } diff --git a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/BulkCopyUtils.scala b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/BulkCopyUtils.scala index 8b06e8e6..26a444ad 100644 --- a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/BulkCopyUtils.scala +++ b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/BulkCopyUtils.scala @@ -13,15 +13,15 @@ */ package com.microsoft.sqlserver.jdbc.spark -import java.sql.{Connection, ResultSet, ResultSetMetaData, SQLException} - +import com.microsoft.sqlserver.jdbc.spark.utils.JdbcUtils.createConnection +import com.microsoft.sqlserver.jdbc.{SQLServerBulkCopy, SQLServerBulkCopyOptions} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.apache.spark.sql.types.{ByteType, DataType, ShortType, StructType} +import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.{getSchema, schemaString} import org.apache.spark.sql.jdbc.JdbcDialects -import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.{createConnectionFactory, getSchema, schemaString} -import com.microsoft.sqlserver.jdbc.{SQLServerBulkCopy, SQLServerBulkCopyOptions} +import org.apache.spark.sql.types.{ByteType, ShortType} +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import java.sql.{Connection, ResultSet, ResultSetMetaData, SQLException} import scala.collection.mutable.ListBuffer /** @@ -35,7 +35,7 @@ object BulkCopyUtils extends Logging { * a connection, sets connection properties and does a BulkWrite. Called when writing data to * master instance and data pools both. URL in options is used to create the relevant connection. * - * @param itertor - iterator for row of the partition. + * @param iterator - iterator for row of the partition. 
* @param dfColMetadata - array of ColumnMetadata type * @param options - SQLServerBulkJdbcOptions with url for the connection */ @@ -47,7 +47,7 @@ object BulkCopyUtils extends Logging { options: SQLServerBulkJdbcOptions ): Unit = { logDebug("savePartition:Entered") - val conn = createConnectionFactory(options)() + val conn = createConnection(options) conn.setAutoCommit(false) conn.setTransactionIsolation(options.isolationLevel) var committed = false diff --git a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/DataPoolUtils.scala b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/DataPoolUtils.scala index e67a3988..91f8df3f 100644 --- a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/DataPoolUtils.scala +++ b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/DataPoolUtils.scala @@ -13,19 +13,19 @@ */ package com.microsoft.sqlserver.jdbc.spark +import com.microsoft.sqlserver.jdbc.spark.utils.JdbcUtils.createConnection + import java.net.{InetAddress, UnknownHostException} import java.nio.file.{Files, Paths} - import org.apache.spark.deploy.history.LogInfo import org.apache.spark.internal.Logging -import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.createConnectionFactory import org.json4s._ import org.json4s.jackson.JsonMethods._ import scala.annotation.tailrec import scala.io.Source -/** +/** * DataPoolUtils * */ @@ -38,8 +38,8 @@ object DataPoolUtils extends Logging { def getDataPoolNodeList(options:SQLServerBulkJdbcOptions): List[String] = { logInfo(s"Searching DMV for data pools \n") import scala.collection.mutable.ListBuffer - val stmt = createConnectionFactory(options)().createStatement() - var nodeList = new ListBuffer[String]() + val stmt = createConnection(options).createStatement() + val nodeList = new ListBuffer[String]() val query = s"select address from sys.dm_db_data_pool_nodes" try { val rs = stmt.executeQuery(query) diff --git a/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/JdbcUtils.scala b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/JdbcUtils.scala new file mode 100644 index 00000000..474ec873 --- /dev/null +++ b/src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/JdbcUtils.scala @@ -0,0 +1,19 @@ +package com.microsoft.sqlserver.jdbc.spark.utils + +import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions +import org.apache.spark.sql.jdbc.JdbcDialects + +import java.sql.Connection + +object JdbcUtils { + /** + * Creates a JDBC connection using the input JDBC options. + * @param options The options which are used to create the connection. + * @return A JDBC connection. + */ + def createConnection(options: JDBCOptions): Connection = { + val dialect = JdbcDialects.get(options.url) + val conn = dialect.createConnectionFactory(options)(-1) + conn + } +} diff --git a/src/test/scala/com/microsoft/sqlserver/jdbc/spark/bulkwrite/DataSourceTest.scala b/src/test/scala/com/microsoft/sqlserver/jdbc/spark/unit/bulkwrite/DataSourceTest.scala similarity index 97% rename from src/test/scala/com/microsoft/sqlserver/jdbc/spark/bulkwrite/DataSourceTest.scala rename to src/test/scala/com/microsoft/sqlserver/jdbc/spark/unit/bulkwrite/DataSourceTest.scala index 9445ffa5..b5e033d4 100644 --- a/src/test/scala/com/microsoft/sqlserver/jdbc/spark/bulkwrite/DataSourceTest.scala +++ b/src/test/scala/com/microsoft/sqlserver/jdbc/spark/unit/bulkwrite/DataSourceTest.scala @@ -11,12 +11,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.microsoft.sqlserver.jdbc.spark -import java.sql.Connection +package com.microsoft.sqlserver.jdbc.spark.unit.bulkwrite -import org.scalatest.matchers.should.Matchers +import com.microsoft.sqlserver.jdbc.spark.{BulkCopyUtils, DataPoolUtils, SQLServerBulkJdbcOptions} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.test.SharedSparkSession +import org.scalatest.matchers.should.Matchers + +import java.sql.Connection class DataSourceTest extends SparkFunSuite with Matchers with SharedSparkSession { diff --git a/test/scala_test/pom.xml b/test/scala_test/pom.xml index 0e0ddcd4..53743ccd 100644 --- a/test/scala_test/pom.xml +++ b/test/scala_test/pom.xml @@ -123,14 +123,14 @@ - spark31 + spark33 true 2.12 2.12.11 - 3.1.2 + 3.3.0
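
The call-site migration applied throughout this patch is mechanical: every former `createConnectionFactory(options)()` call from Spark's internal `org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils` is routed through the connector's new `utils.JdbcUtils.createConnection(options)` wrapper, which resolves the dialect from the JDBC URL and invokes the dialect's factory with `-1` (my understanding is that `-1` is the partition id Spark itself uses for driver-side connections in 3.3). Below is a minimal caller sketch, not code from this patch: the `withConnection` helper is hypothetical and only illustrates how DefaultSource, ReliableSingleInstanceStrategy, BulkCopyUtils, and DataPoolUtils now obtain connections from a single entry point.

```scala
import java.sql.Connection

import com.microsoft.sqlserver.jdbc.spark.SQLServerBulkJdbcOptions
import com.microsoft.sqlserver.jdbc.spark.utils.JdbcUtils.createConnection

object ConnectionExample {
  // Hypothetical helper: open a connection via the new wrapper and always close it.
  // SQLServerBulkJdbcOptions extends JDBCOptions, so it can be passed straight through.
  def withConnection[T](options: SQLServerBulkJdbcOptions)(body: Connection => T): T = {
    val conn = createConnection(options) // dialect.createConnectionFactory(options)(-1) under the hood
    try body(conn)
    finally conn.close()
  }
}
```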
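
Since the patch bumps the published artifact to 1.3.0 for Spark 3.3.x (per the README table above), here is a minimal end-to-end write sketch against the connector. It assumes the cluster already has `com.microsoft.azure:spark-mssql-connector_2.12:1.3.0` on the classpath and uses the data source name from the project's README; the server, database, table, and credentials are placeholders.

```scala
import org.apache.spark.sql.SparkSession

object WriteExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("mssql-connector-smoke-test").getOrCreate()

    // Small synthetic DataFrame to push through the connector.
    val df = spark.range(0, 1000).toDF("id")

    // Placeholder connection values; substitute real server, database, and credentials.
    df.write
      .format("com.microsoft.sqlserver.jdbc.spark")
      .mode("overwrite")
      .option("url", "jdbc:sqlserver://<server>:1433;databaseName=<database>")
      .option("dbtable", "dbo.connector_smoke_test")
      .option("user", "<user>")
      .option("password", "<password>")
      .save()

    spark.stop()
  }
}
```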