Merge pull request #1549 from apache/master
GulajavaMinistudio authored Sep 4, 2023
2 parents 3348846 + 60d8fc4 commit 2fd732a
Showing 141 changed files with 4,741 additions and 1,207 deletions.
1 change: 1 addition & 0 deletions .asf.yaml
@@ -36,3 +36,4 @@ notifications:
pullrequests: reviews@spark.apache.org
issues: reviews@spark.apache.org
commits: commits@spark.apache.org
jira_options: link label
12 changes: 6 additions & 6 deletions .github/workflows/build_and_test.yml
@@ -283,7 +283,7 @@ jobs:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
if: ${{ !success() }}
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
@@ -425,7 +425,7 @@ jobs:
run: |
if [[ "$MODULES_TO_TEST" != *"pyspark-ml"* ]] && [[ "$BRANCH" != "branch-3.5" ]]; then
# uninstall libraries dedicated for ML testing
python3.9 -m pip uninstall -y torch torchvision torcheval torchtnt tensorboard mlflow
python3.9 -m pip uninstall -y torch torchvision torcheval torchtnt tensorboard mlflow deepspeed
fi
if [ -f ./dev/free_disk_space_container ]; then
./dev/free_disk_space_container
@@ -470,7 +470,7 @@ jobs:
name: test-results-${{ matrix.modules }}--8-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
if: ${{ !success() }}
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-${{ matrix.modules }}--8-${{ inputs.hadoop }}-hive2.3
@@ -961,7 +961,7 @@ jobs:
name: test-results-tpcds--8-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
if: ${{ !success() }}
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-tpcds--8-${{ inputs.hadoop }}-hive2.3
@@ -1028,7 +1028,7 @@ jobs:
name: test-results-docker-integration--8-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
if: ${{ !success() }}
uses: actions/upload-artifact@v3
with:
name: unit-tests-log-docker-integration--8-${{ inputs.hadoop }}-hive2.3
@@ -1103,7 +1103,7 @@ jobs:
eval $(minikube docker-env)
build/sbt -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
- name: Upload Spark on K8S integration tests log files
if: failure()
if: ${{ !success() }}
uses: actions/upload-artifact@v3
with:
name: spark-on-kubernetes-it-log
@@ -178,6 +178,13 @@ public static BloomFilter readFrom(InputStream in) throws IOException {
return BloomFilterImpl.readFrom(in);
}

/**
* Reads in a {@link BloomFilter} from a byte array.
*/
public static BloomFilter readFrom(byte[] bytes) throws IOException {
return BloomFilterImpl.readFrom(bytes);
}

/**
* Computes the optimal k (number of hashes per item inserted in Bloom filter), given the
* expected insertions and total number of bits in the Bloom filter.
@@ -266,6 +266,12 @@ public static BloomFilterImpl readFrom(InputStream in) throws IOException {
return filter;
}

public static BloomFilterImpl readFrom(byte[] bytes) throws IOException {
  // Wrap the array in a stream and delegate to the InputStream overload.
  try (ByteArrayInputStream bis = new ByteArrayInputStream(bytes)) {
    return readFrom(bis);
  }
}

private void writeObject(ObjectOutputStream out) throws IOException {
writeTo(out);
}
@@ -17,7 +17,7 @@

package org.apache.spark.util.sketch

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.io.ByteArrayOutputStream

import scala.reflect.ClassTag
import scala.util.Random
@@ -34,9 +34,7 @@ class BloomFilterSuite extends AnyFunSuite { // scalastyle:ignore funsuite
filter.writeTo(out)
out.close()

val in = new ByteArrayInputStream(out.toByteArray)
val deserialized = BloomFilter.readFrom(in)
in.close()
val deserialized = BloomFilter.readFrom(out.toByteArray)

assert(filter == deserialized)
}
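For reference, a minimal round-trip sketch using the new byte-array overload; BloomFilter.create, putLong, writeTo, and mightContainLong are pre-existing API in org.apache.spark.util.sketch, and the sizes here are arbitrary:

import java.io.ByteArrayOutputStream

import org.apache.spark.util.sketch.BloomFilter

// Build a small filter and serialize it to a byte array.
val filter = BloomFilter.create(1000)
(0L until 100L).foreach(filter.putLong)
val out = new ByteArrayOutputStream()
filter.writeTo(out)

// The new overload reads straight from the bytes, so callers no longer
// create and close a ByteArrayInputStream themselves.
val restored = BloomFilter.readFrom(out.toByteArray)
assert(restored.mightContainLong(42L))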
28 changes: 23 additions & 5 deletions common/utils/src/main/resources/error/error-classes.json
@@ -1035,6 +1035,11 @@
"Cannot safely cast <colName> <srcType> to <targetType>."
]
},
"EXTRA_COLUMNS" : {
"message" : [
"Cannot write extra columns <extraColumns>."
]
},
"EXTRA_STRUCT_FIELDS" : {
"message" : [
"Cannot write extra fields <extraFields> to the struct <colName>."
@@ -2215,6 +2220,12 @@
],
"sqlState" : "42607"
},
"NON_FOLDABLE_ARGUMENT" : {
"message" : [
"The function <funcName> requires the parameter <paramName> to be a foldable expression of the type <paramType>, but the actual argument is a non-foldable."
],
"sqlState" : "22024"
},
"NON_LAST_MATCHED_CLAUSE_OMIT_CONDITION" : {
"message" : [
"When there are more than one MATCHED clauses in a MERGE statement, only the last MATCHED clause can omit the condition."
@@ -2687,6 +2698,18 @@
"Failed to analyze the Python user defined table function: <msg>"
]
},
"TABLE_VALUED_FUNCTION_REQUIRED_METADATA_INCOMPATIBLE_WITH_CALL" : {
"message" : [
"Failed to evaluate the table function <functionName> because its table metadata <requestedMetadata>, but the function call <invalidFunctionCallProperty>."
],
"sqlState" : "22023"
},
"TABLE_VALUED_FUNCTION_REQUIRED_METADATA_INVALID" : {
"message" : [
"Failed to evaluate the table function <functionName> because its table metadata was invalid; <reason>."
],
"sqlState" : "22023"
},
"TABLE_VALUED_FUNCTION_TOO_MANY_TABLE_ARGUMENTS" : {
"message" : [
"There are too many table arguments for table-valued function. It allows one table argument, but got: <num>. If you want to allow it, please set \"spark.sql.allowMultipleTableArguments.enabled\" to \"true\""
@@ -4029,11 +4052,6 @@
"<funcName>() doesn't support the <mode> mode. Acceptable modes are <permissiveMode> and <failFastMode>."
]
},
"_LEGACY_ERROR_TEMP_1100" : {
"message" : [
"The '<argName>' parameter of function '<funcName>' needs to be a <requiredType> literal."
]
},
"_LEGACY_ERROR_TEMP_1103" : {
"message" : [
"Unsupported component type <clz> in arrays."
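As a loose illustration of the new NON_FOLDABLE_ARGUMENT class (this diff does not show which functions raise it, so the function and table below are hypothetical), the call shape it covers is a parameter that must be a constant receiving a per-row value:

// Hypothetical repro: fmt varies per row, so it is not foldable, and a
// function that requires a literal format string now reports
// NON_FOLDABLE_ARGUMENT instead of the removed _LEGACY_ERROR_TEMP_1100.
val df = spark.table("t") // assumed table with columns x and fmt
df.selectExpr("to_char(x, fmt)")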
@@ -987,7 +987,7 @@ object functions {
* @group agg_funcs
* @since 3.5.0
*/
def std(e: Column): Column = stddev(e)
def std(e: Column): Column = Column.fn("std", e)

/**
* Aggregate function: alias for `stddev_samp`.
@@ -2337,15 +2337,15 @@ object functions {
* @group math_funcs
* @since 3.5.0
*/
def ceiling(e: Column, scale: Column): Column = ceil(e, scale)
def ceiling(e: Column, scale: Column): Column = Column.fn("ceiling", e, scale)

/**
* Computes the ceiling of the given value of `e` to 0 decimal places.
*
* @group math_funcs
* @since 3.5.0
*/
def ceiling(e: Column): Column = ceil(e)
def ceiling(e: Column): Column = Column.fn("ceiling", e)

/**
* Convert a number in a string column from one base to another.
@@ -2800,7 +2800,7 @@ object functions {
* @group math_funcs
* @since 3.5.0
*/
def power(l: Column, r: Column): Column = pow(l, r)
def power(l: Column, r: Column): Column = Column.fn("power", l, r)

/**
* Returns the positive value of dividend mod divisor.
@@ -2937,7 +2937,7 @@ object functions {
* @group math_funcs
* @since 3.5.0
*/
def sign(e: Column): Column = signum(e)
def sign(e: Column): Column = Column.fn("sign", e)

/**
* Computes the signum of the given value.
@@ -4428,7 +4428,7 @@
* @since 3.5.0
*/
def printf(format: Column, arguments: Column*): Column =
Column.fn("format_string", lit(format) +: arguments: _*)
Column.fn("printf", (format +: arguments): _*)

/**
* Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding
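The practical effect of switching these helpers from aliases to Column.fn, visible in the updated golden files below, is that each function now keeps its own name in the resolved plan and in the generated column name. A minimal sketch, assuming a running SparkSession named spark:

import org.apache.spark.sql.functions._

val df = spark.range(3).selectExpr("cast(id as double) as b")

// Previously ceiling(e) delegated to ceil(e), so the output column was
// named "CEIL(b)"; with Column.fn("ceiling", ...) it stays "ceiling(b)".
df.select(ceiling(col("b"))).columns // Array("ceiling(b)")
df.select(sign(col("b"))).columns // Array("sign(b)"), not "SIGNUM(b)"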
@@ -3235,16 +3235,19 @@ class PlanGenerationTestSuite
private val testDescFilePath: String = s"${IntegrationTestUtils.sparkHome}/connector/" +
"connect/common/src/test/resources/protobuf-tests/common.desc"

test("from_protobuf messageClassName") {
binary.select(
pbFn.from_protobuf(fn.col("bytes"), "org.apache.spark.sql.protobuf.protos.TestProtoObj"))
// TODO(SPARK-45030): Re-enable this test when all Maven test scenarios succeed and there
// are no other negative impacts. For the problem description, please refer to SPARK-45029
ignore("from_protobuf messageClassName") {
binary.select(pbFn.from_protobuf(fn.col("bytes"), classOf[StorageLevel].getName))
}

test("from_protobuf messageClassName options") {
// TODO(SPARK-45030): Re-enable this test when all Maven test scenarios succeed and there
// are no other negative impacts. For the problem description, please refer to SPARK-45029
ignore("from_protobuf messageClassName options") {
binary.select(
pbFn.from_protobuf(
fn.col("bytes"),
"org.apache.spark.sql.protobuf.protos.TestProtoObj",
classOf[StorageLevel].getName,
Map("recursive.fields.max.depth" -> "2").asJava))
}

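For context, the messageClassName variant exercised above takes the fully qualified name of a compiled protobuf class on the classpath (StorageLevel is only a stand-in). A hedged sketch of the alternative descriptor-file form, reusing testDescFilePath from the top of this hunk with a hypothetical message name:

// Descriptor-file form; "TestProto" is a hypothetical message name
// inside common.desc.
binary.select(pbFn.from_protobuf(fn.col("bytes"), "TestProto", testDescFilePath))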
@@ -208,6 +208,8 @@ object CheckConnectJvmClientCompatibility {
// functions
ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.unwrap_udt"),
ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.udaf"),
ProblemFilters.exclude[DirectMissingMethodProblem](
"org.apache.spark.sql.functions.try_reflect"),

// KeyValueGroupedDataset
ProblemFilters.exclude[Problem](
@@ -1,6 +1,6 @@
Project [summary#0, element_at(id#0, summary#0, None, false) AS id#0, element_at(b#0, summary#0, None, false) AS b#0]
+- Project [id#0, b#0, summary#0]
+- Generate explode([count,mean,stddev,min,max]), false, [summary#0]
+- Aggregate [map(cast(count as string), cast(count(id#0L) as string), cast(mean as string), cast(avg(id#0L) as string), cast(stddev as string), cast(stddev_samp(cast(id#0L as double)) as string), cast(min as string), cast(min(id#0L) as string), cast(max as string), cast(max(id#0L) as string)) AS id#0, map(cast(count as string), cast(count(b#0) as string), cast(mean as string), cast(avg(b#0) as string), cast(stddev as string), cast(stddev_samp(b#0) as string), cast(min as string), cast(min(b#0) as string), cast(max as string), cast(max(b#0) as string)) AS b#0]
+- Aggregate [map(cast(count as string), cast(count(id#0L) as string), cast(mean as string), cast(avg(id#0L) as string), cast(stddev as string), cast(stddev(cast(id#0L as double)) as string), cast(min as string), cast(min(id#0L) as string), cast(max as string), cast(max(id#0L) as string)) AS id#0, map(cast(count as string), cast(count(b#0) as string), cast(mean as string), cast(avg(b#0) as string), cast(stddev as string), cast(stddev(b#0) as string), cast(min as string), cast(min(b#0) as string), cast(max as string), cast(max(b#0) as string)) AS b#0]
+- Project [id#0L, b#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0]

This file was deleted.

This file was deleted.

@@ -1,2 +1,2 @@
Project [CEIL(b#0) AS CEIL(b)#0L]
Project [ceiling(b#0) AS ceiling(b)#0L]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
Project [ceil(cast(b#0 as decimal(30,15)), 2) AS ceil(b, 2)#0]
Project [ceiling(cast(b#0 as decimal(30,15)), 2) AS ceiling(b, 2)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
Project [java_method(java.util.UUID, fromString, g#0) AS java_method(java.util.UUID, fromString, g)#0]
Project [java_method(java.util.UUID, fromString, g#0, true) AS java_method(java.util.UUID, fromString, g)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
Project [format_string(g#0, a#0, g#0) AS format_string(g, a, g)#0]
Project [printf(g#0, a#0, g#0) AS printf(g, a, g)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
Project [reflect(java.util.UUID, fromString, g#0) AS reflect(java.util.UUID, fromString, g)#0]
Project [reflect(java.util.UUID, fromString, g#0, true) AS reflect(java.util.UUID, fromString, g)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
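The trailing true added to the reflect and java_method plans above presumably corresponds to a fail-on-error flag that separates them from the new try_reflect, which is excluded from the compatibility check earlier in this diff. A hedged sketch of the difference, assuming a SparkSession named spark:

// reflect raises an error when the reflective call throws;
// try_reflect returns NULL for that row instead.
spark.sql("SELECT reflect('java.util.UUID', 'fromString', 'oops')").show() // fails
spark.sql("SELECT try_reflect('java.util.UUID', 'fromString', 'oops')").show() // NULL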
@@ -1,2 +1,2 @@
Project [SIGNUM(b#0) AS SIGNUM(b)#0]
Project [sign(b#0) AS sign(b)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
Aggregate [stddev(cast(a#0 as double)) AS stddev(a)#0]
Aggregate [std(cast(a#0 as double)) AS std(a)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]

This file was deleted.

Binary file not shown.

This file was deleted.

Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "ceil",
"functionName": "ceiling",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "b"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "ceil",
"functionName": "ceiling",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "b"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "format_string",
"functionName": "printf",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
