Merge pull request #1561 from apache/master
GulajavaMinistudio authored Sep 21, 2023
2 parents bb9cdba + a2bab5e commit f093827
Showing 99 changed files with 2,650 additions and 1,594 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/build_and_test.yml
@@ -249,7 +249,7 @@ jobs:
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: ${{ matrix.java }}
       - name: Install Python 3.8
         uses: actions/setup-python@v4
@@ -435,7 +435,7 @@ jobs:
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: ${{ matrix.java }}
       - name: List Python packages (Python 3.9, PyPy3)
         run: |
@@ -539,7 +539,7 @@ jobs:
       - name: Install Java ${{ inputs.java }}
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: ${{ inputs.java }}
       - name: Run tests
         env: ${{ fromJSON(inputs.envs) }}
@@ -653,7 +653,7 @@ jobs:
       - name: Install Java 8
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: 8
       - name: License test
         run: ./dev/check-license
@@ -780,7 +780,7 @@ jobs:
         java:
          - 11
          - 17
-         - 21-ea
+         - 21
     runs-on: ubuntu-22.04
     timeout-minutes: 300
     steps:
@@ -817,7 +817,7 @@ jobs:
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: ${{ matrix.java }}
       - name: Build with Maven
         run: |
@@ -868,7 +868,7 @@ jobs:
       - name: Install Java 8
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: 8
       - name: Build with SBT
         run: |
@@ -919,7 +919,7 @@ jobs:
       - name: Install Java 8
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: 8
       - name: Cache TPC-DS generated data
         id: cache-tpcds-sf-1
@@ -1025,7 +1025,7 @@ jobs:
       - name: Install Java 8
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: 8
       - name: Run tests
         run: |
@@ -1084,7 +1084,7 @@ jobs:
       - name: Install Java ${{ inputs.java }}
         uses: actions/setup-java@v3
         with:
-          distribution: temurin
+          distribution: zulu
           java-version: ${{ inputs.java }}
       - name: start minikube
         run: |
4 changes: 2 additions & 2 deletions .github/workflows/build_java21.yml
@@ -17,7 +17,7 @@
 # under the License.
 #
 
-name: "Build (master, Scala 2.12, Hadoop 3, JDK 21-ea)"
+name: "Build (master, Scala 2.12, Hadoop 3, JDK 21)"
 
 on:
   schedule:
@@ -31,7 +31,7 @@ jobs:
     uses: ./.github/workflows/build_and_test.yml
     if: github.repository == 'apache/spark'
     with:
-      java: 21-ea
+      java: 21
       branch: master
       hadoop: hadoop3
       envs: >-
2 changes: 2 additions & 0 deletions R/pkg/R/functions.R
@@ -2894,6 +2894,8 @@ setMethod("from_json", signature(x = "Column", schema = "characterOrstructTypeOr
           # treated as struct or element type of array in order to make it more
           # R-friendly.
           if (class(schema) == "Column") {
+            df <- createDataFrame(list(list(0)))
+            jschema <- collect(select(df, schema))[[1]][[1]]
             jschema <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                                    "createArrayType",
                                    jschema)
12 changes: 12 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
@@ -314,6 +314,18 @@
       "<details>"
     ]
   },
+  "CANNOT_WRITE_STATE_STORE" : {
+    "message" : [
+      "Error writing state store files for provider <providerClass>."
+    ],
+    "subClass" : {
+      "CANNOT_COMMIT" : {
+        "message" : [
+          "Cannot perform commit during state checkpoint."
+        ]
+      }
+    }
+  },
   "CAST_INVALID_INPUT" : {
     "message" : [
       "The value <expression> of the type <sourceType> cannot be cast to <targetType> because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. If necessary set <ansiConfig> to \"false\" to bypass this error."
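
For illustration, the new error class could be raised through SparkException's error-class constructor roughly as below. This is a hedged sketch, not code from this commit: the helper name, the provider string, and the cause are placeholders.

  import org.apache.spark.SparkException

  // Hypothetical sketch: report a commit failure during state checkpointing
  // using the new error class and its CANNOT_COMMIT sub-class.
  def throwCannotCommit(providerClass: String, cause: Throwable): Nothing = {
    throw new SparkException(
      errorClass = "CANNOT_WRITE_STATE_STORE.CANNOT_COMMIT",
      messageParameters = Map("providerClass" -> providerClass),
      cause = cause)
  }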
@@ -30,6 +30,7 @@ import org.apache.spark.sql.errors.DataTypeErrors
 import org.apache.spark.sql.expressions.{ScalarUserDefinedFunction, UserDefinedFunction}
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.types.DataType.parseTypeWithFallback
+import org.apache.spark.util.SparkClassUtils
 
 /**
  * Commonly used functions available for DataFrame operations. Using functions defined here
@@ -1831,7 +1832,7 @@ object functions {
    * @group normal_funcs
    * @since 3.4.0
    */
-  def rand(): Column = Column.fn("rand")
+  def rand(): Column = Column.fn("rand", lit(SparkClassUtils.random.nextLong))
 
   /**
    * Generate a column with independent and identically distributed (i.i.d.) samples from the
@@ -1855,7 +1856,7 @@ object functions {
    * @group normal_funcs
    * @since 3.4.0
    */
-  def randn(): Column = Column.fn("randn")
+  def randn(): Column = Column.fn("randn", lit(SparkClassUtils.random.nextLong))
 
   /**
    * Partition ID.
@@ -3392,7 +3393,7 @@ object functions {
    * @group misc_funcs
    * @since 3.5.0
    */
-  def uuid(): Column = Column.fn("uuid")
+  def uuid(): Column = Column.fn("uuid", lit(SparkClassUtils.random.nextLong))
 
   /**
    * Returns an encrypted value of `input` using AES in given `mode` with the specified `padding`.
@@ -3711,7 +3712,7 @@ object functions {
    * @group misc_funcs
    * @since 3.5.0
    */
-  def random(): Column = Column.fn("random")
+  def random(): Column = Column.fn("random", lit(SparkClassUtils.random.nextLong))
 
   /**
    * Returns the bit position for the given input column.
@@ -7069,7 +7070,7 @@ object functions {
    * @group collection_funcs
    * @since 3.4.0
    */
-  def shuffle(e: Column): Column = Column.fn("shuffle", e)
+  def shuffle(e: Column): Column = Column.fn("shuffle", e, lit(SparkClassUtils.random.nextLong))
 
   /**
    * Returns a reversed string or an array with reverse order of elements.
@@ -7102,7 +7103,8 @@ object functions {
    * @group collection_funcs
    * @since 3.4.0
    */
-  def sequence(start: Column, stop: Column): Column = sequence(start, stop, lit(1L))
+  def sequence(start: Column, stop: Column): Column =
+    Column.fn("sequence", start, stop)
 
   /**
    * Creates an array containing the left argument repeated the number of times given by the right
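
The effect of the client-drawn seed can be sketched as follows (illustrative only; `df` stands for any DataFrame). Reusing one Column instance repeats the same seed, while each new call draws a fresh one:

  val r = rand()              // a seed is drawn once, when the Column is built
  df.select(r, r.as("copy"))  // both output columns agree on every row
  df.select(rand(), rand())   // two calls, two seeds: the columns are independent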
@@ -1296,6 +1296,24 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM
       assert(rc == 100)
     }
   }
+
+  test("SPARK-45216: Non-deterministic functions with seed") {
+    val session: SparkSession = spark
+    import session.implicits._
+
+    val df = Seq(Array.range(0, 10)).toDF("a")
+
+    val r = rand()
+    val r2 = randn()
+    val r3 = random()
+    val r4 = uuid()
+    val r5 = shuffle(col("a"))
+    df.select(r, r.as("r"), r2, r2.as("r2"), r3, r3.as("r3"), r4, r4.as("r4"), r5, r5.as("r5"))
+      .collect
+      .foreach { row =>
+        (0 until 5).foreach(i => assert(row.get(i * 2) === row.get(i * 2 + 1)))
+      }
+  }
 }
 
 private[sql] case class ClassData(a: String, b: Int)
@@ -218,7 +218,6 @@ class FunctionTestSuite extends ConnectFunSuite {
     to_json(a, Collections.emptyMap[String, String]),
     to_json(a, Map.empty[String, String]))
   testEquals("sort_array", sort_array(a), sort_array(a, asc = true))
-  testEquals("sequence", sequence(lit(1), lit(10)), sequence(lit(1), lit(10), lit(1L)))
   testEquals(
     "from_csv",
     from_csv(a, lit(schema.toDDL), Collections.emptyMap[String, String]),
@@ -279,14 +278,14 @@ class FunctionTestSuite extends ConnectFunSuite {
     assert(e.hasUnresolvedFunction)
     val fn = e.getUnresolvedFunction
     assert(fn.getFunctionName == "rand")
-    assert(fn.getArgumentsCount == 0)
+    assert(fn.getArgumentsCount == 1)
   }
 
   test("randn no seed") {
     val e = randn().expr
     assert(e.hasUnresolvedFunction)
     val fn = e.getUnresolvedFunction
     assert(fn.getFunctionName == "randn")
-    assert(fn.getArgumentsCount == 0)
+    assert(fn.getArgumentsCount == 1)
   }
 }
@@ -778,6 +778,60 @@ message ReleaseExecuteResponse {
   optional string operation_id = 2;
 }
 
+message FetchErrorDetailsRequest {
+
+  // (Required)
+  // The session_id specifies a Spark session for a user identified by user_context.user_id.
+  // The id should be a UUID string of the format `00112233-4455-6677-8899-aabbccddeeff`.
+  string session_id = 1;
+
+  // User context
+  UserContext user_context = 2;
+
+  // (Required)
+  // The id of the error.
+  string error_id = 3;
+}
+
+message FetchErrorDetailsResponse {
+
+  message StackTraceElement {
+    // The fully qualified name of the class containing the execution point.
+    string declaring_class = 1;
+
+    // The name of the method containing the execution point.
+    string method_name = 2;
+
+    // The name of the file containing the execution point.
+    string file_name = 3;
+
+    // The line number of the source line containing the execution point.
+    int32 line_number = 4;
+  }
+
+  // Error defines the schema for representing an exception.
+  message Error {
+    // The fully qualified names of the exception class and its parent classes.
+    repeated string error_type_hierarchy = 1;
+
+    // The detailed message of the exception.
+    string message = 2;
+
+    // The stack trace of the exception. It will be set
+    // if the SQLConf spark.sql.connect.serverStacktrace.enabled is true.
+    repeated StackTraceElement stack_trace = 3;
+
+    // The index of the cause error in errors.
+    optional int32 cause_idx = 4;
+  }
+
+  // The index of the root error in errors. The field will not be set if the error is not found.
+  optional int32 root_error_idx = 1;
+
+  // A list of errors.
+  repeated Error errors = 2;
+}
+
 // Main interface for the SparkConnect service.
 service SparkConnectService {
 
@@ -813,5 +867,8 @@ service SparkConnectService {
   // Non reattachable executions are released automatically and immediately after the ExecutePlan
   // RPC and ReleaseExecute may not be used.
   rpc ReleaseExecute(ReleaseExecuteRequest) returns (ReleaseExecuteResponse) {}
+
+  // FetchErrorDetails retrieves the matched exception with details based on a provided error id.
+  rpc FetchErrorDetails(FetchErrorDetailsRequest) returns (FetchErrorDetailsResponse) {}
 }

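As a sketch of how a client might consume this response, the cause chain can be walked from root_error_idx through cause_idx. This is an assumption, not code from this commit; it presumes the standard protobuf-java accessors generated from the messages above.

  // Hypothetical sketch: print the exception chain in a FetchErrorDetailsResponse.
  def printErrorChain(resp: FetchErrorDetailsResponse): Unit = {
    if (resp.hasRootErrorIdx) { // unset means the error was not found on the server
      var idx = resp.getRootErrorIdx
      var more = true
      while (more) {
        val err = resp.getErrors(idx)
        val topClass =
          if (err.getErrorTypeHierarchyCount > 0) err.getErrorTypeHierarchy(0) else "<unknown>"
        println(s"$topClass: ${err.getMessage}")
        (0 until err.getStackTraceCount).foreach { i =>
          val el = err.getStackTrace(i)
          println(s"  at ${el.getDeclaringClass}.${el.getMethodName}" +
            s"(${el.getFileName}:${el.getLineNumber})")
        }
        more = err.hasCauseIdx // follow the cause chain until it ends
        if (more) idx = err.getCauseIdx
      }
    }
  }
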
@@ -1,2 +1,2 @@
-Project [arrays_zip(e#0, sequence(cast(1 as bigint), cast(20 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles)), e, 1) AS arrays_zip(e, sequence(1, 20, 1))#0]
+Project [arrays_zip(e#0, sequence(1, 20, None, Some(America/Los_Angeles)), e, 1) AS arrays_zip(e, sequence(1, 20))#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [concat(cast(e#0 as array<bigint>), cast(array(1, 2) as array<bigint>), sequence(cast(33 as bigint), cast(40 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles))) AS concat(e, array(1, 2), sequence(33, 40, 1))#0]
+Project [concat(e#0, array(1, 2), sequence(33, 40, None, Some(America/Los_Angeles))) AS concat(e, array(1, 2), sequence(33, 40))#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [flatten(array(cast(e#0 as array<bigint>), sequence(cast(1 as bigint), cast(10 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles)))) AS flatten(array(e, sequence(1, 10, 1)))#0]
+Project [flatten(array(e#0, sequence(1, 10, None, Some(America/Los_Angeles)))) AS flatten(array(e, sequence(1, 10)))#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [sequence(cast(1 as bigint), cast(10 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles)) AS sequence(1, 10, 1)#0]
+Project [sequence(1, 10, None, Some(America/Los_Angeles)) AS sequence(1, 10)#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -29,10 +29,6 @@
         "literal": {
           "integer": 20
         }
-      }, {
-        "literal": {
-          "long": "1"
-        }
       }]
     }
   }]
Binary file not shown.
@@ -42,10 +42,6 @@
         "literal": {
           "integer": 40
         }
-      }, {
-        "literal": {
-          "long": "1"
-        }
       }]
     }
   }]
Binary file not shown.
@@ -32,10 +32,6 @@
         "literal": {
           "integer": 10
         }
-      }, {
-        "literal": {
-          "long": "1"
-        }
       }]
     }
   }]
Binary file not shown.
@@ -22,10 +22,6 @@
         "literal": {
           "integer": 10
         }
-      }, {
-        "literal": {
-          "long": "1"
-        }
       }]
     }
   }]
Binary file not shown.