State schema tws #12

Closed
wants to merge 925 commits into from
Changes from all commits
925 commits
82b4ad2
[SPARK-46393][SQL][FOLLOWUP] Classify exceptions in JDBCTableCatalog.…
cloud-fan Jun 7, 2024
9491292
[SPARK-48548][BUILD] Add LICENSE/NOTICE for spark-core with shaded de…
yaooqinn Jun 7, 2024
b7d9c31
Revert "[SPARK-46393][SQL][FOLLOWUP] Classify exceptions in JDBCTable…
yaooqinn Jun 7, 2024
87b0f59
[SPARK-48561][PS][CONNECT] Throw `PandasNotImplementedError` for unsu…
zhengruifeng Jun 7, 2024
d81b1e3
[SPARK-48559][SQL] Fetch globalTempDatabase name directly without inv…
willwwt Jun 7, 2024
8911d59
[SPARK-46393][SQL][FOLLOWUP] Classify exceptions in JDBCTableCatalog.…
panbingkun Jun 7, 2024
201df0d
[MINOR][PYTHON][TESTS] Move a test out of parity tests
zhengruifeng Jun 7, 2024
24bce72
[SPARK-48012][SQL] SPJ: Support Transfrom Expressions for One Side Sh…
szehon-ho Jun 9, 2024
d9394ee
[SPARK-48560][SS][PYTHON] Make StreamingQueryListener.spark settable
HyukjinKwon Jun 9, 2024
1901669
[SPARK-48564][PYTHON][CONNECT] Propagate cached schema in set operations
zhengruifeng Jun 10, 2024
61fd936
[SPARK-48403][SQL] Fix Lower & Upper expressions for UTF8_BINARY_LCAS…
uros-db Jun 10, 2024
3857a9d
[SPARK-48410][SQL] Fix InitCap expression for UTF8_BINARY_LCASE & ICU…
uros-db Jun 10, 2024
ec6db63
[SPARK-48569][SS][CONNECT] Handle edge cases in query.name
WweiL Jun 10, 2024
5a2f374
[SPARK-48544][SQL] Reduce memory pressure of empty TreeNode BitSets
n-young-db Jun 10, 2024
3fe6abd
[SPARK-48563][BUILD] Upgrade `pickle` to 1.5
LuciferYang Jun 11, 2024
1e4750e
[SPARK-47500][PYTHON][CONNECT][FOLLOWUP] Restore error message for `D…
zhengruifeng Jun 11, 2024
53d65fd
[SPARK-48565][UI] Fix thread dump display in UI
pan3793 Jun 11, 2024
452c1b6
[SPARK-48551][SQL] Perf improvement for escapePathName
yaooqinn Jun 11, 2024
df4156a
[SPARK-48372][SPARK-45716][PYTHON][FOLLOW-UP] Remove unused helper me…
zhengruifeng Jun 11, 2024
583ab05
[SPARK-47415][SQL] Add collation support for Levenshtein expression
uros-db Jun 11, 2024
224ba16
[SPARK-48556][SQL] Fix incorrect error message pointing to UNSUPPORTE…
nikolamand-db Jun 11, 2024
aad6771
[SPARK-48576][SQL] Rename UTF8_BINARY_LCASE to UTF8_LCASE
uros-db Jun 11, 2024
6107836
[SPARK-48576][SQL][FOLLOWUP] Rename UTF8_BINARY_LCASE to UTF8_LCASE
uros-db Jun 11, 2024
82a84ed
[SPARK-46937][SQL] Revert "[] Improve concurrency performance for Fun…
cloud-fan Jun 11, 2024
72df3cb
[SPARK-48582][BUILD] Upgrade `braces` from 3.0.2 to 3.0.3 in ui-test
LuciferYang Jun 12, 2024
334816a
[SPARK-48411][SS][PYTHON] Add E2E test for DropDuplicateWithinWatermark
eason-yuchen-liu Jun 12, 2024
8870efc
[SPARK-48581][BUILD] Upgrade dropwizard metrics to 4.2.26
wayneguow Jun 12, 2024
da81d8e
[SPARK-48584][SQL] Perf improvement for unescapePathName
yaooqinn Jun 12, 2024
a3625a9
[SPARK-48595][CORE] Cleanup deprecated api usage related to `commons-…
LuciferYang Jun 12, 2024
b5e1b79
[SPARK-48596][SQL] Perf improvement for calculating hex string for long
yaooqinn Jun 12, 2024
2d0b122
[SPARK-48594][PYTHON][CONNECT] Rename `parent` field to `child` in `C…
zhengruifeng Jun 12, 2024
d1d29c9
[SPARK-48598][PYTHON][CONNECT] Propagate cached schema in dataframe o…
zhengruifeng Jun 12, 2024
0bbd049
[SPARK-48591][PYTHON] Simplify the if-else branches with `F.lit`
zhengruifeng Jun 12, 2024
c059c84
[SPARK-48421][SQL] SPJ: Add documentation
szehon-ho Jun 12, 2024
3988548
[SPARK-48593][PYTHON][CONNECT] Fix the string representation of lambd…
zhengruifeng Jun 12, 2024
fd045c9
[SPARK-48583][SQL][TESTS] Replace deprecated classes and methods of `…
wayneguow Jun 13, 2024
ea2bca7
[SPARK-48602][SQL] Make csv generator support different output style …
yaooqinn Jun 13, 2024
78fd4e3
[SPARK-48584][SQL][FOLLOWUP] Improve the unescapePathName
beliefer Jun 13, 2024
b8c7aee
[SPARK-48609][BUILD] Upgrade `scala-xml` to 2.3.0
panbingkun Jun 13, 2024
bdcb79f
[SPARK-48543][SS] Track state row validation failures using explicit …
anishshri-db Jun 13, 2024
08e741b
[SPARK-48604][SQL] Replace deprecated `new ArrowType.Decimal(precisio…
wayneguow Jun 13, 2024
be154a3
[SPARK-48622][SQL] get SQLConf once when resolving column names
andrewxue-db Jun 14, 2024
70bdcc9
[MINOR][DOCS] Fix metrics info of shuffle service
Jun 14, 2024
0b214f1
[MINOR][DOCS][TESTS] Update repo name and link from `parquet-mr` to `…
wayneguow Jun 14, 2024
75fff90
[SPARK-45685][SQL][FOLLOWUP] Add handling for `Stream` where `LazyLis…
LuciferYang Jun 14, 2024
157b1e3
[SPARK-48612][SQL][SS] Cleanup deprecated api usage related to common…
LuciferYang Jun 14, 2024
3831886
[SPARK-48625][BUILD] Upgrade `mssql-jdbc` to 12.6.2.jre11
wayneguow Jun 14, 2024
878de00
[SPARK-48626][CORE] Change the scope of object LogKeys as private in …
gengliangwang Jun 14, 2024
dd8b05f
[SPARK-42252][CORE] Add `spark.shuffle.localDisk.file.output.buffer` …
wayneguow Jun 14, 2024
2d2bedf
[SPARK-48056][CONNECT][FOLLOW-UP] Scala Client re-execute plan if a S…
changgyoopark-db Jun 14, 2024
aa4bfb0
Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with `F.l…
zhengruifeng Jun 14, 2024
8ee8aba
[SPARK-48621][SQL] Fix Like simplification in Optimizer for collated …
uros-db Jun 14, 2024
0775ea7
[SPARK-48611][CORE] Log TID for input split in HadoopRDD and NewHadoo…
pan3793 Jun 15, 2024
347f9c6
[SPARK-48302][PYTHON] Preserve nulls in map columns in PyArrow Tables
ianmcook Jun 16, 2024
c09039a
[SPARK-48597][SQL] Introduce a marker for isStreaming property in tex…
HeartSaVioR Jun 17, 2024
9881e0a
[SPARK-47777] fix python streaming data source connect test
chaoqin-li1123 Jun 17, 2024
8f662fc
[SPARK-48555][SQL][PYTHON][CONNECT] Support using Columns as paramete…
Jun 17, 2024
33a9c5d
[MINOR][PYTHON][DOCS] Fix pyspark.sql.functions.reduce docstring typo
kaashif Jun 17, 2024
257a788
[SPARK-48615][SQL] Perf improvement for parsing hex string
yaooqinn Jun 17, 2024
42cd961
[SPARK-48587][VARIANT] Avoid storage amplification when accessing a s…
cashmand Jun 17, 2024
0a4b112
[SPARK-48633][BUILD] Upgrade `scalacheck` to 1.18.0
wayneguow Jun 17, 2024
71475f7
[SPARK-48577][SQL] Invalid UTF-8 byte sequence replacement
uros-db Jun 17, 2024
0c16624
[SPARK-48627][SQL] Perf improvement for binary to to HEX_DISCRETE string
yaooqinn Jun 17, 2024
90d302a
[SPARK-48557][SQL] Support scalar subquery with group-by on column eq…
jchen5 Jun 17, 2024
d3da240
[SPARK-48610][SQL] refactor: use auxiliary idMap instead of OP_ID_TAG
liuzqt Jun 17, 2024
e00d26f
[SPARK-48600][SQL] Fix FrameLessOffsetWindowFunction expressions impl…
mihailom-db Jun 17, 2024
d3455df
[SPARK-48572][SQL] Fix DateSub, DateAdd, WindowTime, TimeWindow and S…
mihailom-db Jun 17, 2024
9ef092f
[SPARK-48641][BUILD] Upgrade `curator` to 5.7.0
wayneguow Jun 17, 2024
8fdd85f
[SPARK-48603][TEST] Update *ParquetReadSchemaSuite to cover type wide…
pan3793 Jun 17, 2024
66d8a29
[SPARK-47577][SPARK-47579] Correct misleading usage of log key TASK_ID
gengliangwang Jun 17, 2024
0864bbe
[SPARK-48566][PYTHON] Fix bug where partition indices are incorrect w…
dtenedor Jun 17, 2024
00a96bb
[SPARK-48642][CORE] False SparkOutOfMemoryError caused by killing tas…
pan3793 Jun 17, 2024
d8a24b7
[SPARK-48645][BUILD] Upgrade Maven to 3.9.8
dongjoon-hyun Jun 17, 2024
042804a
[SPARK-48567][SS] StreamingQuery.lastProgress should return the actua…
WweiL Jun 17, 2024
f0b7cfa
[SPARK-48497][PYTHON][DOCS] Add an example for Python data source wri…
allisonwang-db Jun 17, 2024
e265c60
[SPARK-47910][CORE] close stream when DiskBlockObjectWriter closeReso…
JacobZheng0927 Jun 18, 2024
738acd1
[SPARK-48648][PYTHON][CONNECT] Make SparkConnectClient.tags properly …
HyukjinKwon Jun 18, 2024
05c87e5
[SPARK-48644][SQL] Do a length check and throw COLLECTION_SIZE_LIMIT_…
yaooqinn Jun 18, 2024
c5809b6
[SPARK-48647][PYTHON][CONNECT] Refine the error message for `YearMont…
zhengruifeng Jun 18, 2024
a3feffd
[SPARK-48585][SQL] Make `built-in` JdbcDialect's method `classifyExce…
panbingkun Jun 18, 2024
9898e9d
[SPARK-48342][SQL] Introduction of SQL Scripting Parser
davidm-db Jun 18, 2024
58701d8
[SPARK-47148][SQL][FOLLOWUP] Use broadcast hint to make test more stable
cloud-fan Jun 18, 2024
80bba44
[SPARK-48459][CONNECT][PYTHON] Implement DataFrameQueryContext in Spa…
HyukjinKwon Jun 18, 2024
47ffe40
[SPARK-48646][PYTHON] Refine Python data source API docstring and typ…
allisonwang-db Jun 19, 2024
6ee7c25
[SPARK-48634][PYTHON][CONNECT] Avoid statically initialize threadpool…
HyukjinKwon Jun 19, 2024
5e28e95
[SPARK-48649][SQL] Add "ignoreInvalidPartitionPaths" and "spark.sql.f…
sadikovi Jun 19, 2024
878dd6a
[SPARK-48601][SQL] Give a more user friendly error message when setti…
stevomitric Jun 19, 2024
b77caf7
[SPARK-48651][DOC] Configuring different JDK for Spark on YARN
pan3793 Jun 19, 2024
1e868b2
Revert "[SPARK-48554][INFRA] Use R 4.4.0 in `windows` R GitHub Action…
HyukjinKwon Jun 19, 2024
d067fc6
Revert "[SPARK-48567][SS] StreamingQuery.lastProgress should return t…
HyukjinKwon Jun 19, 2024
b0e2cb5
[SPARK-48623][CORE] Structured Logging Migrations
asl3 Jun 19, 2024
2fe0692
[SPARK-48466][SQL] Create dedicated node for EmptyRelation in AQE
liuzqt Jun 19, 2024
3ac31b1
[SPARK-48574][SQL] Fix support for StructTypes with collations
mihailom-db Jun 19, 2024
484e7ac
[SPARK-48472][SQL] Enable reflect expressions with collated strings
mihailoale-db Jun 19, 2024
5458763
[SPARK-48541][CORE] Add a new exit code for executors killed by TaskR…
bozhang2820 Jun 19, 2024
5d6e9dd
[SPARK-47986][CONNECT][FOLLOW-UP] Unable to create a new session when…
changgyoopark-db Jun 19, 2024
248fd4c
[SPARK-48342][FOLLOWUP][SQL] Remove unnecessary import in AstBuilder
davidm-db Jun 20, 2024
9eadb2c
[SPARK-48634][PYTHON][CONNECT][FOLLOW-UP] Do not make a request if th…
HyukjinKwon Jun 20, 2024
0d9f8a1
[SPARK-48479][SQL] Support creating scalar and table SQL UDFs in parser
allisonwang-db Jun 20, 2024
692d869
[SPARK-48591][PYTHON] Add a helper function to simplify `Column.py`
zhengruifeng Jun 20, 2024
955349f
[SPARK-48620][PYTHON] Fix internal raw data leak in `YearMonthInterva…
zhengruifeng Jun 20, 2024
714699b
[SPARK-47911][SQL][FOLLOWUP] Enable binary format tests in ThriftServ…
yaooqinn Jun 20, 2024
904d4dd
[SPARK-48635][SQL] Assign classes to join type errors and as-of join …
wayneguow Jun 21, 2024
b0b02b2
[SPARK-48653][PYTHON] Fix invalid Python data source error class refe…
allisonwang-db Jun 21, 2024
e68b8ca
[SPARK-48677][BUILD] Upgrade `scalafmt` to 3.8.2
panbingkun Jun 21, 2024
6eb7978
[SQL][TEST] Re-run collation benchmark
uros-db Jun 21, 2024
d8099a2
[SPARK-48479][SQL][FOLLOWUP] Consolidate createOrReplaceTableColType …
allisonwang-db Jun 21, 2024
67c7187
[SPARK-48661][BUILD] Upgrade `RoaringBitmap` to 1.1.0
wayneguow Jun 21, 2024
f077759
[SPARK-48631][CORE][TEST] Fix test "error during accessing host local…
bozhang2820 Jun 21, 2024
62bad53
[SPARK-48672][DOC] Update Jakarta Servlet reference in security page
pan3793 Jun 21, 2024
b99bb00
[SPARK-48630][INFRA] Make `merge_spark_pr` keep the format of revert PR
zhengruifeng Jun 21, 2024
3469ec6
[SPARK-48656][CORE] Do a length check and throw COLLECTION_SIZE_LIMIT…
wayneguow Jun 21, 2024
cd8bf11
[SPARK-48659][SQL][TESTS] Unify v1 and v2 ALTER TABLE .. SET TBLPROPE…
panbingkun Jun 21, 2024
fdabe08
[SPARK-48490][CORE][FOLLOWUP] Properly process escape sequences
gengliangwang Jun 21, 2024
b5d0d07
[SPARK-48662][SQL] Fix StructsToXml expression with collations
mihailom-db Jun 21, 2024
97d3add
Revert "Revert "[SPARK-48554][INFRA] Use R 4.4.0 in `windows` R GitHu…
HyukjinKwon Jun 21, 2024
32861e0
[SPARK-48684][INFRA] Print related JIRA summary before proceeding merge
yaooqinn Jun 21, 2024
f0563ef
[SPARK-47172][CORE] Add support for AES-GCM for RPC encryption
sweisdb Jun 21, 2024
0bc38ac
[SPARK-48675][SQL] Fix cache table with collated column
nikolamand-db Jun 21, 2024
9414211
[SPARK-48490][CORE][TESTS][FOLLOWUP] Add some UT for the Windows path…
panbingkun Jun 21, 2024
7e5a461
[SPARK-48655][SQL] SPJ: Add tests for shuffle skipping for aggregate …
szehon-ho Jun 21, 2024
b1677a4
[SPARK-48545][SQL] Create to_avro and from_avro SQL functions to matc…
dtenedor Jun 21, 2024
c8d75c1
[SPARK-48620][PYTHON][FOLLOW-UP] Correct the error message for `Calen…
zhengruifeng Jun 22, 2024
84d278c
[MINOR] Fix some typos in `error-states.json`
wayneguow Jun 23, 2024
4b37eb8
[SPARK-48678][CORE] Performance optimizations for SparkConf.get(Confi…
JoshRosen Jun 23, 2024
e972dae
[SPARK-48688][SQL] Return reasonable error when calling SQL to_avro a…
dtenedor Jun 23, 2024
88cc153
[SPARK-48650][PYTHON] Display correct call site from IPython Notebook
itholic Jun 24, 2024
31fa9d8
[SQL][TEST][FOLLOWUP] Re-run collation benchmark (NonASCII)
uros-db Jun 24, 2024
4663b84
[SPARK-47681][FOLLOWUP] Fix schema_of_variant for float inputs
chenhao-db Jun 24, 2024
a7dc020
[SPARK-48681][SQL] Use ICU in Lower/Upper expressions for UTF8_BINARY…
uros-db Jun 24, 2024
8b16196
[SPARK-48680][SQL][DOCS] Add missing Java APIs and language-specific …
yaooqinn Jun 24, 2024
e459674
[SPARK-48683][SQL] Fix schema evolution with `df.mergeInto` losing `w…
xupefei Jun 24, 2024
09cb592
[SPARK-48639][CONNECT][PYTHON] Add Origin to Relation.RelationCommon
HyukjinKwon Jun 24, 2024
8e02a64
[SPARK-48695][PYTHON] `TimestampNTZType.fromInternal` not use the dep…
zhengruifeng Jun 24, 2024
fb5697d
[SPARK-48658][SQL] Encode/Decode functions report coding errors inste…
yaooqinn Jun 24, 2024
2ac2710
[SPARK-48702][INFRA] Fix `Python CodeGen check`
panbingkun Jun 25, 2024
5112e58
[SPARK-48692][BUILD] Upgrade `rocksdbjni` to 9.2.1
panbingkun Jun 25, 2024
d47f34f
[SPARK-48629] Migrate the residual code to structured logging framework
panbingkun Jun 25, 2024
b49479b
[SPARK-48704][INFRA] Update `build_sparkr_window.yml` to use `windows…
panbingkun Jun 25, 2024
51f1103
[SPARK-48686][SQL] Improve performance of ParserUtils.unescapeSQLString
JoshRosen Jun 25, 2024
8c4ca7e
[SPARK-48693][SQL] Simplify and unify toString of Invoke and StaticIn…
yaooqinn Jun 25, 2024
068be4b
[SPARK-48578][SQL] add UTF8 string validation related functions
uros-db Jun 25, 2024
5928908
[SPARK-48466][SQL][FOLLOWUP] Fix missing pattern match in EmptyRelati…
liuzqt Jun 25, 2024
9d4abaf
[SPARK-48638][CONNECT] Add ExecutionInfo support for DataFrame
grundprinzip Jun 25, 2024
ebacb91
[SPARK-48718][SQL] Handle and fix the case when deserializer in cogro…
anchovYu Jun 26, 2024
c459afb
[SPARK-48573][SQL] Upgrade ICU version
mihailom-db Jun 26, 2024
169346c
[SPARK-48578][SQL][FOLLOWUP] Fix `dev/scalastyle` error for `Expressi…
panbingkun Jun 26, 2024
07cbba6
[SPARK-48706][PYTHON] Python UDF in higher order functions should not…
HyukjinKwon Jun 26, 2024
ee3a612
[SPARK-48705][PYTHON] Explicitly use worker_main when it starts with …
HyukjinKwon Jun 26, 2024
bb21861
[SPARK-48059][CORE][FOLLOWUP] Fix bug for SparkLogger
panbingkun Jun 26, 2024
ec0ee86
[SPARK-48699][SQL] Refine collation API
uros-db Jun 26, 2024
7a1608b
[SPARK-48670][SQL] Providing suggestion as part of error message when…
dbatomic Jun 26, 2024
0fc5b0b
[SPARK-47353][SQL] Enable collation support for the Mode expression
GideonPotok Jun 26, 2024
4cf5450
[SPARK-48699][SQL][FOLLOWUP] Refine collation API
uros-db Jun 26, 2024
313479c
[SPARK-48713][SQL] Add index range check for UnsafeRow.pointTo when b…
Ngone51 Jun 26, 2024
e23d69b
[SPARK-48709][SQL] Fix varchar type resolution mismatch for DataSourc…
wangyum Jun 26, 2024
a474b88
[SPARK-48724][SQL][TESTS] Fix incorrect conf settings of `ignoreCorru…
wayneguow Jun 26, 2024
a50b30d
[SPARK-48687][SS] Add change to perform state schema validation and u…
anishshri-db Jun 26, 2024
7f5f96c
[SPARK-48691][BUILD] Upgrade scalatest related dependencies to the 3.…
wayneguow Jun 26, 2024
b47c614
writing schema
ericm-db Jun 25, 2024
c238e70
commenting
ericm-db Jun 25, 2024
acd8504
removing TODO
ericm-db Jun 26, 2024
de30c7a
tws tests pass
ericm-db Jun 26, 2024
e9d4fcc
added test case, serializing list of columnFamilyMetadata instead of …
ericm-db Jun 26, 2024
998a019
adding test case
ericm-db Jun 26, 2024
062f955
comment
ericm-db Jun 26, 2024
723f23c
adding purging logic
ericm-db Jun 26, 2024
32e73d0
added test case for purging
ericm-db Jun 26, 2024
1581264
[SPARK-48717][PYTHON][SS] Catch ForeachBatch py4j InterruptedExceptio…
WweiL Jun 27, 2024
906af78
[SPARK-48578][SQL][FOLLOWUP] add UTF8 string validation related funct…
uros-db Jun 27, 2024
b5f76b1
[SPARK-48723][INFRA] Run `git cherry-pick --abort` if backporting is …
yaooqinn Jun 27, 2024
14272e8
[SPARK-48721][SQL][DOCS] Fix the doc of `decode` function in SQL API …
yaooqinn Jun 27, 2024
ec7dde7
[SPARK-42610][CONNECT][FOLLOWUP] Add some test cases for `SQLImplicit…
wayneguow Jun 27, 2024
ea0cd01
[SPARK-39627][SQL][FOLLOWUP] Cleanup deprecated api usage related to …
wayneguow Jun 27, 2024
7c7c196
[SPARK-48712][SQL] Perf Improvement for encode with empty values or U…
yaooqinn Jun 27, 2024
48f39b8
[SPARK-48729][SQL] Add a UserDefinedFunction interface to represent a…
allisonwang-db Jun 27, 2024
2e31572
Revert "[SPARK-48639][CONNECT][PYTHON] Add Origin to Relation.Relatio…
HyukjinKwon Jun 27, 2024
b154623
[MINOR][TESTS] Always remove spark.master in ReusedConnectTestCase
HyukjinKwon Jun 27, 2024
58d1a89
[SPARK-48555][PYTHON][FOLLOW-UP] Simplify the support of `Any` parame…
zhengruifeng Jun 27, 2024
ff2d177
[MINOR][DOCS] Make pivot doctest deterministic
HyukjinKwon Jun 27, 2024
5943905
[MINOR][PYTHON][TESTS] Remove duplicate schema checking
HyukjinKwon Jun 27, 2024
642c4bb
[SPARK-48639][CONNECT][PYTHON] Add Origin to RelationCommon
HyukjinKwon Jun 27, 2024
5b53c6c
[MINOR][PYTHON][DOCS] Fix indents in function API references
zhengruifeng Jun 27, 2024
c788d12
[SPARK-48733][PYTHON][TESTS] Do not test SET command in Python UDTF test
HyukjinKwon Jun 27, 2024
7bf9119
[SPARK-48734][PYTHON][TESTS] Separate local cluster test from test_ar…
HyukjinKwon Jun 27, 2024
d89aad3
[SPARK-47927][SQL][FOLLOWUP] fix ScalaUDF output nullability
cloud-fan Jun 27, 2024
b11608c
[SPARK-48428][SQL] Fix IllegalStateException in NestedColumnAliasing
eejbyfeldt Jun 27, 2024
b5a55e4
[SPARK-46957][CORE] Decommission migrated shuffle files should be abl…
Ngone51 Jun 27, 2024
40ad829
[SPARK-48586][SS] Remove lock acquisition in doMaintenance() by makin…
riyaverm-db Jun 27, 2024
baf461b
[SPARK-48708][CORE] Remove three unnecessary type registrations from …
LuciferYang Jun 27, 2024
df13ca0
[SPARK-48735][SQL] Performance Improvement for BIN function
yaooqinn Jun 27, 2024
1cdd5fa
[SPARK-48736][PYTHON] Support infra fro additional includes for Pytho…
grundprinzip Jun 27, 2024
2c9eb1c
[SPARK-48682][SQL] Use ICU in InitCap expression for UTF8_BINARY strings
uros-db Jun 28, 2024
6304484
[SPARK-48282][SQL] Alter string search logic for UTF8_BINARY_LCASE co…
uros-db Jun 28, 2024
8cd095f
[SPARK-48738][SQL] Correct since version for built-in func alias `ran…
wayneguow Jun 28, 2024
3bf7de0
[SPARK-48668][SQL] Support ALTER NAMESPACE ... UNSET PROPERTIES in v2
panbingkun Jun 28, 2024
2eeebef
[SPARK-46957][CORE][FOLLOW-UP] Use Collections.emptyMap for Java comp…
Ngone51 Jun 28, 2024
9141aa4
[SPARK-48744][CORE] Log entry should be constructed only once
gengliangwang Jun 28, 2024
a2c4be0
[SPARK-47233][CONNECT][SS][FOLLOW-UP] Add eventually for terminated e…
HyukjinKwon Jun 28, 2024
69f3a9b
[SPARK-48586][SS][FOLLOWUP] RocksDB and RocksDBFileManager code style…
riyaverm-db Jun 28, 2024
fc98ccd
[SPARK-48746][PYTHON][SS][TESTS] Avoid using global temp view in fore…
HyukjinKwon Jun 28, 2024
80277ee
[SPARK-48745][INFRA][PYTHON][TESTS] Remove unnecessary installation `…
panbingkun Jun 28, 2024
4e57f06
[SPARK-48307][SQL][FOLLOWUP] not-inlined CTE references sibling shoul…
cloud-fan Jun 28, 2024
6bfeb09
[SPARK-48757][CORE] Make `IndexShuffleBlockResolver` have explicit co…
dongjoon-hyun Jun 28, 2024
f49418b
[SPARK-48751][INFRA][PYTHON][TESTS] Re-balance `pyspark-pandas-connec…
panbingkun Jun 30, 2024
0487d78
[SPARK-48748][SQL] Cache numChars in UTF8String
uros-db Jul 1, 2024
a84a6a4
[SPARK-48749][SQL] Simplify UnaryPositive and eliminate its Catalyst …
yaooqinn Jul 1, 2024
f70ce13
[SPARK-48638][INFRA][FOLLOW-UP] Add graphviz into CI to run the relat…
HyukjinKwon Jul 1, 2024
399980e
[MINOR][DOCS] Fix the type hints of `functions.first(..., ignorenulls…
HyukjinKwon Jul 1, 2024
bc16b24
[SPARK-48765][DEPLOY] Enhance default value evaluation for SPARK_IDEN…
pan3793 Jul 1, 2024
cd1f1af
[SPARK-48747][SQL] Add code point iterator to UTF8String
uros-db Jul 1, 2024
48eb4d5
[SPARK-48737] Perf improvement during analysis - Create exception onl…
urosstan-db Jul 1, 2024
703b076
[SPARK-48697][SQL] Add collation aware string filters
stefankandic Jul 1, 2024
bab129d
combining rules
ericm-db Jul 1, 2024
9b94439
passing in encoders to columnfamilyschema
ericm-db Jul 1, 2024
5c29d8d
[SPARK-48768][PYTHON][CONNECT] Should not cache `explain`
zhengruifeng Jul 1, 2024
5ac7c9b
[SPARK-48766][PYTHON] Document the behavior difference of `extraction…
zhengruifeng Jul 1, 2024
6768eea
Feedback
ericm-db Jul 2, 2024
afb5e39
added base class
ericm-db Jul 2, 2024
d515740
rebase
ericm-db Jul 2, 2024
a1288a4
refactors
ericm-db Jul 2, 2024
a670585
comment
ericm-db Jul 2, 2024
9304223
[SPARK-44718][FOLLOWUP][DOCS] Avoid using ConfigEntry in spark.sql.co…
yaooqinn Jul 2, 2024
8a5f4e0
[SPARK-48759][SQL] Add migration doc for CREATE TABLE AS SELECT behav…
asl3 Jul 2, 2024
4ee37ed
[SPARK-48764][PYTHON] Filtering out IPython-related frames from user …
itholic Jul 2, 2024
353c2da
feedback, creating ColumnFamilySchemaFactory
ericm-db Jul 2, 2024
f471bfe
adding version in tws
ericm-db Jul 2, 2024
db9e1ac
[SPARK-48177][BUILD] Upgrade `Apache Parquet` to 1.14.1
Fokko Jul 2, 2024
49fece8
added override modifier
ericm-db Jul 2, 2024
7ad352f
feedback
ericm-db Jul 2, 2024
ee0d306
[SPARK-48589][SQL][SS] Add option snapshotStartBatchId and snapshotPa…
eason-yuchen-liu Jul 2, 2024
efe2e74
Merge branch 'master' into state-schema-tws
ericm-db Jul 2, 2024
3ea5d29
using tempdir
ericm-db Jul 2, 2024
5eee250
using map instead of list
ericm-db Jul 2, 2024
dce9968
removing purging
ericm-db Jul 2, 2024
9b1a7b2
tests pass
ericm-db Jul 3, 2024
f2857b9
combining rules
ericm-db Jul 3, 2024
4337016
removing metadataCacheEnabled
ericm-db Jul 3, 2024
dfb122f
removing COLUMN_FAMILY_SCHEMA_VERSION
ericm-db Jul 3, 2024
21af247
reverting hdfs metadata log changes
ericm-db Jul 3, 2024
3a36d06
feedback
ericm-db Jul 4, 2024
1b6ea1a
removing unused imports
ericm-db Jul 4, 2024
1df1e5d
line
ericm-db Jul 4, 2024
25b7b80
case match
ericm-db Jul 8, 2024
ede3136
feedback
ericm-db Jul 9, 2024
0a7945e
adding todos
ericm-db Jul 9, 2024
c1a041d
adding PR link
ericm-db Jul 9, 2024
feb4c01
removing batchId as dir
ericm-db Jul 9, 2024
c24490a
removing links
ericm-db Jul 10, 2024
b250aa4
sparkthrowablesuite
ericm-db Jul 10, 2024
The diff you're trying to view is too large. We only load the first 3000 changed files.
6 changes: 2 additions & 4 deletions .github/workflows/benchmark.yml
@@ -69,12 +69,11 @@ jobs:
# In order to get diff files
with:
fetch-depth: 0
- name: Cache Scala, SBT and Maven
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
@@ -137,12 +136,11 @@ jobs:
# In order to get diff files
with:
fetch-depth: 0
- name: Cache Scala, SBT and Maven
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
279 changes: 166 additions & 113 deletions .github/workflows/build_and_test.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .github/workflows/build_branch34.yml
@@ -43,9 +43,9 @@ jobs:
jobs: >-
{
"build": "true",
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true",
"k8s-integration-tests": "true",
"lint" : "true"
}
@@ -17,22 +17,29 @@
# under the License.
#

name: Cancelling Duplicates
name: "Build / Python-only (branch-3.4)"

on:
workflow_run:
workflows:
- 'Build'
types: ['requested']
schedule:
- cron: '0 9 * * *'

jobs:
cancel-duplicate-workflow-runs:
name: "Cancel duplicate workflow runs"
runs-on: ubuntu-latest
steps:
- uses: potiuk/cancel-workflow-runs@4723494a065d162f8e9efd071b98e0126e00f866 # @master
name: "Cancel duplicate workflow runs"
with:
cancelMode: allDuplicates
token: ${{ secrets.GITHUB_TOKEN }}
sourceRunId: ${{ github.event.workflow_run.id }}
skipEventTypes: '["push", "schedule"]'
run-build:
permissions:
packages: write
name: Run
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
java: 8
branch: branch-3.4
hadoop: hadoop3
envs: >-
{
"PYTHON_TO_TEST": ""
}
jobs: >-
{
"pyspark": "true",
"pyspark-pandas": "true"
}
2 changes: 1 addition & 1 deletion .github/workflows/build_branch35.yml
@@ -43,9 +43,9 @@ jobs:
jobs: >-
{
"build": "true",
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true",
"k8s-integration-tests": "true",
"lint" : "true"
}
45 changes: 45 additions & 0 deletions .github/workflows/build_branch35_python.yml
@@ -0,0 +1,45 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: "Build / Python-only (branch-3.5)"

on:
schedule:
- cron: '0 11 * * *'

jobs:
run-build:
permissions:
packages: write
name: Run
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
java: 8
branch: branch-3.5
hadoop: hadoop3
envs: >-
{
"PYTHON_TO_TEST": ""
}
jobs: >-
{
"pyspark": "true",
"pyspark-pandas": "true"
}
6 changes: 5 additions & 1 deletion .github/workflows/build_java21.yml
@@ -46,5 +46,9 @@ jobs:
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true"
"docker-integration-tests": "true",
"yarn": "true",
"k8s-integration-tests": "true",
"buf": "true",
"ui": "true"
}
2 changes: 1 addition & 1 deletion .github/workflows/build_maven_java21_macos14.yml
@@ -21,7 +21,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, macos-14)"

on:
schedule:
- cron: '0 20 * * *'
- cron: '0 20 */2 * *'

jobs:
run-build:
@@ -17,11 +17,11 @@
# under the License.
#

name: "Build / ANSI (master, Hadoop 3, JDK 17, Scala 2.13)"
name: "Build / Non-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)"

on:
schedule:
- cron: '0 1,13 * * *'
- cron: '0 1 * * *'

jobs:
run-build:
@@ -36,13 +36,15 @@ jobs:
hadoop: hadoop3
envs: >-
{
"SPARK_ANSI_SQL_MODE": "true",
"SPARK_ANSI_SQL_MODE": "false",
}
jobs: >-
{
"build": "true",
"docs": "true",
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true"
"docker-integration-tests": "true",
"yarn": "true"
}
45 changes: 45 additions & 0 deletions .github/workflows/build_python_3.10.yml
@@ -0,0 +1,45 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: "Build / Python-only (master, Python 3.10)"

on:
schedule:
- cron: '0 17 * * *'

jobs:
run-build:
permissions:
packages: write
name: Run
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
java: 17
branch: master
hadoop: hadoop3
envs: >-
{
"PYTHON_TO_TEST": "python3.10"
}
jobs: >-
{
"pyspark": "true",
"pyspark-pandas": "true"
}
45 changes: 45 additions & 0 deletions .github/workflows/build_python_3.12.yml
@@ -0,0 +1,45 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: "Build / Python-only (master, Python 3.12)"

on:
schedule:
- cron: '0 19 * * *'

jobs:
run-build:
permissions:
packages: write
name: Run
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
java: 17
branch: master
hadoop: hadoop3
envs: >-
{
"PYTHON_TO_TEST": "python3.12"
}
jobs: >-
{
"pyspark": "true",
"pyspark-pandas": "true"
}
47 changes: 33 additions & 14 deletions .github/workflows/build_python_connect.yml
@@ -33,12 +33,11 @@ jobs:
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
- name: Cache Scala, SBT and Maven
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
@@ -63,16 +62,16 @@
architecture: x64
- name: Build Spark
run: |
./build/sbt -Phive test:package
./build/sbt -Phive Test/package
- name: Install pure Python package (pyspark-connect)
env:
SPARK_TESTING: 1
run: |
cd python
python packaging/connect/setup.py sdist
cd dist
pip install pyspark-connect-*.tar.gz
pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' torch torchvision torcheval deepspeed unittest-xml-reporting
pip install pyspark*connect-*.tar.gz
pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' torch torchvision torcheval deepspeed unittest-xml-reporting
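The widened glob `pyspark*connect-*.tar.gz` accounts for newer setuptools applying PEP 625 file-name normalization, which writes the sdist as `pyspark_connect-<ver>.tar.gz` instead of `pyspark-connect-<ver>.tar.gz`; the pattern matches both spellings. A sketch with made-up version strings:

```shell
# Older setuptools writes pyspark-connect-<ver>.tar.gz; newer versions apply
# PEP 625 normalization and write pyspark_connect-<ver>.tar.gz.
# The glob pyspark*connect-*.tar.gz matches both. Version strings are made up.
workdir=$(mktemp -d)
cd "$workdir"
touch pyspark-connect-4.0.0.dev0.tar.gz pyspark_connect-4.0.0.dev0.tar.gz
matches=$(ls pyspark*connect-*.tar.gz | wc -l)
echo "$matches"
```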
- name: Run tests
env:
SPARK_TESTING: 1
@@ -81,26 +80,46 @@
# Make less noisy
cp conf/log4j2.properties.template conf/log4j2.properties
sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
# Start a Spark Connect server
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" --jars `find connector/connect/server/target -name spark-connect*SNAPSHOT.jar`
# Run the Python workers that contain pyspark.core once up front. They will be reused.
python -c "from pyspark.sql import SparkSession; _ = SparkSession.builder.remote('sc://localhost').getOrCreate().range(100).repartition(100).mapInPandas(lambda x: x, 'id INT').collect()"

# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
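`--jars` takes a comma-separated list, so each `find` result above is spliced in via command substitution with a literal comma between them. A sketch of the same list construction using `$(...)` (the workflow uses backticks; the directory layout and jar names here are stand-ins for the real build tree):

```shell
# Build a comma-separated --jars list from several find results.
# Directory layout and jar names are hypothetical stand-ins.
root=$(mktemp -d)
mkdir -p "$root/server/target" "$root/protobuf/target"
touch "$root/server/target/spark-connect-4.0.0-SNAPSHOT.jar"
touch "$root/protobuf/target/spark-protobuf-4.0.0-SNAPSHOT.jar"
jars="$(find "$root/server/target" -name 'spark-connect-*SNAPSHOT.jar'),$(find "$root/protobuf/target" -name 'spark-protobuf-*SNAPSHOT.jar')"
echo "$jars"
```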

# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
rm python/lib/*
rm -r python/pyspark
mv python/lib lib.back
mv python/pyspark pyspark.back

# Several catalog-related tests must run sequentially, e.g., writing a table in a listener.
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
# None of the Pandas API on Spark tests depend on each other, so run them in parallel
./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3

# Stop Spark Connect server.
./sbin/stop-connect-server.sh
mv lib.back python/lib
mv pyspark.back python/pyspark
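The rename dance above hides the JVM-backed `python/lib` and `python/pyspark` so that only the pure `pyspark-connect` install is importable during the test run, then restores them afterwards. A minimal sketch of that hide/run/restore pattern in a throwaway directory:

```shell
# Hide the JVM-backed packages, run pure-Python tests, then restore them.
# Uses a scratch directory; the real workflow does this in the repo root.
work=$(mktemp -d)
cd "$work"
mkdir -p python/lib python/pyspark
mv python/lib lib.back
mv python/pyspark pyspark.back
[ ! -d python/lib ] && [ ! -d python/pyspark ] && echo "pyspark hidden"
# ... run the pure-Python tests here ...
mv lib.back python/lib
mv pyspark.back python/pyspark
```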

# Start a Spark Connect server for local-cluster
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--master "local-cluster[2, 4, 1024]" \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"

# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
mv python/pyspark pyspark.back

./python/run-tests --parallelism=1 --python-executables=python3 --testnames "pyspark.resource.tests.test_connect_resources,pyspark.sql.tests.connect.client.test_artifact,pyspark.sql.tests.connect.client.test_artifact_localcluster,pyspark.sql.tests.connect.test_resources"
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-spark-connect-python-only
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
- name: Upload Spark Connect server log file
if: ${{ !success() }}
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-spark-connect-python-only
path: "**/target/unit-tests.log"
path: logs/*.out