Merge pull request #1641 from apache/master
GulajavaMinistudio authored May 3, 2024
2 parents e1c3b28 + aaf3995 commit 0fd5d4d
Showing 331 changed files with 11,930 additions and 8,734 deletions.
94 changes: 17 additions & 77 deletions .github/workflows/build_and_test.yml
@@ -76,24 +76,22 @@ jobs:
id: set-outputs
run: |
if [ -z "${{ inputs.jobs }}" ]; then
pyspark=true; sparkr=true; tpcds=true; docker=true;
pyspark=true; sparkr=true;
pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
pyspark=`./dev/is-changed.py -m $pyspark_modules`
sparkr=`./dev/is-changed.py -m sparkr`
tpcds=`./dev/is-changed.py -m sql`
docker=`./dev/is-changed.py -m docker-integration-tests`
# 'build' and 'maven-build' are always true for now.
kubernetes=`./dev/is-changed.py -m kubernetes`
# 'build' is always true for now.
# It does not save significant time and most of PRs trigger the build.
precondition="
{
\"build\": \"true\",
\"pyspark\": \"$pyspark\",
\"sparkr\": \"$sparkr\",
\"tpcds-1g\": \"$tpcds\",
\"docker-integration-tests\": \"$docker\",
\"maven-build\": \"true\",
\"tpcds-1g\": \"false\",
\"docker-integration-tests\": \"false\",
\"lint\" : \"true\",
\"k8s-integration-tests\" : \"true\",
\"k8s-integration-tests\" : \"$kubernetes\",
\"buf\" : \"true\",
\"ui\" : \"true\",
}"
@@ -123,7 +121,7 @@ jobs:
needs: precondition
if: fromJson(needs.precondition.outputs.required).build == 'true'
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
strategy:
fail-fast: false
matrix:
@@ -193,6 +191,7 @@ jobs:
HIVE_PROFILE: ${{ matrix.hive }}
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
NOLINT_ON_COMPILE: true
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
@@ -333,7 +332,7 @@ jobs:
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true'
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
strategy:
@@ -364,7 +363,7 @@
pyspark-pandas-connect-part3
env:
MODULES_TO_TEST: ${{ matrix.modules }}
PYTHON_TO_TEST: 'python3.9'
PYTHON_TO_TEST: 'python3.11'
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
@@ -480,7 +479,7 @@ jobs:
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true'
name: "Build modules: sparkr"
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
env:
@@ -602,10 +601,11 @@ jobs:
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true'
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
NOLINT_ON_COMPILE: false
PYSPARK_DRIVER_PYTHON: python3.9
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
@@ -786,74 +786,14 @@ jobs:
path: site.tar.bz2
retention-days: 1

maven-build:
needs: precondition
if: fromJson(needs.precondition.outputs.required).maven-build == 'true'
name: Java ${{ matrix.java }} build with Maven (${{ matrix.os }})
strategy:
fail-fast: false
matrix:
include:
- java: 17
os: ubuntu-latest
- java: 21
os: ubuntu-latest
- java: 21
os: macos-14
runs-on: ${{ matrix.os }}
timeout-minutes: 300
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v4
with:
path: ~/.m2/repository
key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
java${{ matrix.java }}-maven-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
export JAVA_VERSION=${{ matrix.java }}
# It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
rm -rf ~/.m2/repository/org/apache/spark
# Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well
tpcds-1g:
needs: precondition
if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
name: Run TPC-DS queries with SF=1
# Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation
runs-on: ubuntu-20.04
timeout-minutes: 300
timeout-minutes: 180
env:
SPARK_LOCAL_IP: localhost
steps:
@@ -954,7 +894,7 @@ jobs:
if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
name: Run Docker integration tests
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
env:
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
@@ -1022,7 +962,7 @@ jobs:
if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
name: Run Spark on Kubernetes Integration test
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -1094,7 +1034,7 @@ jobs:
if: fromJson(needs.precondition.outputs.required).ui == 'true'
name: Run Spark UI tests
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
steps:
- uses: actions/checkout@v4
- name: Use Node.js
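For context, the precondition job changed above publishes its JSON map of flags as a job output, and every downstream job in this file gates itself on that map via fromJson(needs.precondition.outputs.required). A minimal sketch of that gating pattern under simplified assumptions (illustrative job names, a single flag, and an echo into $GITHUB_OUTPUT that the excerpt above does not show):

jobs:
  precondition:
    runs-on: ubuntu-latest
    outputs:
      required: ${{ steps.set-outputs.outputs.required }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Check required jobs
        id: set-outputs
        run: |
          # Run the Kubernetes suite only when related modules changed (illustrative)
          kubernetes=`./dev/is-changed.py -m kubernetes`
          precondition="{\"build\": \"true\", \"k8s-integration-tests\": \"$kubernetes\"}"
          echo "required=$precondition" >> "$GITHUB_OUTPUT"
  k8s-integration-tests:
    needs: precondition
    if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "Kubernetes integration tests would run here"

With this shape, flipping a value in the precondition JSON (for example hard-coding "tpcds-1g": "false", as the change above does) is enough to skip the corresponding job without touching the job definition itself.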
1 change: 1 addition & 0 deletions .github/workflows/build_branch34.yml
@@ -47,5 +47,6 @@ jobs:
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true",
"k8s-integration-tests": "true",
"lint" : "true"
}
1 change: 1 addition & 0 deletions .github/workflows/build_branch35.yml
@@ -47,5 +47,6 @@ jobs:
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true",
"k8s-integration-tests": "true",
"lint" : "true"
}
3 changes: 2 additions & 1 deletion .github/workflows/build_java21.yml
@@ -46,5 +46,6 @@ jobs:
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true"
"docker-integration-tests": "true",
"k8s-integration-tests": "true"
}
2 changes: 1 addition & 1 deletion .github/workflows/build_maven_java21_macos14.yml
@@ -21,7 +21,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, macos-14)"

on:
schedule:
- cron: '0 20 * * *'
- cron: '0 20 */2 * *'

jobs:
run-build:
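For reference on the schedule tweak above: '0 20 * * *' triggered this macOS Maven build nightly at 20:00 UTC, while '0 20 */2 * *' fires only on every second day of the month (the 1st, 3rd, 5th, and so on), roughly halving the cadence; the NON-ANSI and snapshot schedules further down are thinned out in the same way. A minimal annotated trigger, assuming the standard POSIX cron semantics GitHub Actions uses:

on:
  schedule:
    # field order: minute hour day-of-month month day-of-week
    - cron: '0 20 */2 * *'   # 20:00 UTC on odd-numbered days of the month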
2 changes: 1 addition & 1 deletion .github/workflows/build_non_ansi.yml
@@ -21,7 +21,7 @@ name: "Build / NON-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)"

on:
schedule:
- cron: '0 1,13 * * *'
- cron: '0 1 * * *'

jobs:
run-build:
6 changes: 0 additions & 6 deletions .github/workflows/build_python_connect.yml
@@ -86,9 +86,6 @@ jobs:
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Make sure running Python workers that contains pyspark.core once. They will be reused.
python -c "from pyspark.sql import SparkSession; _ = SparkSession.builder.remote('sc://localhost').getOrCreate().range(100).repartition(100).mapInPandas(lambda x: x, 'id INT').collect()"
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
mv python/pyspark pyspark.back
@@ -109,9 +106,6 @@ jobs:
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Make sure running Python workers that contains pyspark.core once. They will be reused.
python -c "from pyspark.sql import SparkSession; _ = SparkSession.builder.remote('sc://localhost').getOrCreate().range(100).repartition(100).mapInPandas(lambda x: x, 'id INT').show(n=100)" > /dev/null
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
mv python/pyspark lib.back
113 changes: 113 additions & 0 deletions .github/workflows/build_python_connect35.yml
@@ -0,0 +1,113 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Build / Spark Connect Python-only (master-server, 35-client, Python 3.11)

on:
schedule:
- cron: '0 21 * * *'

jobs:
# Build: build Spark and run the tests for specified modules using SBT
build:
name: "Build modules: pyspark-connect"
runs-on: ubuntu-latest
timeout-minutes: 100
if: github.repository == 'apache/spark'
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-spark-connect-python-only-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: coursier-build-spark-connect-python-only-${{ hashFiles('**/pom.xml') }}
restore-keys: |
coursier-build-spark-connect-python-only-
- name: Install Java 17
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 17
- name: Install Python 3.11
uses: actions/setup-python@v5
with:
python-version: '3.11'
architecture: x64
- name: Build Spark
run: |
./build/sbt -Phive Test/package
- name: Install Python dependencies
run: |
pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
# Add Python deps for Spark Connect.
pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'
# Add torch as a testing dependency for TorchDistributor
pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
- name: Run tests
env:
SPARK_TESTING: 1
SPARK_SKIP_JVM_REQUIRED_TESTS: 1
SPARK_CONNECT_TESTING_REMOTE: sc://localhost
run: |
# Make less noisy
cp conf/log4j2.properties.template conf/log4j2.properties
sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Checkout to branch-3.5 to use the tests in branch-3.5.
cd ..
git clone --single-branch --branch branch-3.5 $GITHUB_SERVER_URL/$GITHUB_REPOSITORY spark-3.5
cd spark-3.5
# Several tests related to the catalog need to run sequentially, e.g., writing a table in a listener.
# Run branch-3.5 tests
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
# None of the tests in Pandas API on Spark depend on each other, so run them in parallel
./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-spark-connect-python-only
path: "**/target/test-reports/*.xml"
- name: Upload Spark Connect server log file
if: failure()
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-spark-connect-python-only
path: logs/*.out
4 changes: 1 addition & 3 deletions .github/workflows/build_rockdb_as_ui_backend.yml
@@ -42,7 +42,5 @@ jobs:
{
"build": "true",
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true"
"sparkr": "true"
}
2 changes: 1 addition & 1 deletion .github/workflows/publish_snapshot.yml
@@ -21,7 +21,7 @@ name: Publish Snapshot

on:
schedule:
- cron: '0 0,12 * * *'
- cron: '0 0 * * *'
workflow_dispatch:
inputs:
branch: