Merge pull request #1641 from apache/master
GulajavaMinistudio authored May 3, 2024
2 parents e1c3b28 + aaf3995 commit 0fd5d4d
Showing 331 changed files with 11,930 additions and 8,734 deletions.
94 changes: 17 additions & 77 deletions .github/workflows/build_and_test.yml
@@ -76,24 +76,22 @@ jobs:
id: set-outputs
run: |
if [ -z "${{ inputs.jobs }}" ]; then
pyspark=true; sparkr=true; tpcds=true; docker=true;
pyspark=true; sparkr=true;
pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
pyspark=`./dev/is-changed.py -m $pyspark_modules`
sparkr=`./dev/is-changed.py -m sparkr`
tpcds=`./dev/is-changed.py -m sql`
docker=`./dev/is-changed.py -m docker-integration-tests`
# 'build' and 'maven-build' are always true for now.
kubernetes=`./dev/is-changed.py -m kubernetes`
# 'build' is always true for now.
# It does not save significant time and most of PRs trigger the build.
precondition="
{
\"build\": \"true\",
\"pyspark\": \"$pyspark\",
\"sparkr\": \"$sparkr\",
\"tpcds-1g\": \"$tpcds\",
\"docker-integration-tests\": \"$docker\",
\"maven-build\": \"true\",
\"tpcds-1g\": \"false\",
\"docker-integration-tests\": \"false\",
\"lint\" : \"true\",
\"k8s-integration-tests\" : \"true\",
\"k8s-integration-tests\" : \"$kubernetes\",
\"buf\" : \"true\",
\"ui\" : \"true\",
}"
@@ -123,7 +121,7 @@ jobs:
needs: precondition
if: fromJson(needs.precondition.outputs.required).build == 'true'
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
strategy:
fail-fast: false
matrix:
@@ -193,6 +191,7 @@ jobs:
HIVE_PROFILE: ${{ matrix.hive }}
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
NOLINT_ON_COMPILE: true
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
@@ -333,7 +332,7 @@ jobs:
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true'
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
strategy:
@@ -364,7 +363,7 @@
pyspark-pandas-connect-part3
env:
MODULES_TO_TEST: ${{ matrix.modules }}
PYTHON_TO_TEST: 'python3.9'
PYTHON_TO_TEST: 'python3.11'
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
@@ -480,7 +479,7 @@ jobs:
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true'
name: "Build modules: sparkr"
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
env:
@@ -602,10 +601,11 @@ jobs:
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true'
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
NOLINT_ON_COMPILE: false
PYSPARK_DRIVER_PYTHON: python3.9
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
@@ -786,74 +786,14 @@ jobs:
path: site.tar.bz2
retention-days: 1

maven-build:
needs: precondition
if: fromJson(needs.precondition.outputs.required).maven-build == 'true'
name: Java ${{ matrix.java }} build with Maven (${{ matrix.os }})
strategy:
fail-fast: false
matrix:
include:
- java: 17
os: ubuntu-latest
- java: 21
os: ubuntu-latest
- java: 21
os: macos-14
runs-on: ${{ matrix.os }}
timeout-minutes: 300
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v4
with:
path: ~/.m2/repository
key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
java${{ matrix.java }}-maven-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
export JAVA_VERSION=${{ matrix.java }}
# It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
rm -rf ~/.m2/repository/org/apache/spark
# Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well
tpcds-1g:
needs: precondition
if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
name: Run TPC-DS queries with SF=1
# Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation
runs-on: ubuntu-20.04
timeout-minutes: 300
timeout-minutes: 180
env:
SPARK_LOCAL_IP: localhost
steps:
@@ -954,7 +894,7 @@ jobs:
if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
name: Run Docker integration tests
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
env:
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
@@ -1022,7 +962,7 @@ jobs:
if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
name: Run Spark on Kubernetes Integration test
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -1094,7 +1034,7 @@ jobs:
if: fromJson(needs.precondition.outputs.required).ui == 'true'
name: Run Spark UI tests
runs-on: ubuntu-latest
timeout-minutes: 300
timeout-minutes: 180
steps:
- uses: actions/checkout@v4
- name: Use Node.js
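For context, the precondition job changed above publishes its JSON map of flags as a job output, and every downstream job in this file gates itself on that map via fromJson(needs.precondition.outputs.required). A minimal sketch of that gating pattern under simplified assumptions (illustrative job names, a single flag, and an echo into $GITHUB_OUTPUT that the excerpt above does not show):

jobs:
  precondition:
    runs-on: ubuntu-latest
    outputs:
      required: ${{ steps.set-outputs.outputs.required }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Check required jobs
        id: set-outputs
        run: |
          # Run the Kubernetes suite only when related modules changed (illustrative)
          kubernetes=`./dev/is-changed.py -m kubernetes`
          precondition="{\"build\": \"true\", \"k8s-integration-tests\": \"$kubernetes\"}"
          echo "required=$precondition" >> "$GITHUB_OUTPUT"
  k8s-integration-tests:
    needs: precondition
    if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "Kubernetes integration tests would run here"

With this shape, flipping a value in the precondition JSON (for example hard-coding "tpcds-1g": "false", as the change above does) is enough to skip the corresponding job without touching the job definition itself.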
1 change: 1 addition & 0 deletions .github/workflows/build_branch34.yml
@@ -47,5 +47,6 @@ jobs:
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true",
"k8s-integration-tests": "true",
"lint" : "true"
}
1 change: 1 addition & 0 deletions .github/workflows/build_branch35.yml
@@ -47,5 +47,6 @@ jobs:
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true",
"k8s-integration-tests": "true",
"lint" : "true"
}
3 changes: 2 additions & 1 deletion .github/workflows/build_java21.yml
@@ -46,5 +46,6 @@ jobs:
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true"
"docker-integration-tests": "true",
"k8s-integration-tests": "true"
}
2 changes: 1 addition & 1 deletion .github/workflows/build_maven_java21_macos14.yml
@@ -21,7 +21,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, macos-14)"

on:
schedule:
- cron: '0 20 * * *'
- cron: '0 20 */2 * *'

jobs:
run-build:
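For reference on the schedule tweak above: '0 20 * * *' triggered this macOS Maven build nightly at 20:00 UTC, while '0 20 */2 * *' fires only on every second day of the month (the 1st, 3rd, 5th, and so on), roughly halving the cadence; the NON-ANSI and snapshot schedules further down are thinned out in the same way. A minimal annotated trigger, assuming the standard POSIX cron semantics GitHub Actions uses:

on:
  schedule:
    # field order: minute hour day-of-month month day-of-week
    - cron: '0 20 */2 * *'   # 20:00 UTC on odd-numbered days of the month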
2 changes: 1 addition & 1 deletion .github/workflows/build_non_ansi.yml
@@ -21,7 +21,7 @@ name: "Build / NON-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)"

on:
schedule:
- cron: '0 1,13 * * *'
- cron: '0 1 * * *'

jobs:
run-build:
6 changes: 0 additions & 6 deletions .github/workflows/build_python_connect.yml
@@ -86,9 +86,6 @@ jobs:
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Make sure running Python workers that contains pyspark.core once. They will be reused.
python -c "from pyspark.sql import SparkSession; _ = SparkSession.builder.remote('sc://localhost').getOrCreate().range(100).repartition(100).mapInPandas(lambda x: x, 'id INT').collect()"
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
mv python/pyspark pyspark.back
@@ -109,9 +106,6 @@ jobs:
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Make sure running Python workers that contains pyspark.core once. They will be reused.
python -c "from pyspark.sql import SparkSession; _ = SparkSession.builder.remote('sc://localhost').getOrCreate().range(100).repartition(100).mapInPandas(lambda x: x, 'id INT').show(n=100)" > /dev/null
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
mv python/pyspark lib.back
113 changes: 113 additions & 0 deletions .github/workflows/build_python_connect35.yml
@@ -0,0 +1,113 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Build / Spark Connect Python-only (master-server, 35-client, Python 3.11)

on:
schedule:
- cron: '0 21 * * *'

jobs:
# Build: build Spark and run the tests for specified modules using SBT
build:
name: "Build modules: pyspark-connect"
runs-on: ubuntu-latest
timeout-minutes: 100
if: github.repository == 'apache/spark'
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-spark-connect-python-only-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: coursier-build-spark-connect-python-only-${{ hashFiles('**/pom.xml') }}
restore-keys: |
coursier-build-spark-connect-python-only-
- name: Install Java 17
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 17
- name: Install Python 3.11
uses: actions/setup-python@v5
with:
python-version: '3.11'
architecture: x64
- name: Build Spark
run: |
./build/sbt -Phive Test/package
- name: Install Python dependencies
run: |
pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
# Add Python deps for Spark Connect.
pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'
# Add torch as a testing dependency for TorchDistributor
pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
- name: Run tests
env:
SPARK_TESTING: 1
SPARK_SKIP_JVM_REQUIRED_TESTS: 1
SPARK_CONNECT_TESTING_REMOTE: sc://localhost
run: |
# Make less noisy
cp conf/log4j2.properties.template conf/log4j2.properties
sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Checkout to branch-3.5 to use the tests in branch-3.5.
cd ..
git clone --single-branch --branch branch-3.5 $GITHUB_SERVER_URL/$GITHUB_REPOSITORY spark-3.5
cd spark-3.5
# Several tests related to the catalog need to run sequentially, e.g., writing a table in a listener.
# Run branch-3.5 tests
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
# None of the tests in Pandas API on Spark depend on each other, so run them in parallel
./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-spark-connect-python-only
path: "**/target/test-reports/*.xml"
- name: Upload Spark Connect server log file
if: failure()
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-spark-connect-python-only
path: logs/*.out
4 changes: 1 addition & 3 deletions .github/workflows/build_rockdb_as_ui_backend.yml
@@ -42,7 +42,5 @@ jobs:
{
"build": "true",
"pyspark": "true",
"sparkr": "true",
"tpcds-1g": "true",
"docker-integration-tests": "true"
"sparkr": "true"
}
2 changes: 1 addition & 1 deletion .github/workflows/publish_snapshot.yml
@@ -21,7 +21,7 @@ name: Publish Snapshot

on:
schedule:
- cron: '0 0,12 * * *'
- cron: '0 0 * * *'
workflow_dispatch:
inputs:
branch: