diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml
index f521673f0f..1178d94a26 100644
--- a/.github/workflows/spark_test.yaml
+++ b/.github/workflows/spark_test.yaml
@@ -80,6 +80,9 @@ jobs:
           pipenv run pip install pyarrow==8.0.0
           pipenv run pip install numpy==1.20.3
         if: steps.git-diff.outputs.diff
+      - name: Build Spark 3.5.4-SNAPSHOT locally
+        run: python3 build/generate_spark_jars.py
+
       - name: Run Scala/Java and Python tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
         run: |
diff --git a/build.sbt b/build.sbt
index d1b3856a37..c1f931ae3e 100644
--- a/build.sbt
+++ b/build.sbt
@@ -52,7 +52,7 @@ val all_scala_versions = Seq(scala212, scala213)
 val default_scala_version = settingKey[String]("Default Scala version")
 Global / default_scala_version := scala212
 
-val LATEST_RELEASED_SPARK_VERSION = "3.5.3"
+val LATEST_RELEASED_SPARK_VERSION = "3.5.4-SNAPSHOT"
 val SPARK_MASTER_VERSION = "4.0.0-SNAPSHOT"
 val sparkVersion = settingKey[String]("Spark version")
 spark / sparkVersion := getSparkVersion()
@@ -174,6 +174,7 @@ def crossSparkSettings(): Seq[Setting[_]] = getSparkVersion() match {
     // For adding staged Spark RC versions, e.g.:
     // resolvers += "Apache Spark 3.5.0 (RC1) Staging" at "https://repository.apache.org/content/repositories/orgapachespark-1444/",
     resolvers += "Apache Spark 3.5.3 (RC1) Staging" at "https://repository.apache.org/content/repositories/orgapachespark-1464/",
+    // resolvers += "Apache Spark 3.5.4 Snapshots" at "https://repository.apache.org/content/groups/snapshots/",
     Compile / unmanagedSourceDirectories += (Compile / baseDirectory).value / "src" / "main" / "scala-spark-3.5",
     Test / unmanagedSourceDirectories += (Test / baseDirectory).value / "src" / "test" / "scala-spark-3.5",
     Antlr4 / antlr4Version := "4.9.3",
diff --git a/build/generate_spark_jars.py b/build/generate_spark_jars.py
new file mode 100644
index 0000000000..83266f07b4
--- /dev/null
+++ b/build/generate_spark_jars.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+#
+# Copyright (2021) The Delta Lake Project Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
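+
+# Invoked by the "Build Spark 3.5.4-SNAPSHOT locally" step in
+# .github/workflows/spark_test.yaml: clones Apache Spark's branch-3.5 and
+# installs the 3.5.4-SNAPSHOT JARs into the local Maven repository.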
+
+import os
+import shlex
+import subprocess
+
+def clone_and_build_spark():
+    """Clone Apache Spark's branch-3.5 and install its 3.5.4-SNAPSHOT
+    artifacts into the local Maven repository."""
+    branch = "branch-3.5"
+
+    print(f"Cloning Apache Spark repository ({branch})...")
+    run_cmd("git clone --depth 1 --branch %s https://github.com/apache/spark.git .temp_spark" %
+            (branch))
+
+    # Change to the cloned directory
+    os.chdir(".temp_spark")
+
+    # Give the Maven build enough stack, heap, and code-cache space
+    maven_opts = "-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
+    os.environ["MAVEN_OPTS"] = maven_opts
+    print(f"Set MAVEN_OPTS to: {maven_opts}")
+
+    # Build Spark and install the snapshot JARs into the local Maven
+    # repository, streaming Maven's output so CI logs show progress
+    print("Building Spark JAR files...")
+    build_command = "./build/mvn -DskipTests clean install"
+    run_cmd(build_command, stream_output=True)
+
+    print("Build completed successfully!")
+
+def run_cmd(cmd, throw_on_error=True, env=None, stream_output=False, **kwargs):
+    """Run a command; with stream_output=True, stream its output and return
+    the exit code, otherwise return an (exit_code, stdout, stderr) tuple."""
+    if isinstance(cmd, str):
+        cmd = shlex.split(cmd)
+    cmd_env = os.environ.copy()
+    if env:
+        cmd_env.update(env)
+
+    if stream_output:
+        child = subprocess.Popen(cmd, env=cmd_env, **kwargs)
+        exit_code = child.wait()
+        if throw_on_error and exit_code != 0:
+            raise Exception("Non-zero exitcode: %s" % (exit_code))
+        print("----\n")
+        return exit_code
+    else:
+        child = subprocess.Popen(
+            cmd,
+            env=cmd_env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            **kwargs)
+        (stdout, stderr) = child.communicate()
+        exit_code = child.wait()
+        if throw_on_error and exit_code != 0:
+            raise Exception(
+                "Non-zero exitcode: %s\n\nSTDOUT:\n%s\n\nSTDERR:%s" %
+                (exit_code, stdout, stderr))
+        return (exit_code, stdout, stderr)
+
+if __name__ == "__main__":
+    clone_and_build_spark()