diff --git a/.asf.yaml b/.asf.yaml
index c7267ca5f346a..60bc1df21a48a 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# https://cwiki.apache.org/confluence/display/INFRA/.asf.yaml+features+for+git+repositories
+# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
---
github:
description: "Apache Spark - A unified analytics engine for large-scale data processing"
diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE
index fcd3588cf81ee..66c28ee9666ee 100644
--- a/.github/PULL_REQUEST_TEMPLATE
+++ b/.github/PULL_REQUEST_TEMPLATE
@@ -8,6 +8,8 @@ Thanks for sending a pull request! Here are some tips for you:
6. If possible, provide a concise example to reproduce the issue for a faster review.
7. If you want to add a new configuration, please read the guideline first for naming configurations in
'core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala'.
+ 8. If you want to add or modify an error message, please read the guideline first:
+ https://spark.apache.org/error-message-guidelines.html
-->
### What changes were proposed in this pull request?
diff --git a/.github/autolabeler.yml b/.github/autolabeler.yml
deleted file mode 100644
index 3bca01f89950a..0000000000000
--- a/.github/autolabeler.yml
+++ /dev/null
@@ -1,133 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Bot page: https://github.com/apps/probot-autolabeler
-# The matching patterns follow the .gitignore spec.
-# See: https://git-scm.com/docs/gitignore#_pattern_format
-# Also, note that the plugin uses 'ignore' package. See also
-# https://github.com/kaelzhang/node-ignore
-INFRA:
- - ".github/"
- - "appveyor.yml"
- - "/tools/"
- - "/dev/create-release/"
- - ".asf.yaml"
- - ".gitattributes"
- - ".gitignore"
- - "/dev/github_jira_sync.py"
- - "/dev/merge_spark_pr.py"
- - "/dev/run-tests-jenkins*"
-BUILD:
- - "/dev/"
- - "!/dev/github_jira_sync.py"
- - "!/dev/merge_spark_pr.py"
- - "!/dev/run-tests-jenkins*"
- - "!/dev/.rat-excludes"
- - "/build/"
- - "/project/"
- - "/assembly/"
- - "*pom.xml"
- - "/bin/docker-image-tool.sh"
- - "/bin/find-spark-home*"
- - "scalastyle-config.xml"
-DOCS:
- - "docs/"
- - "/README.md"
- - "/CONTRIBUTING.md"
-EXAMPLES:
- - "examples/"
- - "/bin/run-example*"
-CORE:
- - "/core/"
- - "!UI.scala"
- - "!ui/"
- - "/common/kvstore/"
- - "/common/network-common/"
- - "/common/network-shuffle/"
- - "/python/pyspark/*.py"
- - "/python/pyspark/tests/*.py"
-SPARK SUBMIT:
- - "/bin/spark-submit*"
-SPARK SHELL:
- - "/repl/"
- - "/bin/spark-shell*"
-SQL:
- - "sql/"
- - "/common/unsafe/"
- - "!/python/pyspark/sql/avro/"
- - "!/python/pyspark/sql/streaming.py"
- - "!/python/pyspark/sql/tests/test_streaming.py"
- - "/bin/spark-sql*"
- - "/bin/beeline*"
- - "/sbin/*thriftserver*.sh"
- - "*SQL*.R"
- - "DataFrame.R"
- - "WindowSpec.R"
- - "catalog.R"
- - "column.R"
- - "functions.R"
- - "group.R"
- - "schema.R"
- - "types.R"
-AVRO:
- - "/external/avro/"
- - "/python/pyspark/sql/avro/"
-DSTREAM:
- - "/streaming/"
- - "/data/streaming/"
- - "/external/flume*"
- - "/external/kinesis*"
- - "/external/kafka*"
- - "/python/pyspark/streaming/"
-GRAPHX:
- - "/graphx/"
- - "/data/graphx/"
-ML:
- - "ml/"
- - "*mllib_*.R"
-MLLIB:
- - "spark/mllib/"
- - "/mllib-local/"
- - "/python/pyspark/mllib/"
-STRUCTURED STREAMING:
- - "sql/**/streaming/"
- - "/external/kafka-0-10-sql/"
- - "/python/pyspark/sql/streaming.py"
- - "/python/pyspark/sql/tests/test_streaming.py"
- - "*streaming.R"
-PYTHON:
- - "/bin/pyspark*"
- - "python/"
-R:
- - "r/"
- - "R/"
- - "/bin/sparkR*"
-YARN:
- - "/resource-managers/yarn/"
-MESOS:
- - "/resource-managers/mesos/"
- - "/sbin/*mesos*.sh"
-KUBERNETES:
- - "/resource-managers/kubernetes/"
-WINDOWS:
- - "*.cmd"
- - "/R/pkg/tests/fulltests/test_Windows.R"
-WEB UI:
- - "ui/"
- - "UI.scala"
-DEPLOY:
- - "/sbin/"
diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 0000000000000..bd61902925e33
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,152 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#
+# Pull Request Labeler Github Action Configuration: https://github.com/marketplace/actions/labeler
+#
+# Note that we currently cannot use the negation operator (i.e. `!`) for miniglob matches as they
+# would match any file that doesn't touch them. What's needed is the concept of `any`, which takes a
+# list of constraints / globs and then matches all of the constraints for either `any` of the files or
+# `all` of the files in the change set.
+#
+# However, `any`/`all` are not supported in a released version and testing off of the `main` branch
+# resulted in some other errors when testing.
+#
+# An issue has been opened upstream requesting that a release be cut that has support for all/any:
+# - https://github.com/actions/labeler/issues/111
+#
+# While we wait for this issue to be handled upstream, we can remove
+# the negated / `!` matches for now and at least have labels again.
+#
+INFRA:
+ - ".github/**/*"
+ - "appveyor.yml"
+ - "tools/**/*"
+ - "dev/create-release/**/*"
+ - ".asf.yaml"
+ - ".gitattributes"
+ - ".gitignore"
+ - "dev/github_jira_sync.py"
+ - "dev/merge_spark_pr.py"
+ - "dev/run-tests-jenkins*"
+BUILD:
+ # Can be supported when a stable release with correct all/any is released
+ #- any: ['dev/**/*', '!dev/github_jira_sync.py', '!dev/merge_spark_pr.py', '!dev/.rat-excludes']
+ - "dev/**/*"
+ - "build/**/*"
+ - "project/**/*"
+ - "assembly/**/*"
+ - "**/*pom.xml"
+ - "bin/docker-image-tool.sh"
+ - "bin/find-spark-home*"
+ - "scalastyle-config.xml"
+ # These can be added in the above `any` clause (and the /dev/**/* glob removed) when
+ # `any`/`all` support is released
+ # - "!dev/github_jira_sync.py"
+ # - "!dev/merge_spark_pr.py"
+ # - "!dev/run-tests-jenkins*"
+ # - "!dev/.rat-excludes"
+DOCS:
+ - "docs/**/*"
+ - "**/README.md"
+ - "**/CONTRIBUTING.md"
+EXAMPLES:
+ - "examples/**/*"
+ - "bin/run-example*"
+# CORE needs to be updated when all/any are released upstream.
+CORE:
+ # - any: ["core/**/*", "!**/*UI.scala", "!**/ui/**/*"] # If any file matches all of the globs defined in the list started by `any`, label is applied.
+ - "core/**/*"
+ - "common/kvstore/**/*"
+ - "common/network-common/**/*"
+ - "common/network-shuffle/**/*"
+ - "python/pyspark/**/*.py"
+ - "python/pyspark/tests/**/*.py"
+SPARK SUBMIT:
+ - "bin/spark-submit*"
+SPARK SHELL:
+ - "repl/**/*"
+ - "bin/spark-shell*"
+SQL:
+#- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming.py", "!python/pyspark/sql/tests/test_streaming.py"]
+ - "**/sql/**/*"
+ - "common/unsafe/**/*"
+ #- "!python/pyspark/sql/avro/**/*"
+ #- "!python/pyspark/sql/streaming.py"
+ #- "!python/pyspark/sql/tests/test_streaming.py"
+ - "bin/spark-sql*"
+ - "bin/beeline*"
+ - "sbin/*thriftserver*.sh"
+ - "**/*SQL*.R"
+ - "**/DataFrame.R"
+ - "**/*WindowSpec.R"
+ - "**/*catalog.R"
+ - "**/*column.R"
+ - "**/*functions.R"
+ - "**/*group.R"
+ - "**/*schema.R"
+ - "**/*types.R"
+AVRO:
+ - "external/avro/**/*"
+ - "python/pyspark/sql/avro/**/*"
+DSTREAM:
+ - "streaming/**/*"
+ - "data/streaming/**/*"
+ - "external/kinesis*"
+ - "external/kafka*"
+ - "python/pyspark/streaming/**/*"
+GRAPHX:
+ - "graphx/**/*"
+ - "data/graphx/**/*"
+ML:
+ - "**/ml/**/*"
+ - "**/*mllib_*.R"
+MLLIB:
+ - "**/spark/mllib/**/*"
+ - "mllib-local/**/*"
+ - "python/pyspark/mllib/**/*"
+STRUCTURED STREAMING:
+ - "**/sql/**/streaming/**/*"
+ - "external/kafka-0-10-sql/**/*"
+ - "python/pyspark/sql/streaming.py"
+ - "python/pyspark/sql/tests/test_streaming.py"
+ - "**/*streaming.R"
+PYTHON:
+ - "bin/pyspark*"
+ - "**/python/**/*"
+R:
+ - "**/r/**/*"
+ - "**/R/**/*"
+ - "bin/sparkR*"
+YARN:
+ - "resource-managers/yarn/**/*"
+MESOS:
+ - "resource-managers/mesos/**/*"
+ - "sbin/*mesos*.sh"
+KUBERNETES:
+ - "resource-managers/kubernetes/**/*"
+WINDOWS:
+ - "**/*.cmd"
+ - "R/pkg/tests/fulltests/test_Windows.R"
+WEB UI:
+ - "**/ui/**/*"
+ - "**/*UI.scala"
+DEPLOY:
+ - "sbin/**/*"
+
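For reference, once actions/labeler ships a release with the `any`/`all` matching requested in actions/labeler#111, the negated patterns dropped above could be restored along the lines of the commented-out examples in this file. A minimal sketch, assuming the proposed syntax lands unchanged:

```yaml
# Hypothetical BUILD entry for a future actions/labeler release with `any` support.
# The label would be applied if any changed file satisfies all of the listed globs,
# i.e. it is under dev/ but is none of the negated paths.
BUILD:
  - any: ["dev/**/*", "!dev/github_jira_sync.py", "!dev/merge_spark_pr.py",
          "!dev/run-tests-jenkins*", "!dev/.rat-excludes"]
```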
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000000000..76ae152f2307f
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,100 @@
+name: Run benchmarks
+
+on:
+ workflow_dispatch:
+ inputs:
+ class:
+ description: 'Benchmark class'
+ required: true
+ default: '*'
+ jdk:
+ description: 'JDK version: 8 or 11'
+ required: true
+ default: '8'
+ failfast:
+ description: 'Failfast: true or false'
+ required: true
+ default: 'true'
+ num-splits:
+ description: 'Number of job splits'
+ required: true
+ default: '1'
+
+jobs:
+ matrix-gen:
+ name: Generate matrix for job splits
+ runs-on: ubuntu-20.04
+ outputs:
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
+ env:
+ SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
+ steps:
+ - name: Generate matrix
+ id: set-matrix
+ run: echo "::set-output name=matrix::["`seq -s, 1 $SPARK_BENCHMARK_NUM_SPLITS`"]"
+
+ benchmark:
+ name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
+ needs: matrix-gen
+ # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
+ runs-on: ubuntu-20.04
+ strategy:
+ fail-fast: false
+ matrix:
+ split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
+ env:
+ SPARK_BENCHMARK_FAILFAST: ${{ github.event.inputs.failfast }}
+ SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
+ SPARK_BENCHMARK_CUR_SPLIT: ${{ matrix.split }}
+ SPARK_GENERATE_BENCHMARK_FILES: 1
+ SPARK_LOCAL_IP: localhost
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ # In order to get diff files
+ with:
+ fetch-depth: 0
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ benchmark-coursier-${{ github.event.inputs.jdk }}
+ - name: Install Java ${{ github.event.inputs.jdk }}
+ uses: actions/setup-java@v1
+ with:
+ java-version: ${{ github.event.inputs.jdk }}
+ - name: Run benchmarks
+ run: |
+ ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl test:package
+ # Make less noisy
+ cp conf/log4j.properties.template conf/log4j.properties
+ sed -i 's/log4j.rootCategory=INFO, console/log4j.rootCategory=WARN, console/g' conf/log4j.properties
+        # In the benchmarks we use local mode as the master, so only the driver memory needs to be set. Note that GitHub Actions has a 7 GB memory limit.
+ bin/spark-submit \
+ --driver-memory 6g --class org.apache.spark.benchmark.Benchmarks \
+ --jars "`find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -`" \
+ "`find . -name 'spark-core*-SNAPSHOT-tests.jar'`" \
+ "${{ github.event.inputs.class }}"
+ # To keep the directory structure and file permissions, tar them
+ # See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
+ echo "Preparing the benchmark results:"
+ tar -cvf benchmark-results-${{ github.event.inputs.jdk }}.tar `git diff --name-only` `git ls-files --others --exclude-standard`
+ - name: Upload benchmark results
+ uses: actions/upload-artifact@v2
+ with:
+ name: benchmark-results-${{ github.event.inputs.jdk }}-${{ matrix.split }}
+ path: benchmark-results-${{ github.event.inputs.jdk }}.tar
+
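To illustrate the `matrix-gen`/`fromJSON` hand-off above: with `num-splits` set to `'3'`, `seq -s, 1 3` prints `1,2,3`, the step output becomes the JSON array `[1,2,3]`, and the `benchmark` job fans out as if it had been written with a static matrix. A sketch of the effective expansion (values illustrative):

```yaml
# Effective matrix for num-splits = '3'; each split runs the benchmark classes
# assigned to it via SPARK_BENCHMARK_CUR_SPLIT out of SPARK_BENCHMARK_NUM_SPLITS.
strategy:
  fail-fast: false
  matrix:
    split: [1, 2, 3]
```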
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
new file mode 100644
index 0000000000000..be5978e7ecb97
--- /dev/null
+++ b/.github/workflows/build_and_test.yml
@@ -0,0 +1,561 @@
+name: Build and test
+
+on:
+ push:
+ branches:
+ - '**'
+ - '!branch-*.*'
+
+jobs:
+ # Build: build Spark and run the tests for specified modules.
+ build:
+ name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
+ # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
+ runs-on: ubuntu-20.04
+ strategy:
+ fail-fast: false
+ matrix:
+ java:
+ - 8
+ hadoop:
+ - hadoop3.2
+ hive:
+ - hive2.3
+ # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depend on the external Amazon Kinesis service.
+ # Note that the modules below are from sparktestsupport/modules.py.
+ modules:
+ - >-
+ core, unsafe, kvstore, avro,
+ network-common, network-shuffle, repl, launcher,
+ examples, sketch, graphx
+ - >-
+ catalyst, hive-thriftserver
+ - >-
+ streaming, sql-kafka-0-10, streaming-kafka-0-10,
+ mllib-local, mllib,
+ yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
+        # Here, we split the Hive and SQL tests into the slow ones and the rest of them.
+ included-tags: [""]
+ excluded-tags: [""]
+ comment: [""]
+ include:
+ # Hive tests
+ - modules: hive
+ java: 8
+ hadoop: hadoop3.2
+ hive: hive2.3
+ included-tags: org.apache.spark.tags.SlowHiveTest
+ comment: "- slow tests"
+ - modules: hive
+ java: 8
+ hadoop: hadoop3.2
+ hive: hive2.3
+ excluded-tags: org.apache.spark.tags.SlowHiveTest
+ comment: "- other tests"
+ # SQL tests
+ - modules: sql
+ java: 8
+ hadoop: hadoop3.2
+ hive: hive2.3
+ included-tags: org.apache.spark.tags.ExtendedSQLTest
+ comment: "- slow tests"
+ - modules: sql
+ java: 8
+ hadoop: hadoop3.2
+ hive: hive2.3
+ excluded-tags: org.apache.spark.tags.ExtendedSQLTest
+ comment: "- other tests"
+ env:
+ MODULES_TO_TEST: ${{ matrix.modules }}
+ EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+ INCLUDED_TAGS: ${{ matrix.included-tags }}
+ HADOOP_PROFILE: ${{ matrix.hadoop }}
+ HIVE_PROFILE: ${{ matrix.hive }}
+ GITHUB_PREV_SHA: ${{ github.event.before }}
+ SPARK_LOCAL_IP: localhost
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ # In order to fetch changed files
+ with:
+ fetch-depth: 0
+ repository: apache/spark
+ ref: master
+ - name: Sync the current branch with the latest in Apache Spark
+ if: github.repository != 'apache/spark'
+ id: sync-branch
+ run: |
+ apache_spark_ref=`git rev-parse HEAD`
+ git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
+ echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
+ # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
+ - name: Install Java ${{ matrix.java }}
+ uses: actions/setup-java@v1
+ with:
+ java-version: ${{ matrix.java }}
+ - name: Install Python 3.8
+ uses: actions/setup-python@v2
+      # We should install one Python that is 3 or higher for SQL and Yarn because:
+ # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+ # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+ if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+ with:
+ python-version: 3.8
+ architecture: x64
+ - name: Install Python packages (Python 3.8)
+ if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+ run: |
+ python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
+ python3.8 -m pip list
+ # Run the tests.
+ - name: Run tests
+ run: |
+ export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
+ # Hive and SQL tests become flaky when running in parallel as it's too intensive.
+ if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
+ ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
+ - name: Upload test results to report
+ if: always()
+ uses: actions/upload-artifact@v2
+ with:
+ name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+ path: "**/target/test-reports/*.xml"
+ - name: Upload unit tests log files
+ if: failure()
+ uses: actions/upload-artifact@v2
+ with:
+ name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+ path: "**/target/unit-tests.log"
+
+ pyspark:
+ name: "Build modules: ${{ matrix.modules }}"
+ runs-on: ubuntu-20.04
+ container:
+ image: dongjoon/apache-spark-github-action-image:20201025
+ strategy:
+ fail-fast: false
+ matrix:
+ modules:
+ - >-
+ pyspark-sql, pyspark-mllib, pyspark-resource
+ - >-
+ pyspark-core, pyspark-streaming, pyspark-ml
+ - >-
+ pyspark-pandas
+ env:
+ MODULES_TO_TEST: ${{ matrix.modules }}
+ HADOOP_PROFILE: hadoop3.2
+ HIVE_PROFILE: hive2.3
+ GITHUB_PREV_SHA: ${{ github.event.before }}
+ SPARK_LOCAL_IP: localhost
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ # In order to fetch changed files
+ with:
+ fetch-depth: 0
+ repository: apache/spark
+ ref: master
+ - name: Sync the current branch with the latest in Apache Spark
+ if: github.repository != 'apache/spark'
+ id: sync-branch
+ run: |
+ apache_spark_ref=`git rev-parse HEAD`
+ git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
+ echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
+ # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ pyspark-coursier-
+ - name: Install Python 3.6
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.6
+ architecture: x64
+ # This step takes much less time (~30s) than other Python versions so it is not included
+      # in the Docker image being used. There is also a technical issue with installing Python 3.6 on
+ # Ubuntu 20.04. See also SPARK-33162.
+ - name: Install Python packages (Python 3.6)
+ run: |
+ python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
+ python3.6 -m pip list
+ - name: Install Conda for pip packaging test
+ run: |
+ curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
+ bash miniconda.sh -b -p $HOME/miniconda
+ # Run the tests.
+ - name: Run tests
+ run: |
+ export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
+ export PATH=$PATH:$HOME/miniconda/bin
+ ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
+ - name: Upload test results to report
+ if: always()
+ uses: actions/upload-artifact@v2
+ with:
+ name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
+ path: "**/target/test-reports/*.xml"
+ - name: Upload unit tests log files
+ if: failure()
+ uses: actions/upload-artifact@v2
+ with:
+ name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
+ path: "**/target/unit-tests.log"
+
+ sparkr:
+ name: "Build modules: sparkr"
+ runs-on: ubuntu-20.04
+ container:
+ image: dongjoon/apache-spark-github-action-image:20201025
+ env:
+ HADOOP_PROFILE: hadoop3.2
+ HIVE_PROFILE: hive2.3
+ GITHUB_PREV_SHA: ${{ github.event.before }}
+ SPARK_LOCAL_IP: localhost
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ # In order to fetch changed files
+ with:
+ fetch-depth: 0
+ repository: apache/spark
+ ref: master
+ - name: Sync the current branch with the latest in Apache Spark
+ if: github.repository != 'apache/spark'
+ id: sync-branch
+ run: |
+ apache_spark_ref=`git rev-parse HEAD`
+ git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
+ echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
+ # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ sparkr-coursier-
+ - name: Run tests
+ run: |
+        # The following environment variables are also used by `r-lib/actions/setup-r` to avoid
+        # R issues in the Docker environment
+ export TZ=UTC
+ export _R_CHECK_SYSTEM_CLOCK_=FALSE
+ export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
+ ./dev/run-tests --parallelism 2 --modules sparkr
+ - name: Upload test results to report
+ if: always()
+ uses: actions/upload-artifact@v2
+ with:
+ name: test-results-sparkr--8-hadoop3.2-hive2.3
+ path: "**/target/test-reports/*.xml"
+
+ # Static analysis, and documentation build
+ lint:
+ name: Linters, licenses, dependencies and documentation generation
+ runs-on: ubuntu-20.04
+ env:
+ LC_ALL: C.UTF-8
+ LANG: C.UTF-8
+ container:
+ image: dongjoon/apache-spark-github-action-image:20201025
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ docs-coursier-
+ - name: Cache Maven local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.m2/repository
+ key: docs-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ docs-maven-
+ - name: Install Python 3.6
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.6
+ architecture: x64
+ - name: Install Python linter dependencies
+ run: |
+ # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+ # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+ # Jinja2 3.0.0+ causes error when building with Sphinx.
+ # See also https://issues.apache.org/jira/browse/SPARK-35375.
+ python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc 'jinja2<3.0.0'
+ - name: Install R linter dependencies and SparkR
+ run: |
+ apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
+ Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+ Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')"
+ ./R/install-dev.sh
+    - name: Install JavaScript linter dependencies
+ run: |
+ apt update
+ apt-get install -y nodejs npm
+ - name: Install dependencies for documentation generation
+ run: |
+ # pandoc is required to generate PySpark APIs as well in nbsphinx.
+ apt-get install -y libcurl4-openssl-dev pandoc
+ # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+ # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+ # Jinja2 3.0.0+ causes error when building with Sphinx.
+ # See also https://issues.apache.org/jira/browse/SPARK-35375.
+ python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
+ apt-get update -y
+ apt-get install -y ruby ruby-dev
+ Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+ gem install bundler
+ cd docs
+ bundle install
+ - name: Scala linter
+ run: ./dev/lint-scala
+ - name: Java linter
+ run: ./dev/lint-java
+ - name: Python linter
+ run: ./dev/lint-python
+ - name: R linter
+ run: ./dev/lint-r
+ - name: JS linter
+ run: ./dev/lint-js
+ - name: License test
+ run: ./dev/check-license
+ - name: Dependencies test
+ run: ./dev/test-dependencies.sh
+ - name: Run documentation build
+ run: |
+ cd docs
+ bundle exec jekyll build
+
+ java-11:
+ name: Java 11 build with Maven
+ runs-on: ubuntu-20.04
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Maven local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.m2/repository
+ key: java11-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ java11-maven-
+ - name: Install Java 11
+ uses: actions/setup-java@v1
+ with:
+ java-version: 11
+ - name: Build with Maven
+ run: |
+ export MAVEN_OPTS="-Xss256m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+ export MAVEN_CLI_OPTS="--no-transfer-progress"
+ # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
+ ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
+ rm -rf ~/.m2/repository/org/apache/spark
+
+ scala-213:
+ name: Scala 2.13 build with SBT
+ runs-on: ubuntu-20.04
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ scala-213-coursier-
+ - name: Install Java 8
+ uses: actions/setup-java@v1
+ with:
+ java-version: 8
+ - name: Build with SBT
+ run: |
+ ./dev/change-scala-version.sh 2.13
+ ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
+
+ hadoop-2:
+ name: Hadoop 2 build with SBT
+ runs-on: ubuntu-20.04
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ hadoop-2-coursier-
+ - name: Install Java 8
+ uses: actions/setup-java@v1
+ with:
+ java-version: 8
+ - name: Build with SBT
+ run: |
+ ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
+
+ tpcds-1g:
+ name: Run TPC-DS queries with SF=1
+ runs-on: ubuntu-20.04
+ env:
+ SPARK_LOCAL_IP: localhost
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ tpcds-coursier-
+ - name: Install Java 8
+ uses: actions/setup-java@v1
+ with:
+ java-version: 8
+ - name: Cache TPC-DS generated data
+ id: cache-tpcds-sf-1
+ uses: actions/cache@v2
+ with:
+ path: ./tpcds-sf-1
+ key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
+ - name: Checkout tpcds-kit repository
+ if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: databricks/tpcds-kit
+ path: ./tpcds-kit
+ - name: Build tpcds-kit
+ if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+ run: cd tpcds-kit/tools && make OS=LINUX
+ - name: Generate TPC-DS (SF=1) table data
+ if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+ run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
+ - name: Run TPC-DS queries
+ run: |
+ SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
+ - name: Upload test results to report
+ if: always()
+ uses: actions/upload-artifact@v2
+ with:
+ name: test-results-tpcds--8-hadoop3.2-hive2.3
+ path: "**/target/test-reports/*.xml"
+ - name: Upload unit tests log files
+ if: failure()
+ uses: actions/upload-artifact@v2
+ with:
+ name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
+ path: "**/target/unit-tests.log"
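To make the tag-based job split above concrete: the two `hive` entries in the matrix `include` list differ only in whether `SlowHiveTest` is passed as an included or an excluded tag. Roughly, the first entry resolves to the environment sketched below, which the "Run tests" step turns into a single `./dev/run-tests` invocation (the SQL entries work the same way with `ExtendedSQLTest`):

```yaml
# Approximate resolution of the "hive - slow tests" matrix entry at run time.
env:
  MODULES_TO_TEST: hive
  INCLUDED_TAGS: org.apache.spark.tags.SlowHiveTest
  EXCLUDED_TAGS: ""
# ...which the "Run tests" step executes roughly as:
#   ./dev/run-tests --parallelism 2 --modules hive \
#     --included-tags org.apache.spark.tags.SlowHiveTest --excluded-tags ""
```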
diff --git a/.github/workflows/cancel_duplicate_workflow_runs.yml b/.github/workflows/cancel_duplicate_workflow_runs.yml
new file mode 100644
index 0000000000000..b20fc947f6f13
--- /dev/null
+++ b/.github/workflows/cancel_duplicate_workflow_runs.yml
@@ -0,0 +1,19 @@
+name: Cancelling Duplicates
+on:
+ workflow_run:
+ workflows:
+ - 'Build and test'
+ types: ['requested']
+
+jobs:
+ cancel-duplicate-workflow-runs:
+ name: "Cancel duplicate workflow runs"
+ runs-on: ubuntu-latest
+ steps:
+ - uses: potiuk/cancel-workflow-runs@953e057dc81d3458935a18d1184c386b0f6b5738 # @master
+ name: "Cancel duplicate workflow runs"
+ with:
+ cancelMode: allDuplicates
+ token: ${{ secrets.GITHUB_TOKEN }}
+ sourceRunId: ${{ github.event.workflow_run.id }}
+ skipEventTypes: '["push", "schedule"]'
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 0000000000000..98855f4668b45
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Intentionally has a general name,
+# because the test status check created in GitHub Actions
+# currently randomly picks any associated workflow.
+# So, the name was changed to make sense in that context too.
+# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10
+
+name: "On pull requests"
+on: pull_request_target
+
+jobs:
+ label:
+ name: Label pull requests
+ runs-on: ubuntu-latest
+ steps:
+ # In order to get back the negated matches like in the old config,
+      # we need the actions/labeler concept of `all` and `any` which matches
+ # all of the given constraints / glob patterns for either `all`
+ # files or `any` file in the change set.
+ #
+ # Github issue which requests a timeline for a release with any/all support:
+ # - https://github.com/actions/labeler/issues/111
+ # This issue also references the issue that mentioned that any/all are only
+ # supported on main branch (previously called master):
+ # - https://github.com/actions/labeler/issues/73#issuecomment-639034278
+ #
+ # However, these are not in a published release and the current `main` branch
+ # has some issues upon testing.
+ - uses: actions/labeler@2.2.0
+ with:
+ repo-token: "${{ secrets.GITHUB_TOKEN }}"
+ sync-labels: true
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
deleted file mode 100644
index 4282504cc3984..0000000000000
--- a/.github/workflows/master.yml
+++ /dev/null
@@ -1,156 +0,0 @@
-name: master
-
-on:
- push:
- branches:
- - master
- pull_request:
- branches:
- - master
-
-jobs:
- build:
-
- runs-on: ubuntu-latest
- strategy:
- matrix:
- java: [ '1.8', '11' ]
- hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ]
- hive: [ 'hive-1.2', 'hive-2.3' ]
- exclude:
- - java: '11'
- hive: 'hive-1.2'
- - hadoop: 'hadoop-3.2'
- hive: 'hive-1.2'
- name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }}
-
- steps:
- - uses: actions/checkout@master
- # We split caches because GitHub Action Cache has a 400MB-size limit.
- - uses: actions/cache@v1
- with:
- path: build
- key: build-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- build-
- - uses: actions/cache@v1
- with:
- path: ~/.m2/repository/com
- key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-
- - uses: actions/cache@v1
- with:
- path: ~/.m2/repository/org
- key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-
- - uses: actions/cache@v1
- with:
- path: ~/.m2/repository/net
- key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-
- - uses: actions/cache@v1
- with:
- path: ~/.m2/repository/io
- key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-
- - name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@v1
- with:
- java-version: ${{ matrix.java }}
- - name: Build with Maven
- run: |
- export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
- export MAVEN_CLI_OPTS="--no-transfer-progress"
- mkdir -p ~/.m2
- ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install
- rm -rf ~/.m2/repository/org/apache/spark
-
-
- lint:
- runs-on: ubuntu-latest
- name: Linters (Java/Scala/Python), licenses, dependencies
- steps:
- - uses: actions/checkout@master
- - uses: actions/setup-java@v1
- with:
- java-version: '11'
- - uses: actions/setup-python@v1
- with:
- python-version: '3.x'
- architecture: 'x64'
- - name: Scala
- run: ./dev/lint-scala
- - name: Java
- run: ./dev/lint-java
- - name: Python
- run: |
- pip install flake8 sphinx numpy
- ./dev/lint-python
- - name: License
- run: ./dev/check-license
- - name: Dependencies
- run: ./dev/test-dependencies.sh
-
- lintr:
- runs-on: ubuntu-latest
- name: Linter (R)
- steps:
- - uses: actions/checkout@master
- - uses: actions/setup-java@v1
- with:
- java-version: '11'
- - uses: r-lib/actions/setup-r@v1
- with:
- r-version: '3.6.2'
- - name: Install lib
- run: |
- sudo apt-get install -y libcurl4-openssl-dev
- - name: install R packages
- run: |
- sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
- sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
- - name: package and install SparkR
- run: ./R/install-dev.sh
- - name: lint-r
- run: ./dev/lint-r
-
- docs:
- runs-on: ubuntu-latest
- name: Generate documents
- steps:
- - uses: actions/checkout@master
- - uses: actions/cache@v1
- with:
- path: ~/.m2/repository
- key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- docs-maven-repo-
- - uses: actions/setup-java@v1
- with:
- java-version: '1.8'
- - uses: actions/setup-python@v1
- with:
- python-version: '3.x'
- architecture: 'x64'
- - uses: actions/setup-ruby@v1
- with:
- ruby-version: '2.7'
- - uses: r-lib/actions/setup-r@v1
- with:
- r-version: '3.6.2'
- - name: Install lib and pandoc
- run: |
- sudo apt-get install -y libcurl4-openssl-dev pandoc
- - name: Install packages
- run: |
- pip install sphinx mkdocs numpy
- gem install jekyll jekyll-redirect-from rouge
- sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
- - name: Run jekyll build
- run: |
- cd docs
- jekyll build
diff --git a/.github/workflows/notify_test_workflow.yml b/.github/workflows/notify_test_workflow.yml
new file mode 100644
index 0000000000000..cc2b7a254e3f5
--- /dev/null
+++ b/.github/workflows/notify_test_workflow.yml
@@ -0,0 +1,125 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Intentionally has a general name,
+# because the test status check created in GitHub Actions
+# currently randomly picks any associated workflow.
+# So, the name was changed to make sense in that context too.
+# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10
+name: On pull request update
+on:
+ pull_request_target:
+ types: [opened, reopened, synchronize]
+
+jobs:
+ notify:
+ name: Notify test workflow
+ runs-on: ubuntu-20.04
+ steps:
+ - name: "Notify test workflow"
+ uses: actions/github-script@v3
+ if: ${{ github.base_ref == 'master' }}
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const endpoint = 'GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch'
+
+ // TODO: Should use pull_request.user and pull_request.user.repos_url?
+ // If a different person creates a commit to another forked repo,
+            // it wouldn't be detected.
+ const params = {
+ owner: context.payload.pull_request.head.repo.owner.login,
+ repo: context.payload.pull_request.head.repo.name,
+ id: 'build_and_test.yml',
+ branch: context.payload.pull_request.head.ref,
+ }
+
+ console.log('Ref: ' + context.payload.pull_request.head.ref)
+ console.log('SHA: ' + context.payload.pull_request.head.sha)
+
+ // Wait 3 seconds to make sure the fork repository triggered a workflow.
+ await new Promise(r => setTimeout(r, 3000))
+
+ let runs
+ try {
+ runs = await github.request(endpoint, params)
+ } catch (error) {
+ console.error(error)
+ // Assume that runs were not found.
+ }
+
+ const name = 'Build and test'
+ const head_sha = context.payload.pull_request.head.sha
+ let status = 'queued'
+
+ if (!runs || runs.data.workflow_runs.length === 0) {
+ status = 'completed'
+ const conclusion = 'action_required'
+
+ github.checks.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ name: name,
+ head_sha: head_sha,
+ status: status,
+ conclusion: conclusion,
+ output: {
+ title: 'Workflow run detection failed',
+ summary: `
+ Unable to detect the workflow run for testing the changes in your PR.
+
+ 1. If you did not enable GitHub Actions in your forked repository, please enable it. See also [Disabling or limiting GitHub Actions for a repository](https://docs.github.com/en/github/administering-a-repository/disabling-or-limiting-github-actions-for-a-repository) for more details.
+ 2. It is possible your branch is based on the old \`master\` branch in Apache Spark, please sync your branch to the latest master branch. For example as below:
+ \`\`\`bash
+ git fetch upstream
+ git rebase upstream/master
+ git push origin YOUR_BRANCH --force
+ \`\`\``
+ }
+ })
+ } else {
+ const runID = runs.data.workflow_runs[0].id
+
+ if (runs.data.workflow_runs[0].head_sha != context.payload.pull_request.head.sha) {
+ throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
+ }
+
+ const runUrl = 'https://github.com/'
+ + context.payload.pull_request.head.repo.full_name
+ + '/actions/runs/'
+ + runID
+
+ github.checks.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ name: name,
+ head_sha: head_sha,
+ status: status,
+ output: {
+ title: 'Test results',
+ summary: '[See test results](' + runUrl + ')',
+ text: JSON.stringify({
+ owner: context.payload.pull_request.head.repo.owner.login,
+ repo: context.payload.pull_request.head.repo.name,
+ run_id: runID
+ })
+ },
+ details_url: runUrl,
+ })
+ }
diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml
new file mode 100644
index 0000000000000..c5dbc8d057964
--- /dev/null
+++ b/.github/workflows/publish_snapshot.yml
@@ -0,0 +1,40 @@
+name: Publish Snapshot
+
+on:
+ schedule:
+ - cron: '0 0 * * *'
+
+jobs:
+ publish-snapshot:
+ if: github.repository == 'apache/spark'
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ branch:
+ - master
+ - branch-3.1
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@master
+ with:
+ ref: ${{ matrix.branch }}
+ - name: Cache Maven local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.m2/repository
+ key: snapshot-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ snapshot-maven-
+ - name: Install Java 8
+ uses: actions/setup-java@v1
+ with:
+ java-version: 8
+ - name: Publish snapshot
+ env:
+ ASF_USERNAME: ${{ secrets.NEXUS_USER }}
+ ASF_PASSWORD: ${{ secrets.NEXUS_PW }}
+ GPG_KEY: "not_used"
+ GPG_PASSPHRASE: "not_used"
+ GIT_REF: ${{ matrix.branch }}
+ run: ./dev/create-release/release-build.sh publish-snapshot
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
new file mode 100644
index 0000000000000..6d70f29b2efa2
--- /dev/null
+++ b/.github/workflows/test_report.yml
@@ -0,0 +1,25 @@
+name: Report test results
+on:
+ workflow_run:
+ workflows: ["Build and test"]
+ types:
+ - completed
+
+jobs:
+ test_report:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Download test results to report
+ uses: dawidd6/action-download-artifact@v2
+ with:
+ github_token: ${{ secrets.GITHUB_TOKEN }}
+ workflow: ${{ github.event.workflow_run.workflow_id }}
+ commit: ${{ github.event.workflow_run.head_commit.id }}
+ workflow_conclusion: completed
+ - name: Publish test report
+ uses: scacap/action-surefire-report@v1
+ with:
+ check_name: Report test results
+ github_token: ${{ secrets.GITHUB_TOKEN }}
+ report_paths: "**/target/test-reports/*.xml"
+ commit: ${{ github.event.workflow_run.head_commit.id }}
diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml
new file mode 100644
index 0000000000000..16fbe3a6f66c7
--- /dev/null
+++ b/.github/workflows/update_build_status.yml
@@ -0,0 +1,97 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Update build status workflow
+
+on:
+ schedule:
+ - cron: "*/15 * * * *"
+
+jobs:
+ update:
+ name: Update build status
+ runs-on: ubuntu-20.04
+ steps:
+ - name: "Update build status"
+ uses: actions/github-script@v3
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const endpoint = 'GET /repos/:owner/:repo/pulls?state=:state'
+ const params = {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ state: 'open'
+ }
+
+ // See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
+ const maybeReady = ['behind', 'clean', 'draft', 'has_hooks', 'unknown', 'unstable'];
+
+ // Iterate open PRs
+ for await (const prs of github.paginate.iterator(endpoint,params)) {
+ // Each page
+ for await (const pr of prs.data) {
+ console.log('SHA: ' + pr.head.sha)
+ console.log(' Mergeable status: ' + pr.mergeable_state)
+ if (pr.mergeable_state == null || maybeReady.includes(pr.mergeable_state)) {
+ const checkRuns = await github.request('GET /repos/{owner}/{repo}/commits/{ref}/check-runs', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ ref: pr.head.sha
+ })
+
+ // Iterator GitHub Checks in the PR
+ for await (const cr of checkRuns.data.check_runs) {
+ if (cr.name == 'Build and test' && cr.conclusion != "action_required") {
+ // text contains parameters to make request in JSON.
+ const params = JSON.parse(cr.output.text)
+
+ // Get the workflow run in the forked repository
+ const run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
+
+ // Keep syncing the status of the checks
+ if (run.data.status == 'completed') {
+ console.log(' Run ' + cr.id + ': set status (' + run.data.status + ') and conclusion (' + run.data.conclusion + ')')
+ const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ check_run_id: cr.id,
+ output: cr.output,
+ status: run.data.status,
+ conclusion: run.data.conclusion,
+ details_url: run.data.details_url
+ })
+ } else {
+ console.log(' Run ' + cr.id + ': set status (' + run.data.status + ')')
+ const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ check_run_id: cr.id,
+ output: cr.output,
+ status: run.data.status,
+ details_url: run.data.details_url
+ })
+ }
+
+ break
+ }
+ }
+ }
+ }
+ }
diff --git a/.gitignore b/.gitignore
index 198fdee39be95..0411eb736c11b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,13 +8,17 @@
*.swp
*~
.DS_Store
+.bsp/
.cache
.classpath
.ensime
.ensime_cache/
.ensime_lucene
.generated-mima*
-.idea/
+# The star is required for further !.idea/ to work, see https://git-scm.com/docs/gitignore
+.idea/*
+!.idea/vcs.xml
+python/.idea
.idea_modules/
.project
.pydevproject
@@ -29,7 +33,6 @@ R/pkg/tests/fulltests/Rplots.pdf
build/*.jar
build/apache-maven*
build/scala*
-build/zinc*
cache
checkpoint
conf/*.cmd
@@ -47,10 +50,12 @@ dev/pr-deps/
dist/
docs/_site/
docs/api
+docs/.local_ruby_bundle
sql/docs
sql/site
lib_managed/
lint-r-report.log
+lint-js-report.log
log/
logs/
out/
@@ -64,9 +69,11 @@ python/lib/pyspark.zip
python/.eggs/
python/deps
python/docs/_site/
+python/docs/source/reference/api/
python/test_coverage/coverage_data
python/test_coverage/htmlcov
python/pyspark/python
+.mypy_cache/
reports/
scalastyle-on-compile.generated.xml
scalastyle-output.xml
@@ -79,6 +86,7 @@ target/
unit-tests.log
work/
docs/.jekyll-metadata
+docs/.jekyll-cache
# For Hive
TempStatsStore/
@@ -99,3 +107,6 @@ spark-warehouse/
# For SBT
.jvmopts
+
+# For Node.js
+node_modules
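A note on the `.idea/*` pattern added above: git never descends into a directory that is itself excluded, so a later negation such as `!.idea/vcs.xml` only works when the directory's contents, rather than the directory, are ignored. A small sketch of the difference:

```
# Does NOT work: the directory itself is excluded, so the negation is never reached.
#   .idea/
#   !.idea/vcs.xml

# Works (the form used in this patch): ignore the contents, then re-include one file.
.idea/*
!.idea/vcs.xml
```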
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000000..28fd3fcdf10ea
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,36 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/LICENSE b/LICENSE
index 6b169b1447f14..df6bed16f4471 100644
--- a/LICENSE
+++ b/LICENSE
@@ -222,14 +222,13 @@ external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaRe
Python Software Foundation License
----------------------------------
-pyspark/heapq3.py
-python/docs/_static/copybutton.js
+python/docs/source/_static/copybutton.js
BSD 3-Clause
------------
python/lib/py4j-*-src.zip
-python/pyspark/cloudpickle.py
+python/pyspark/cloudpickle/*.py
python/pyspark/join.py
core/src/main/resources/org/apache/spark/ui/static/d3.min.js
diff --git a/LICENSE-binary b/LICENSE-binary
index b50da6be4e697..32c7fc315d1a3 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -218,7 +218,6 @@ javax.jdo:jdo-api
joda-time:joda-time
net.sf.opencsv:opencsv
org.apache.derby:derby
-org.ehcache:ehcache
org.objenesis:objenesis
org.roaringbitmap:RoaringBitmap
org.scalanlp:breeze-macros_2.12
@@ -261,7 +260,6 @@ net.sf.supercsv:super-csv
org.apache.arrow:arrow-format
org.apache.arrow:arrow-memory
org.apache.arrow:arrow-vector
-org.apache.commons:commons-configuration2
org.apache.commons:commons-crypto
org.apache.commons:commons-lang3
org.apache.hadoop:hadoop-annotations
@@ -296,7 +294,6 @@ org.apache.kerby:kerby-config
org.apache.kerby:kerby-pkix
org.apache.kerby:kerby-util
org.apache.kerby:kerby-xdr
-org.apache.kerby:token-provider
org.apache.orc:orc-core
org.apache.orc:orc-mapreduce
org.mortbay.jetty:jetty
@@ -316,19 +313,15 @@ com.fasterxml.jackson.jaxrs:jackson-jaxrs-json-provider
com.fasterxml.jackson.module:jackson-module-jaxb-annotations
com.fasterxml.jackson.module:jackson-module-paranamer
com.fasterxml.jackson.module:jackson-module-scala_2.12
-com.fasterxml.woodstox:woodstox-core
com.github.mifmif:generex
-com.github.stephenc.jcip:jcip-annotations
com.google.code.findbugs:jsr305
com.google.code.gson:gson
com.google.flatbuffers:flatbuffers-java
com.google.guava:guava
com.google.inject:guice
com.google.inject.extensions:guice-servlet
-com.nimbusds:nimbus-jose-jwt
com.twitter:parquet-hadoop-bundle
commons-cli:commons-cli
-commons-daemon:commons-daemon
commons-dbcp:commons-dbcp
commons-io:commons-io
commons-lang:commons-lang
@@ -340,8 +333,6 @@ javax.inject:javax.inject
javax.validation:validation-api
log4j:apache-log4j-extras
log4j:log4j
-net.minidev:accessors-smart
-net.minidev:json-smart
net.sf.jpam:jpam
org.apache.avro:avro
org.apache.avro:avro-ipc
@@ -357,7 +348,6 @@ org.apache.directory.server:apacheds-i18n
org.apache.directory.server:apacheds-kerberos-codec
org.apache.htrace:htrace-core
org.apache.ivy:ivy
-org.apache.geronimo.specs:geronimo-jcache_1.0_spec
org.apache.mesos:mesos
org.apache.parquet:parquet-column
org.apache.parquet:parquet-common
@@ -432,7 +422,6 @@ BSD 2-Clause
------------
com.github.luben:zstd-jni
-dnsjava:dnsjava
javolution:javolution
com.esotericsoftware:kryo-shaded
com.esotericsoftware:minlog
@@ -440,7 +429,6 @@ com.esotericsoftware:reflectasm
com.google.protobuf:protobuf-java
org.codehaus.janino:commons-compiler
org.codehaus.janino:janino
-org.codehaus.woodstox:stax2-api
jline:jline
org.jodd:jodd-core
com.github.wendykierp:JTransforms
@@ -456,8 +444,6 @@ org.antlr:ST4
org.antlr:stringtemplate
org.antlr:antlr4-runtime
antlr:antlr
-com.github.fommil.netlib:core
-com.google.re2j:re2j
com.thoughtworks.paranamer:paranamer
org.scala-lang:scala-compiler
org.scala-lang:scala-library
@@ -498,6 +484,9 @@ org.slf4j:jul-to-slf4j
org.slf4j:slf4j-api
org.slf4j:slf4j-log4j12
com.github.scopt:scopt_2.12
+dev.ludovic.netlib:blas
+dev.ludovic.netlib:arpack
+dev.ludovic.netlib:lapack
core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js
core/src/main/resources/org/apache/spark/ui/static/*dataTables*
@@ -521,7 +510,6 @@ Common Development and Distribution License (CDDL) 1.1
------------------------------------------------------
javax.el:javax.el-api https://javaee.github.io/uel-ri/
-javax.servlet:javax.servlet-api https://javaee.github.io/servlet-spec/
javax.servlet.jsp:jsp-api
javax.transaction:jta http://www.oracle.com/technetwork/java/index.html
javax.xml.bind:jaxb-api https://github.com/javaee/jaxb-v2
@@ -553,16 +541,11 @@ Eclipse Public License (EPL) 2.0
--------------------------------
jakarta.annotation:jakarta-annotation-api https://projects.eclipse.org/projects/ee4j.ca
+jakarta.servlet:jakarta.servlet-api https://projects.eclipse.org/projects/ee4j.servlet
jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api
org.glassfish.hk2.external:jakarta.inject
-Python Software Foundation License
-----------------------------------
-
-pyspark/heapq3.py
-
-
Public Domain
-------------
diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md
index 4d9b6416c01cb..2f410cf8bfd94 100644
--- a/R/CRAN_RELEASE.md
+++ b/R/CRAN_RELEASE.md
@@ -25,7 +25,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple
First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control.
-Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`).
+Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it does so with `--no-manual --no-vignettes`, which skips a few vignette and PDF checks - it is therefore preferable to run `R CMD check` on the manually built source package before uploading a release. Also note that for the CRAN checks of PDF vignettes to succeed, the `qpdf` tool must be installed (to install it, e.g. `yum -q -y install qpdf`).
To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on the status of all `WARNING`s (there should not be any) or `NOTE`s. As part of `check-cran.sh` and the release process, the vignettes are built - make sure `SPARK_HOME` is set and Spark jars are accessible.
diff --git a/R/DOCUMENTATION.md b/R/DOCUMENTATION.md
index fd0c7644189db..d4ff3f5f4875b 100644
--- a/R/DOCUMENTATION.md
+++ b/R/DOCUMENTATION.md
@@ -19,7 +19,7 @@ license: |
# SparkR Documentation
SparkR documentation is generated by using in-source comments and annotated by using
-[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages,
+[`roxygen2`](https://cran.r-project.org/package=roxygen2). After making changes to the documentation and generating man pages,
you can run the following from an R console in the SparkR home directory
```R
library(devtools)
diff --git a/R/README.md b/R/README.md
index 31174c73526f2..da9f042b4fded 100644
--- a/R/README.md
+++ b/R/README.md
@@ -17,10 +17,14 @@ export R_HOME=/home/username/R
#### Build Spark
-Build Spark with [Maven](https://spark.apache.org/docs/latest/building-spark.html#buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
+Build Spark with [Maven](https://spark.apache.org/docs/latest/building-spark.html#buildmvn) or [SBT](https://spark.apache.org/docs/latest/building-spark.html#building-with-sbt), and include the `-Psparkr` profile to build the R package. For example, to use the default Hadoop versions you can run:
```bash
+# Maven
./build/mvn -DskipTests -Psparkr package
+
+# SBT
+./build/sbt -Psparkr package
```
#### Running sparkR
diff --git a/R/WINDOWS.md b/R/WINDOWS.md
index dbc27178bdb8c..9fe4a22bf22b2 100644
--- a/R/WINDOWS.md
+++ b/R/WINDOWS.md
@@ -22,8 +22,8 @@ To build SparkR on Windows, the following steps are required
1. Make sure `bash` is available and in `PATH` if you already have a built-in `bash` on Windows. If you do not have it, install [Cygwin](https://www.cygwin.com/).
-2. Install R (>= 3.1) and [Rtools](https://cloud.r-project.org/bin/windows/Rtools/). Make sure to
-include Rtools and R in `PATH`. Note that support for R prior to version 3.4 is deprecated as of Spark 3.0.0.
+2. Install R (>= 3.5) and [Rtools](https://cloud.r-project.org/bin/windows/Rtools/). Make sure to
+include Rtools and R in `PATH`.
3. Install JDK that SparkR supports (see `R/pkg/DESCRIPTION`), and set `JAVA_HOME` in the system environment variables.
diff --git a/R/install-dev.bat b/R/install-dev.bat
index ed1c91ae3a0ff..ae5aa589a19d1 100644
--- a/R/install-dev.bat
+++ b/R/install-dev.bat
@@ -24,7 +24,13 @@ set SPARK_HOME=%~dp0..
MKDIR %SPARK_HOME%\R\lib
-R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" %SPARK_HOME%\R\pkg\
+rem When the package path is passed directly as an argument to R CMD INSTALL,
+rem R 4.0 mangles it into a path such as 'C:\projects\spark\R\..\R\pkg"'.
+rem To work around this, change into the package directory and install from there.
+rem See also SPARK-32074
+pushd %SPARK_HOME%\R\pkg\
+R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" .
+popd
rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
pushd %SPARK_HOME%\R\lib
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 52d7e1f4daa53..f55286da25de1 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
-Version: 3.1.0
+Version: 3.2.0
Title: R Front End for 'Apache Spark'
Description: Provides an R Front end for 'Apache Spark' .
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
@@ -11,19 +11,20 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
email = "felixcheung@apache.org"),
person(family = "The Apache Software Foundation", role = c("aut", "cph")))
License: Apache License (== 2.0)
-URL: https://www.apache.org/ https://spark.apache.org/
+URL: https://www.apache.org https://spark.apache.org
BugReports: https://spark.apache.org/contributing.html
SystemRequirements: Java (>= 8, < 12)
Depends:
- R (>= 3.1),
+ R (>= 3.5),
methods
Suggests:
knitr,
rmarkdown,
+ markdown,
testthat,
e1071,
survival,
- arrow (>= 0.15.1)
+ arrow (>= 1.0.0)
Collate:
'schema.R'
'generics.R'
@@ -59,7 +60,7 @@ Collate:
'types.R'
'utils.R'
'window.R'
-RoxygenNote: 5.0.1
+RoxygenNote: 7.1.1
VignetteBuilder: knitr
NeedsCompilation: no
Encoding: UTF-8
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 53a0b7856567e..1f0b69db65151 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -202,6 +202,7 @@ exportMethods("%<=>%",
"%in%",
"abs",
"acos",
+ "acosh",
"add_months",
"alias",
"approx_count_distinct",
@@ -222,20 +223,27 @@ exportMethods("%<=>%",
"array_remove",
"array_repeat",
"array_sort",
+ "array_to_vector",
"array_transform",
"arrays_overlap",
"array_union",
"arrays_zip",
"arrays_zip_with",
"asc",
+ "asc_nulls_first",
+ "asc_nulls_last",
"ascii",
"asin",
+ "asinh",
+ "assert_true",
"atan",
+ "atanh",
"atan2",
"avg",
"base64",
"between",
"bin",
+ "bitwise_not",
"bitwiseNOT",
"bround",
"cast",
@@ -252,6 +260,7 @@ exportMethods("%<=>%",
"cos",
"cosh",
"count",
+ "count_distinct",
"countDistinct",
"crc32",
"create_array",
@@ -272,6 +281,9 @@ exportMethods("%<=>%",
"degrees",
"dense_rank",
"desc",
+ "desc_nulls_first",
+ "desc_nulls_last",
+ "dropFields",
"element_at",
"encode",
"endsWith",
@@ -286,6 +298,7 @@ exportMethods("%<=>%",
"floor",
"format_number",
"format_string",
+ "from_avro",
"from_csv",
"from_json",
"from_unixtime",
@@ -348,6 +361,7 @@ exportMethods("%<=>%",
"negate",
"next_day",
"not",
+ "nth_value",
"ntile",
"otherwise",
"over",
@@ -357,8 +371,10 @@ exportMethods("%<=>%",
"pmod",
"posexplode",
"posexplode_outer",
+ "product",
"quarter",
"radians",
+ "raise_error",
"rand",
"randn",
"rank",
@@ -378,8 +394,11 @@ exportMethods("%<=>%",
"sha1",
"sha2",
"shiftLeft",
+ "shiftleft",
"shiftRight",
+ "shiftright",
"shiftRightUnsigned",
+ "shiftrightunsigned",
"shuffle",
"sd",
"sign",
@@ -402,11 +421,14 @@ exportMethods("%<=>%",
"substr",
"substring_index",
"sum",
+ "sum_distinct",
"sumDistinct",
"tan",
"tanh",
+ "timestamp_seconds",
"toDegrees",
"toRadians",
+ "to_avro",
"to_csv",
"to_date",
"to_json",
@@ -425,9 +447,11 @@ exportMethods("%<=>%",
"variance",
"var_pop",
"var_samp",
+ "vector_to_array",
"weekofyear",
"when",
"window",
+ "withField",
"xxhash64",
"year")
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 15b3ce2935427..72d96151f6371 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -880,7 +880,7 @@ setMethod("toJSON",
#' Save the contents of SparkDataFrame as a JSON file
#'
-#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{
+#' Save the contents of a SparkDataFrame as a JSON file (\href{https://jsonlines.org/}{
#' JSON Lines text format or newline-delimited JSON}). Files written out
#' with this method can be read back in as a SparkDataFrame using read.json().
#'
@@ -1233,14 +1233,10 @@ setMethod("collect",
port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout)
output <- tryCatch({
doServerAuth(conn, authSecret)
- arrowTable <- arrow::read_arrow(readRaw(conn))
- # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
- if (exists("as_tibble", envir = asNamespace("arrow"))) {
- as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)
- } else {
- as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
- }
- }, finally = {
+ arrowTable <- arrow::read_ipc_stream(readRaw(conn))
+ as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
+ },
+ finally = {
close(conn)
})
return(output)
@@ -2281,16 +2277,17 @@ setMethod("mutate",
# For named arguments, use the names for arguments as the column names
# For unnamed arguments, use the argument symbols as the column names
- args <- sapply(substitute(list(...))[-1], deparse)
ns <- names(cols)
- if (!is.null(ns)) {
- lapply(seq_along(args), function(i) {
- if (ns[[i]] != "") {
- args[[i]] <<- ns[[i]]
- }
+ if (is.null(ns)) ns <- rep("", length(cols))
+ named_idx <- nzchar(ns)
+ if (!all(named_idx)) {
+ # SPARK-31517: deparse uses width.cutoff on wide input and the
+ # output is length>1, so need to collapse it to scalar
+ colsub <- substitute(list(...))[-1L]
+ ns[!named_idx] <- sapply(which(!named_idx), function(ii) {
+ paste(gsub("^\\s*|\\s*$", "", deparse(colsub[[ii]])), collapse = " ")
})
}
- ns <- args
# The last column of the same name in the specific columns takes effect
deDupCols <- list()
@@ -2776,7 +2773,7 @@ setMethod("merge",
#' Creates a list of columns by replacing the intersected ones with aliases
#'
#' Creates a list of columns by replacing the intersected ones with aliases.
-#' The name of the alias column is formed by concatanating the original column name and a suffix.
+#' The name of the alias column is formed by concatenating the original column name and a suffix.
#'
#' @param x a SparkDataFrame
#' @param intersectedColNames a list of intersected column names of the SparkDataFrame
@@ -2867,11 +2864,18 @@ setMethod("unionAll",
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account. Input SparkDataFrames can have different data types in the schema.
#'
+#' When the parameter allowMissingColumns is \code{TRUE}, the set of column names
+#' in x and y can differ; missing columns will be filled with null.
+#' Further, the missing columns of x will be added at the end
+#' of the schema of the union result.
+#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
+#' @param allowMissingColumns logical. If \code{TRUE}, allows the column sets of x and y to differ.
+#' @param ... further arguments to be passed to or from other methods.
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
@@ -2884,12 +2888,15 @@ setMethod("unionAll",
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
+#'
+#' df3 <- select(createDataFrame(mtcars), "carb")
+#' head(unionByName(df1, df3, allowMissingColumns = TRUE))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
- function(x, y) {
- unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
+ function(x, y, allowMissingColumns=FALSE) {
+ unioned <- callJMethod(x@sdf, "unionByName", y@sdf, allowMissingColumns)
dataFrame(unioned)
})
@@ -3225,7 +3232,7 @@ setMethod("describe",
#' \item stddev
#' \item min
#' \item max
-#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%")
+#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75\%")
#' }
#' If no statistics are given, this function computes count, mean, stddev, min,
#' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max.
@@ -3438,7 +3445,8 @@ setMethod("as.data.frame",
#' @note attach since 1.6.0
setMethod("attach",
signature(what = "SparkDataFrame"),
- function(what, pos = 2L, name = deparse(substitute(what), backtick = FALSE),
+ function(what, pos = 2L,
+ name = paste(deparse(substitute(what), backtick = FALSE), collapse = " "),
warn.conflicts = TRUE) {
args <- as.list(environment()) # capture all parameters - this must be the first line
newEnv <- assignNewEnv(args$what)
@@ -3737,7 +3745,7 @@ setMethod("histogram",
#'
#' @param x a SparkDataFrame.
#' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}.
-#' @param tableName yhe name of the table in the external database.
+#' @param tableName the name of the table in the external database.
#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
#' save mode (it is 'error' by default)
#' @param ... additional JDBC database connection properties.
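The mutate() change above works around a base R behaviour: `deparse()` splits a long unnamed expression into several strings once it exceeds `width.cutoff` (60 characters by default), which previously produced broken column names (SPARK-31517). A minimal base R sketch of the issue and of the collapse used above; the variable names are illustrative only:

```r
# deparse() may return a character vector of length > 1 for wide expressions
long_expr <- quote(a_fairly_long_column_name + another_fairly_long_column_name * 2)
length(deparse(long_expr))  # can be > 1 once the 60-character width.cutoff is exceeded

# Collapsing the pieces back into one string, as the mutate() fix does,
# yields a single usable column name
paste(gsub("^\\s*|\\s*$", "", deparse(long_expr)), collapse = " ")
```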
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 7a1d157bb8a36..408a3ff25b2b2 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -970,7 +970,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical",
MAXINT)))))
# If the first sample didn't turn out large enough, keep trying to
# take samples; this shouldn't happen often because we use a big
- # multiplier for thei initial size
+ # multiplier for the initial size
while (length(samples) < total)
samples <- collectRDD(sampleRDD(x, withReplacement, fraction,
as.integer(ceiling(stats::runif(1,
@@ -1512,7 +1512,7 @@ setMethod("glom",
#'
#' @param x An RDD.
#' @param y An RDD.
-#' @return a new RDD created by performing the simple union (witout removing
+#' @return a new RDD created by performing the simple union (without removing
#' duplicates) of two input RDDs.
#' @examples
#'\dontrun{
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index c0ac68332ec41..14262e1a74ab0 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -203,7 +203,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) {
})
}
- # SPAKR-SQL does not support '.' in column name, so replace it with '_'
+ # SPARK-SQL does not support '.' in column name, so replace it with '_'
# TODO(davies): remove this once SPARK-2775 is fixed
names <- lapply(names, function(n) {
nn <- gsub(".", "_", n, fixed = TRUE)
@@ -374,7 +374,7 @@ setMethod("toDF", signature(x = "RDD"),
#' Create a SparkDataFrame from a JSON file.
#'
#' Loads a JSON file, returning the result as a SparkDataFrame
-#' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
+#' By default, (\href{https://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
#' ) is supported. For JSON (one record per file), set a named property \code{multiLine} to
#' \code{TRUE}.
#' It goes through the entire dataset once to determine the schema.
diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R
index 037809cd0923e..be47d0117ed7f 100644
--- a/R/pkg/R/WindowSpec.R
+++ b/R/pkg/R/WindowSpec.R
@@ -54,7 +54,7 @@ setMethod("show", "WindowSpec",
#' Defines the partitioning columns in a WindowSpec.
#'
#' @param x a WindowSpec.
-#' @param col a column to partition on (desribed by the name or Column).
+#' @param col a column to partition on (described by the name or Column).
#' @param ... additional column(s) to partition on.
#' @return A WindowSpec.
#' @rdname partitionBy
@@ -231,7 +231,7 @@ setMethod("rangeBetween",
#' @rdname over
#' @name over
#' @aliases over,Column,WindowSpec-method
-#' @family colum_func
+#' @family column_func
#' @examples
#' \dontrun{
#' df <- createDataFrame(mtcars)
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 7926a9a2467ee..9fa117ccb6281 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -67,7 +67,11 @@ operators <- list(
# we can not override `&&` and `||`, so use `&` and `|` instead
"&" = "and", "|" = "or", "^" = "pow"
)
-column_functions1 <- c("asc", "desc", "isNaN", "isNull", "isNotNull")
+column_functions1 <- c(
+ "asc", "asc_nulls_first", "asc_nulls_last",
+ "desc", "desc_nulls_first", "desc_nulls_last",
+ "isNaN", "isNull", "isNotNull"
+)
column_functions2 <- c("like", "rlike", "getField", "getItem", "contains")
createOperator <- function(op) {
@@ -131,7 +135,7 @@ createMethods()
#' @rdname alias
#' @name alias
#' @aliases alias,Column-method
-#' @family colum_func
+#' @family column_func
#' @examples
#' \dontrun{
#' df <- createDataFrame(iris)
@@ -157,7 +161,7 @@ setMethod("alias",
#'
#' @rdname substr
#' @name substr
-#' @family colum_func
+#' @family column_func
#' @aliases substr,Column-method
#'
#' @param x a Column.
@@ -183,7 +187,7 @@ setMethod("substr", signature(x = "Column"),
#'
#' @rdname startsWith
#' @name startsWith
-#' @family colum_func
+#' @family column_func
#' @aliases startsWith,Column-method
#'
#' @param x vector of character string whose "starts" are considered
@@ -202,7 +206,7 @@ setMethod("startsWith", signature(x = "Column"),
#'
#' @rdname endsWith
#' @name endsWith
-#' @family colum_func
+#' @family column_func
#' @aliases endsWith,Column-method
#'
#' @param x vector of character string whose "ends" are considered
@@ -220,7 +224,7 @@ setMethod("endsWith", signature(x = "Column"),
#'
#' @rdname between
#' @name between
-#' @family colum_func
+#' @family column_func
#' @aliases between,Column-method
#'
#' @param x a Column
@@ -247,7 +251,7 @@ setMethod("between", signature(x = "Column"),
# nolint end
#' @rdname cast
#' @name cast
-#' @family colum_func
+#' @family column_func
#' @aliases cast,Column-method
#'
#' @examples
@@ -296,7 +300,7 @@ setMethod("%in%",
#' Can be a single value or a Column.
#' @rdname otherwise
#' @name otherwise
-#' @family colum_func
+#' @family column_func
#' @aliases otherwise,Column-method
#' @note otherwise since 1.5.0
setMethod("otherwise",
@@ -356,3 +360,103 @@ setMethod("%<=>%",
#' }
#' @note ! since 2.3.0
setMethod("!", signature(x = "Column"), function(x) not(x))
+
+#' withField
+#'
+#' Adds/replaces field in a struct \code{Column} by name.
+#'
+#' @param x a Column
+#' @param fieldName a character
+#' @param col a Column expression
+#'
+#' @rdname withField
+#' @aliases withField withField,Column-method
+#' @examples
+#' \dontrun{
+#' df <- withColumn(
+#' createDataFrame(iris),
+#' "sepal",
+#' struct(column("Sepal_Width"), column("Sepal_Length"))
+#' )
+#'
+#' head(select(
+#' df,
+#' withField(df$sepal, "product", df$Sepal_Length * df$Sepal_Width)
+#' ))
+#' }
+#' @note withField since 3.1.0
+setMethod("withField",
+ signature(x = "Column", fieldName = "character", col = "Column"),
+ function(x, fieldName, col) {
+ jc <- callJMethod(x@jc, "withField", fieldName, col@jc)
+ column(jc)
+ })
+
+#' dropFields
+#'
+#' Drops fields in a struct \code{Column} by name.
+#'
+#' @param x a Column
+#' @param ... names of the fields to be dropped.
+#'
+#' @rdname dropFields
+#' @aliases dropFields dropFields,Column-method
+#' @examples
+#' \dontrun{
+#' df <- select(
+#' createDataFrame(iris),
+#' alias(
+#' struct(
+#' column("Sepal_Width"), column("Sepal_Length"),
+#' alias(
+#' struct(
+#' column("Petal_Width"), column("Petal_Length"),
+#' alias(
+#' column("Petal_Width") * column("Petal_Length"),
+#' "Petal_Product"
+#' )
+#' ),
+#' "Petal"
+#' )
+#' ),
+#' "dimensions"
+#' )
+#' )
+#' head(withColumn(df, "dimensions", dropFields(df$dimensions, "Petal")))
+#'
+#' head(
+#' withColumn(
+#' df, "dimensions",
+#' dropFields(df$dimensions, "Sepal_Width", "Sepal_Length")
+#' )
+#' )
+#'
+#' # This method supports dropping multiple nested fields directly e.g.
+#' head(
+#' withColumn(
+#' df, "dimensions",
+#' dropFields(df$dimensions, "Petal.Petal_Width", "Petal.Petal_Length")
+#' )
+#' )
+#'
+#' # However, if you are going to add/replace multiple nested fields,
+#' # it is preferred to extract out the nested struct before
+#' # adding/replacing multiple fields e.g.
+#' head(
+#' withColumn(
+#' df, "dimensions",
+#' withField(
+#' column("dimensions"),
+#' "Petal",
+#' dropFields(column("dimensions.Petal"), "Petal_Width", "Petal_Length")
+#' )
+#' )
+#' )
+#' }
+#' @note dropFields since 3.1.0
+setMethod("dropFields",
+ signature(x = "Column"),
+ function(x, ...) {
+ jc <- callJMethod(x@jc, "dropFields", list(...))
+ column(jc)
+ })
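Besides withField() and dropFields(), this hunk wires the new null-aware ordering helpers (asc_nulls_first, asc_nulls_last, desc_nulls_first, desc_nulls_last) into column_functions1. A minimal usage sketch, assuming an active SparkR session; the toy data frame is illustrative only:

```r
# Assumes sparkR.session() has already been called
df <- createDataFrame(data.frame(x = c("b", NA, "a"), stringsAsFactors = FALSE))

# Ascending sort with NULL (missing) values placed last
head(arrange(df, asc_nulls_last(df$x)))

# Descending sort with NULL values placed first
head(arrange(df, desc_nulls_first(df$x)))
```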
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index e3c9d9f8793d6..cca6c2c817de9 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -86,7 +86,7 @@ makeSplits <- function(numSerializedSlices, length) {
# For instance, for numSerializedSlices of 22, length of 50
# [1] 0 0 2 2 4 4 6 6 6 9 9 11 11 13 13 15 15 15 18 18 20 20 22 22 22
# [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47
- # Notice the slice group with 3 slices (ie. 6, 15, 22) are roughly evenly spaced.
+ # Notice the slice group with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced.
# We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD
if (numSerializedSlices > 0) {
unlist(lapply(0: (numSerializedSlices - 1), function(x) {
@@ -116,7 +116,7 @@ makeSplits <- function(numSerializedSlices, length) {
#' This change affects both createDataFrame and spark.lapply.
#' In the specific one case that it is used to convert R native object into SparkDataFrame, it has
#' always been kept at the default of 1. In the case the object is large, we are explicitly setting
-#' the parallism to numSlices (which is still 1).
+#' the parallelism to numSlices (which is still 1).
#'
#' Specifically, we are changing to split positions to match the calculation in positions() of
#' ParallelCollectionRDD in Spark.
diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index 3e7c456bd548d..89a8fbecd36b0 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -233,24 +233,13 @@ readMultipleObjectsWithKeys <- function(inputCon) {
readDeserializeInArrow <- function(inputCon) {
if (requireNamespace("arrow", quietly = TRUE)) {
- # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
- useAsTibble <- exists("as_tibble", envir = asNamespace("arrow"))
-
-
# Currently, there looks no way to read batch by batch by socket connection in R side,
# See ARROW-4512. Therefore, it reads the whole Arrow streaming-formatted binary at once
# for now.
dataLen <- readInt(inputCon)
arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big")
batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches()
-
- if (useAsTibble) {
- as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
- # Read all groupped batches. Tibble -> data.frame is cheap.
- lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
- } else {
- lapply(batches, function(batch) as.data.frame(batch))
- }
+ lapply(batches, function(batch) as.data.frame(batch))
} else {
stop("'arrow' package should be installed.")
}
@@ -261,7 +250,7 @@ readDeserializeWithKeysInArrow <- function(inputCon) {
keys <- readMultipleObjects(inputCon)
- # Read keys to map with each groupped batch later.
+ # Read keys to map with each grouped batch later.
list(keys = keys, data = data)
}
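With the `as_tibble` fallback removed, the Arrow path relies on `arrow::read_ipc_stream()` and `RecordBatchStreamReader` converting batches straight into data.frames, which is why DESCRIPTION now requires arrow >= 1.0.0. A standalone sketch of the same round trip, independent of Spark and assuming the arrow package is installed:

```r
library(arrow)

# Round-trip a data.frame through the Arrow IPC streaming format
tmp <- tempfile()
write_ipc_stream(mtcars, tmp)

# read_ipc_stream() yields an object as.data.frame() handles directly,
# which is what collect() now does on the SparkR side
head(as.data.frame(read_ipc_stream(tmp)))

# Batch-by-batch reading from a raw payload, mirroring readDeserializeInArrow()
payload <- readBin(tmp, raw(), file.size(tmp))
batches <- RecordBatchStreamReader$create(payload)$batches()
str(lapply(batches, as.data.frame), max.level = 1)
```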
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index da9ef1d6674bd..3ee1cd5b50b48 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -144,7 +144,7 @@ NULL
#' @param y Column to compute on.
#' @param pos In \itemize{
#' \item \code{locate}: a start position of search.
-#' \item \code{overlay}: a start postiton for replacement.
+#' \item \code{overlay}: a start position for replacement.
#' }
#' @param len In \itemize{
#' \item \code{lpad} the maximum length of each output result.
@@ -247,7 +247,7 @@ NULL
#' used to transform the data. The first argument is the key, the second argument
#' is the value.
#' }
-#' @param zero a \code{Column} used as the initial value in \code{array_aggregate}
+#' @param initialValue a \code{Column} used as the initial value in \code{array_aggregate}
#' @param merge a \code{function} a binary function \code{(Column, Column) -> Column}
#' used in \code{array_aggregate} to merge values (the second argument)
#' into accumulator (the first argument).
@@ -338,12 +338,79 @@ NULL
#' tmp <- mutate(df, dist = over(cume_dist(), ws), dense_rank = over(dense_rank(), ws),
#' lag = over(lag(df$mpg), ws), lead = over(lead(df$mpg, 1), ws),
#' percent_rank = over(percent_rank(), ws),
-#' rank = over(rank(), ws), row_number = over(row_number(), ws))
+#' rank = over(rank(), ws), row_number = over(row_number(), ws),
+#' nth_value = over(nth_value(df$mpg, 3), ws))
#' # Get ntile group id (1-4) for hp
#' tmp <- mutate(tmp, ntile = over(ntile(4), ws))
#' head(tmp)}
NULL
+#' ML functions for Column operations
+#'
+#' ML functions defined for \code{Column}.
+#'
+#' @param x Column to compute on.
+#' @param ... additional argument(s).
+#' @name column_ml_functions
+#' @rdname column_ml_functions
+#' @family ml functions
+#' @examples
+#' \dontrun{
+#' df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm")
+#' head(
+#' withColumn(
+#' withColumn(df, "array", vector_to_array(df$features)),
+#' "vector",
+#' array_to_vector(column("array"))
+#' )
+#' )
+#' }
+NULL
+
+#' Avro processing functions for Column operations
+#'
+#' Avro processing functions defined for \code{Column}.
+#'
+#' @param x Column to compute on.
+#' @param jsonFormatSchema character, the Avro schema in JSON string format.
+#' @param ... additional argument(s) passed as parser options.
+#' @name column_avro_functions
+#' @rdname column_avro_functions
+#' @family avro functions
+#' @note Avro is a built-in but external data source module since Spark 2.4.
+#' Please deploy the application as per
+#' \href{https://spark.apache.org/docs/latest/sql-data-sources-avro.html#deploying}{
+#' the deployment section
+#' } of "Apache Avro Data Source Guide".
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(iris)
+#' schema <- paste(
+#' c(
+#' '{"type": "record", "namespace": "example.avro", "name": "Iris", "fields": [',
+#' '{"type": ["double", "null"], "name": "Sepal_Length"},',
+#' '{"type": ["double", "null"], "name": "Sepal_Width"},',
+#' '{"type": ["double", "null"], "name": "Petal_Length"},',
+#' '{"type": ["double", "null"], "name": "Petal_Width"},',
+#' '{"type": ["string", "null"], "name": "Species"}]}'
+#' ),
+#' collapse="\\n"
+#' )
+#'
+#' df_serialized <- select(
+#' df,
+#' alias(to_avro(alias(struct(column("*")), "fields")), "payload")
+#' )
+#'
+#' df_deserialized <- select(
+#' df_serialized,
+#' from_avro(df_serialized$payload, schema)
+#' )
+#'
+#' head(df_deserialized)
+#' }
+NULL
+
#' @details
#' \code{lit}: A new Column is created to represent the literal value.
#' If the parameter is a Column, it is returned unchanged.
@@ -394,6 +461,19 @@ setMethod("acos",
column(jc)
})
+#' @details
+#' \code{acosh}: Computes inverse hyperbolic cosine of the input column.
+#'
+#' @rdname column_math_functions
+#' @aliases acosh acosh,Column-method
+#' @note acosh since 3.1.0
+setMethod("acosh",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "acosh", x@jc)
+ column(jc)
+ })
+
#' @details
#' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group.
#'
@@ -404,7 +484,7 @@ setMethod("acos",
#' \dontrun{
#' head(select(df, approx_count_distinct(df$gear)))
#' head(select(df, approx_count_distinct(df$gear, 0.02)))
-#' head(select(df, countDistinct(df$gear, df$cyl)))
+#' head(select(df, count_distinct(df$gear, df$cyl)))
#' head(select(df, n_distinct(df$gear)))
#' head(distinct(select(df, "gear")))}
#' @note approx_count_distinct(Column) since 3.0.0
@@ -461,6 +541,19 @@ setMethod("asin",
column(jc)
})
+#' @details
+#' \code{asinh}: Computes inverse hyperbolic sine of the input column.
+#'
+#' @rdname column_math_functions
+#' @aliases asinh asinh,Column-method
+#' @note asinh since 3.1.0
+setMethod("asinh",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "asinh", x@jc)
+ column(jc)
+ })
+
#' @details
#' \code{atan}: Returns the inverse tangent of the given value,
#' as if computed by \code{java.lang.Math.atan()}
@@ -475,6 +568,19 @@ setMethod("atan",
column(jc)
})
+#' @details
+#' \code{atanh}: Computes inverse hyperbolic tangent of the input column.
+#'
+#' @rdname column_math_functions
+#' @aliases atanh atanh,Column-method
+#' @note atanh since 3.1.0
+setMethod("atanh",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "atanh", x@jc)
+ column(jc)
+ })
+
#' avg
#'
#' Aggregate function: returns the average of the values in a group.
@@ -530,20 +636,33 @@ setMethod("bin",
})
#' @details
-#' \code{bitwiseNOT}: Computes bitwise NOT.
+#' \code{bitwise_not}: Computes bitwise NOT.
#'
#' @rdname column_nonaggregate_functions
-#' @aliases bitwiseNOT bitwiseNOT,Column-method
+#' @aliases bitwise_not bitwise_not,Column-method
#' @examples
#'
#' \dontrun{
-#' head(select(df, bitwiseNOT(cast(df$vs, "int"))))}
+#' head(select(df, bitwise_not(cast(df$vs, "int"))))}
+#' @note bitwise_not since 3.2.0
+setMethod("bitwise_not",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "bitwise_not", x@jc)
+ column(jc)
+ })
+
+#' @details
+#' \code{bitwiseNOT}: Computes bitwise NOT.
+#'
+#' @rdname column_nonaggregate_functions
+#' @aliases bitwiseNOT bitwiseNOT,Column-method
#' @note bitwiseNOT since 1.5.0
setMethod("bitwiseNOT",
signature(x = "Column"),
function(x) {
- jc <- callJStatic("org.apache.spark.sql.functions", "bitwiseNOT", x@jc)
- column(jc)
+ .Deprecated("bitwise_not")
+ bitwise_not(x)
})
#' @details
@@ -809,6 +928,57 @@ setMethod("xxhash64",
column(jc)
})
+#' @details
+#' \code{assert_true}: Returns null if the input column is true; throws an exception
+#' with the provided error message otherwise.
+#'
+#' @param errMsg (optional) The error message to be thrown.
+#'
+#' @rdname column_misc_functions
+#' @aliases assert_true assert_true,Column-method
+#' @examples
+#' \dontrun{
+#' tmp <- mutate(df, v1 = assert_true(df$vs < 2),
+#' v2 = assert_true(df$vs < 2, "custom error message"),
+#' v3 = assert_true(df$vs < 2, df$vs))
+#' head(tmp)}
+#' @note assert_true since 3.1.0
+setMethod("assert_true",
+ signature(x = "Column"),
+ function(x, errMsg = NULL) {
+ jc <- if (is.null(errMsg)) {
+ callJStatic("org.apache.spark.sql.functions", "assert_true", x@jc)
+ } else {
+ if (is.character(errMsg)) {
+ stopifnot(length(errMsg) == 1)
+ errMsg <- lit(errMsg)
+ }
+ callJStatic("org.apache.spark.sql.functions", "assert_true", x@jc, errMsg@jc)
+ }
+ column(jc)
+ })
+
+#' @details
+#' \code{raise_error}: Throws an exception with the provided error message.
+#'
+#' @rdname column_misc_functions
+#' @aliases raise_error raise_error,characterOrColumn-method
+#' @examples
+#' \dontrun{
+#' tmp <- mutate(df, v1 = raise_error("error message"))
+#' head(tmp)}
+#' @note raise_error since 3.1.0
+setMethod("raise_error",
+ signature(x = "characterOrColumn"),
+ function(x) {
+ if (is.character(x)) {
+ stopifnot(length(x) == 1)
+ x <- lit(x)
+ }
+ jc <- callJStatic("org.apache.spark.sql.functions", "raise_error", x@jc)
+ column(jc)
+ })
+
#' @details
#' \code{dayofmonth}: Extracts the day of the month as an integer from a
#' given date/timestamp/string.
@@ -1403,6 +1573,19 @@ setMethod("overlay",
column(jc)
})
+#' @details
+#' \code{product}: Returns the product of the values in a group.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases product product,Column-method
+#' @note product since 3.2.0
+setMethod("product",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "product", x@jc)
+ column(jc)
+ })
+
#' @details
#' \code{quarter}: Extracts the quarter as an integer from a given date/timestamp/string.
#'
@@ -1417,8 +1600,10 @@ setMethod("quarter",
})
#' @details
-#' \code{percentile_approx} Returns the approximate percentile value of
-#' numeric column at the given percentage.
+#' \code{percentile_approx}: Returns the approximate \code{percentile} of the numeric column
+#' \code{col}, which is the smallest value in the ordered \code{col} values (sorted from least to
+#' greatest) such that no more than \code{percentage} of \code{col} values is less than or equal
+#' to that value.
#'
#' @param percentage Numeric percentage at which percentile should be computed
#' All values should be between 0 and 1.
@@ -1778,21 +1963,34 @@ setMethod("sum",
})
#' @details
-#' \code{sumDistinct}: Returns the sum of distinct values in the expression.
+#' \code{sum_distinct}: Returns the sum of distinct values in the expression.
#'
#' @rdname column_aggregate_functions
-#' @aliases sumDistinct sumDistinct,Column-method
+#' @aliases sum_distinct sum_distinct,Column-method
#' @examples
#'
#' \dontrun{
-#' head(select(df, sumDistinct(df$gear)))
+#' head(select(df, sum_distinct(df$gear)))
#' head(distinct(select(df, "gear")))}
+#' @note sum_distinct since 3.2.0
+setMethod("sum_distinct",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "sum_distinct", x@jc)
+ column(jc)
+ })
+
+#' @details
+#' \code{sumDistinct}: Returns the sum of distinct values in the expression.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases sumDistinct sumDistinct,Column-method
#' @note sumDistinct since 1.4.0
setMethod("sumDistinct",
signature(x = "Column"),
function(x) {
- jc <- callJStatic("org.apache.spark.sql.functions", "sumDistinct", x@jc)
- column(jc)
+ .Deprecated("sum_distinct")
+ sum_distinct(x)
})
#' @details
@@ -2286,7 +2484,7 @@ setMethod("pmod", signature(y = "Column"),
column(jc)
})
-#' @param rsd maximum estimation error allowed (default = 0.05).
+#' @param rsd maximum relative standard deviation allowed (default = 0.05).
#'
#' @rdname column_aggregate_functions
#' @aliases approx_count_distinct,Column-method
@@ -2310,23 +2508,37 @@ setMethod("approxCountDistinct",
})
#' @details
-#' \code{countDistinct}: Returns the number of distinct items in a group.
+#' \code{count_distinct}: Returns the number of distinct items in a group.
#'
#' @rdname column_aggregate_functions
-#' @aliases countDistinct countDistinct,Column-method
-#' @note countDistinct since 1.4.0
-setMethod("countDistinct",
+#' @aliases count_distinct count_distinct,Column-method
+#' @note count_distinct since 3.2.0
+setMethod("count_distinct",
signature(x = "Column"),
function(x, ...) {
jcols <- lapply(list(...), function(x) {
stopifnot(class(x) == "Column")
x@jc
})
- jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc,
+ jc <- callJStatic("org.apache.spark.sql.functions", "count_distinct", x@jc,
jcols)
column(jc)
})
+#' @details
+#' \code{countDistinct}: Returns the number of distinct items in a group.
+#'
+#' An alias of \code{count_distinct}; using \code{count_distinct} directly is encouraged.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases countDistinct countDistinct,Column-method
+#' @note countDistinct since 1.4.0
+setMethod("countDistinct",
+ signature(x = "Column"),
+ function(x, ...) {
+ count_distinct(x, ...)
+ })
+
#' @details
#' \code{concat}: Concatenates multiple input columns together into a single column.
#' The function works with strings, binary and compatible array columns.
@@ -2391,7 +2603,7 @@ setMethod("least",
#' @note n_distinct since 1.4.0
setMethod("n_distinct", signature(x = "Column"),
function(x, ...) {
- countDistinct(x, ...)
+ count_distinct(x, ...)
})
#' @rdname count
@@ -2734,6 +2946,21 @@ setMethod("sha2", signature(y = "Column", x = "numeric"),
column(jc)
})
+#' @details
+#' \code{shiftleft}: Shifts the given value numBits left. If the given value is a long value,
+#' this function will return a long value else it will return an integer value.
+#'
+#' @rdname column_math_functions
+#' @aliases shiftleft shiftleft,Column,numeric-method
+#' @note shiftleft since 3.2.0
+setMethod("shiftleft", signature(y = "Column", x = "numeric"),
+ function(y, x) {
+ jc <- callJStatic("org.apache.spark.sql.functions",
+ "shiftleft",
+ y@jc, as.integer(x))
+ column(jc)
+ })
+
#' @details
#' \code{shiftLeft}: Shifts the given value numBits left. If the given value is a long value,
#' this function will return a long value else it will return an integer value.
@@ -2742,9 +2969,22 @@ setMethod("sha2", signature(y = "Column", x = "numeric"),
#' @aliases shiftLeft shiftLeft,Column,numeric-method
#' @note shiftLeft since 1.5.0
setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
+ function(y, x) {
+ .Deprecated("shiftleft")
+ shiftleft(y, x)
+ })
+
+#' @details
+#' \code{shiftright}: (Signed) shifts the given value numBits right. If the given value is a long
+#' value, it will return a long value else it will return an integer value.
+#'
+#' @rdname column_math_functions
+#' @aliases shiftright shiftright,Column,numeric-method
+#' @note shiftright since 3.2.0
+setMethod("shiftright", signature(y = "Column", x = "numeric"),
function(y, x) {
jc <- callJStatic("org.apache.spark.sql.functions",
- "shiftLeft",
+ "shiftright",
y@jc, as.integer(x))
column(jc)
})
@@ -2757,15 +2997,28 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
#' @aliases shiftRight shiftRight,Column,numeric-method
#' @note shiftRight since 1.5.0
setMethod("shiftRight", signature(y = "Column", x = "numeric"),
+ function(y, x) {
+ .Deprecated("shiftright")
+ shiftright(y, x)
+ })
+
+#' @details
+#' \code{shiftrightunsigned}: (Unsigned) shifts the given value numBits right. If the given value is
+#' a long value, it will return a long value else it will return an integer value.
+#'
+#' @rdname column_math_functions
+#' @aliases shiftrightunsigned shiftrightunsigned,Column,numeric-method
+#' @note shiftrightunsigned since 3.2.0
+setMethod("shiftrightunsigned", signature(y = "Column", x = "numeric"),
function(y, x) {
jc <- callJStatic("org.apache.spark.sql.functions",
- "shiftRight",
+ "shiftrightunsigned",
y@jc, as.integer(x))
column(jc)
})
#' @details
-#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is
+#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is
#' a long value, it will return a long value else it will return an integer value.
#'
#' @rdname column_math_functions
@@ -2773,10 +3026,8 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"),
#' @note shiftRightUnsigned since 1.5.0
setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"),
function(y, x) {
- jc <- callJStatic("org.apache.spark.sql.functions",
- "shiftRightUnsigned",
- y@jc, as.integer(x))
- column(jc)
+ .Deprecated("shiftrightunsigned")
+ shiftrightunsigned(y, x)
})
#' @details
@@ -3296,6 +3547,37 @@ setMethod("lead",
column(jc)
})
+#' @details
+#' \code{nth_value}: Window function: returns the value that is the \code{offset}th
+#' row of the window frame (counting from 1), and \code{null} if the size of the window
+#' frame is less than \code{offset} rows.
+#'
+#' @param offset a numeric indicating the number of the row to use as the value
+#' @param na.rm a logical indicating whether the Nth value should skip nulls when
+#' determining which row to use
+#'
+#' @rdname column_window_functions
+#' @aliases nth_value nth_value,characterOrColumn-method
+#' @note nth_value since 3.1.0
+setMethod("nth_value",
+ signature(x = "characterOrColumn", offset = "numeric"),
+ function(x, offset, na.rm = FALSE) {
+ x <- if (is.character(x)) {
+ column(x)
+ } else {
+ x
+ }
+ offset <- as.integer(offset)
+ jc <- callJStatic(
+ "org.apache.spark.sql.functions",
+ "nth_value",
+ x@jc,
+ offset,
+ na.rm
+ )
+ column(jc)
+ })
+
#' @details
#' \code{ntile}: Returns the ntile group id (from 1 to n inclusive) in an ordered window
#' partition. For example, if n is 4, the first quarter of the rows will get value 1, the second
@@ -3388,7 +3670,12 @@ unresolved_named_lambda_var <- function(...) {
"org.apache.spark.sql.Column",
newJObject(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
- list(...)
+ lapply(list(...), function(x) {
+ handledCallJStatic(
+ "org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
+ "freshVarName",
+ x)
+ })
)
)
column(jc)
@@ -3476,11 +3763,11 @@ invoke_higher_order_function <- function(name, cols, funs) {
#' @aliases array_aggregate array_aggregate,characterOrColumn,Column,function-method
#' @note array_aggregate since 3.1.0
setMethod("array_aggregate",
- signature(x = "characterOrColumn", zero = "Column", merge = "function"),
- function(x, zero, merge, finish = NULL) {
+ signature(x = "characterOrColumn", initialValue = "Column", merge = "function"),
+ function(x, initialValue, merge, finish = NULL) {
invoke_higher_order_function(
"ArrayAggregate",
- cols = list(x, zero),
+ cols = list(x, initialValue),
funs = if (is.null(finish)) {
list(merge)
} else {
@@ -4380,7 +4667,8 @@ setMethod("date_trunc",
})
#' @details
-#' \code{current_date}: Returns the current date as a date column.
+#' \code{current_date}: Returns the current date at the start of query evaluation as a date column.
+#' All calls of current_date within the same query return the same value.
#'
#' @rdname column_datetime_functions
#' @aliases current_date current_date,missing-method
@@ -4396,7 +4684,8 @@ setMethod("current_date",
})
#' @details
-#' \code{current_timestamp}: Returns the current timestamp as a timestamp column.
+#' \code{current_timestamp}: Returns the current timestamp at the start of query evaluation as
+#' a timestamp column. All calls of current_timestamp within the same query return the same value.
#'
#' @rdname column_datetime_functions
#' @aliases current_timestamp current_timestamp,missing-method
@@ -4407,3 +4696,115 @@ setMethod("current_timestamp",
jc <- callJStatic("org.apache.spark.sql.functions", "current_timestamp")
column(jc)
})
+
+#' @details
+#' \code{timestamp_seconds}: Creates a timestamp from the number of seconds since the UTC epoch.
+#'
+#' @rdname column_datetime_functions
+#' @aliases timestamp_seconds timestamp_seconds,Column-method
+#' @note timestamp_seconds since 3.1.0
+setMethod("timestamp_seconds",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic(
+ "org.apache.spark.sql.functions", "timestamp_seconds", x@jc
+ )
+ column(jc)
+ })
+
+#' @details
+#' \code{array_to_vector}: Converts a column of arrays of numeric type into
+#' a column of dense vectors in MLlib.
+#'
+#' @rdname column_ml_functions
+#' @aliases array_to_vector array_to_vector,Column-method
+#' @note array_to_vector since 3.1.0
+setMethod("array_to_vector",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic(
+ "org.apache.spark.ml.functions",
+ "array_to_vector",
+ x@jc
+ )
+ column(jc)
+ })
+
+#' @details
+#' \code{vector_to_array}: Converts a column of MLlib sparse/dense vectors into
+#' a column of dense arrays.
+#'
+#' @param dtype The data type of the output array. Valid values: "float64" or "float32".
+#'
+#' @rdname column_ml_functions
+#' @aliases vector_to_array vector_to_array,Column-method
+#' @note vector_to_array since 3.1.0
+setMethod("vector_to_array",
+ signature(x = "Column"),
+ function(x, dtype = c("float64", "float32")) {
+ dtype <- match.arg(dtype)
+ jc <- callJStatic(
+ "org.apache.spark.ml.functions",
+ "vector_to_array",
+ x@jc,
+ dtype
+ )
+ column(jc)
+ })
+
+#' @details
+#' \code{from_avro}: Converts a binary column of Avro format into its corresponding catalyst value.
+#' The specified schema must match the read data, otherwise the behavior is undefined:
+#' it may fail or return an arbitrary result.
+#' To deserialize the data with a compatible and evolved schema, the expected Avro schema can be
+#' set via the option avroSchema.
+#'
+#' @rdname column_avro_functions
+#' @aliases from_avro from_avro,Column-method
+#' @note from_avro since 3.1.0
+setMethod("from_avro",
+ signature(x = "characterOrColumn"),
+ function(x, jsonFormatSchema, ...) {
+ x <- if (is.character(x)) {
+ column(x)
+ } else {
+ x
+ }
+
+ options <- varargsToStrEnv(...)
+ jc <- callJStatic(
+ "org.apache.spark.sql.avro.functions", "from_avro",
+ x@jc,
+ jsonFormatSchema,
+ options
+ )
+ column(jc)
+ })
+
+#' @details
+#' \code{to_avro}: Converts a column into a binary column in Avro format.
+#'
+#' @rdname column_avro_functions
+#' @aliases to_avro to_avro,Column-method
+#' @note to_avro since 3.1.0
+setMethod("to_avro",
+ signature(x = "characterOrColumn"),
+ function(x, jsonFormatSchema = NULL) {
+ x <- if (is.character(x)) {
+ column(x)
+ } else {
+ x
+ }
+
+ jc <- if (is.null(jsonFormatSchema)) {
+ callJStatic("org.apache.spark.sql.avro.functions", "to_avro", x@jc)
+ } else {
+ callJStatic(
+ "org.apache.spark.sql.avro.functions",
+ "to_avro",
+ x@jc,
+ jsonFormatSchema
+ )
+ }
+ column(jc)
+ })
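Taken together, functions.R now exposes snake_case aggregate aliases, error-raising helpers, and timestamp_seconds. A brief usage sketch of these additions, assuming an active SparkR session; the epoch literal and error message are illustrative only:

```r
# Assumes sparkR.session() has already been called
df <- createDataFrame(mtcars)

# snake_case variants replace the now-deprecated camelCase aggregates
head(select(df, count_distinct(df$gear), sum_distinct(df$gear)))

# timestamp_seconds() builds a timestamp column from epoch seconds
head(select(df, timestamp_seconds(lit(1609459200))))

# assert_true()/raise_error() surface data-quality failures at query time
head(select(df, assert_true(df$mpg > 0, "mpg must be positive")))
```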
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 839c00cf21aeb..38ad5f742ca68 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -638,7 +638,7 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
#' @rdname unionByName
-setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })
+setGeneric("unionByName", function(x, y, ...) { standardGeneric("unionByName") })
#' @rdname unpersist
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
@@ -675,6 +675,12 @@ setGeneric("broadcast", function(x) { standardGeneric("broadcast") })
#' @rdname columnfunctions
setGeneric("asc", function(x) { standardGeneric("asc") })
+#' @rdname columnfunctions
+setGeneric("asc_nulls_first", function(x) { standardGeneric("asc_nulls_first") })
+
+#' @rdname columnfunctions
+setGeneric("asc_nulls_last", function(x) { standardGeneric("asc_nulls_last") })
+
#' @rdname between
setGeneric("between", function(x, bounds) { standardGeneric("between") })
@@ -689,6 +695,12 @@ setGeneric("contains", function(x, ...) { standardGeneric("contains") })
#' @rdname columnfunctions
setGeneric("desc", function(x) { standardGeneric("desc") })
+#' @rdname columnfunctions
+setGeneric("desc_nulls_first", function(x) { standardGeneric("desc_nulls_first") })
+
+#' @rdname columnfunctions
+setGeneric("desc_nulls_last", function(x) { standardGeneric("desc_nulls_last") })
+
#' @rdname endsWith
setGeneric("endsWith", function(x, suffix) { standardGeneric("endsWith") })
@@ -729,6 +741,12 @@ setGeneric("over", function(x, window) { standardGeneric("over") })
#' @rdname eq_null_safe
setGeneric("%<=>%", function(x, value) { standardGeneric("%<=>%") })
+#' @rdname withField
+setGeneric("withField", function(x, fieldName, col) { standardGeneric("withField") })
+
+#' @rdname dropFields
+setGeneric("dropFields", function(x, ...) { standardGeneric("dropFields") })
+
###################### WindowSpec Methods ##########################
#' @rdname partitionBy
@@ -762,7 +780,8 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCoun
#' @rdname column_collection_functions
#' @name NULL
-setGeneric("array_aggregate", function(x, zero, merge, ...) { standardGeneric("array_aggregate") })
+setGeneric("array_aggregate",
+ function(x, initialValue, merge, ...) { standardGeneric("array_aggregate") })
#' @rdname column_collection_functions
#' @name NULL
@@ -820,6 +839,10 @@ setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat")
#' @name NULL
setGeneric("array_sort", function(x) { standardGeneric("array_sort") })
+#' @rdname column_ml_functions
+#' @name NULL
+setGeneric("array_to_vector", function(x) { standardGeneric("array_to_vector") })
+
#' @rdname column_collection_functions
#' @name NULL
setGeneric("array_transform", function(x, f) { standardGeneric("array_transform") })
@@ -844,6 +867,10 @@ setGeneric("arrays_zip_with", function(x, y, f) { standardGeneric("arrays_zip_wi
#' @name NULL
setGeneric("ascii", function(x) { standardGeneric("ascii") })
+#' @rdname column_misc_functions
+#' @name NULL
+setGeneric("assert_true", function(x, errMsg = NULL) { standardGeneric("assert_true") })
+
#' @param x Column to compute on or a GroupedData object.
#' @param ... additional argument(s) when \code{x} is a GroupedData object.
#' @rdname avg
@@ -857,6 +884,10 @@ setGeneric("base64", function(x) { standardGeneric("base64") })
#' @name NULL
setGeneric("bin", function(x) { standardGeneric("bin") })
+#' @rdname column_nonaggregate_functions
+#' @name NULL
+setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
+
#' @rdname column_nonaggregate_functions
#' @name NULL
setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
@@ -896,6 +927,10 @@ setGeneric("concat_ws", function(sep, x, ...) { standardGeneric("concat_ws") })
#' @name NULL
setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") })
+#' @rdname column_aggregate_functions
+#' @name NULL
+setGeneric("count_distinct", function(x, ...) { standardGeneric("count_distinct") })
+
#' @rdname column_aggregate_functions
#' @name NULL
setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") })
@@ -928,7 +963,6 @@ setGeneric("current_date", function(x = "missing") { standardGeneric("current_da
#' @name NULL
setGeneric("current_timestamp", function(x = "missing") { standardGeneric("current_timestamp") })
-
#' @rdname column_datetime_diff_functions
#' @name NULL
setGeneric("datediff", function(y, x) { standardGeneric("datediff") })
@@ -993,6 +1027,10 @@ setGeneric("expr", function(x) { standardGeneric("expr") })
#' @name NULL
setGeneric("flatten", function(x) { standardGeneric("flatten") })
+#' @rdname column_avro_functions
+#' @name NULL
+setGeneric("from_avro", function(x, ...) { standardGeneric("from_avro") })
+
#' @rdname column_datetime_diff_functions
#' @name NULL
setGeneric("from_utc_timestamp", function(y, x) { standardGeneric("from_utc_timestamp") })
@@ -1161,6 +1199,10 @@ setGeneric("months_between", function(y, x, ...) { standardGeneric("months_betwe
#' @rdname count
setGeneric("n", function(x) { standardGeneric("n") })
+#' @rdname column_window_functions
+#' @name NULL
+setGeneric("nth_value", function(x, offset, ...) { standardGeneric("nth_value") })
+
#' @rdname column_nonaggregate_functions
#' @name NULL
setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") })
@@ -1209,10 +1251,18 @@ setGeneric("posexplode", function(x) { standardGeneric("posexplode") })
#' @name NULL
setGeneric("posexplode_outer", function(x) { standardGeneric("posexplode_outer") })
+#' @rdname column_aggregate_functions
+#' @name NULL
+setGeneric("product", function(x) { standardGeneric("product") })
+
#' @rdname column_datetime_functions
#' @name NULL
setGeneric("quarter", function(x) { standardGeneric("quarter") })
+#' @rdname column_misc_functions
+#' @name NULL
+setGeneric("raise_error", function(x) { standardGeneric("raise_error") })
+
#' @rdname column_nonaggregate_functions
#' @name NULL
setGeneric("rand", function(seed) { standardGeneric("rand") })
@@ -1286,14 +1336,26 @@ setGeneric("sha2", function(y, x) { standardGeneric("sha2") })
#' @name NULL
setGeneric("shiftLeft", function(y, x) { standardGeneric("shiftLeft") })
+#' @rdname column_math_functions
+#' @name NULL
+setGeneric("shiftleft", function(y, x) { standardGeneric("shiftleft") })
+
#' @rdname column_math_functions
#' @name NULL
setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") })
+#' @rdname column_math_functions
+#' @name NULL
+setGeneric("shiftright", function(y, x) { standardGeneric("shiftright") })
+
#' @rdname column_math_functions
#' @name NULL
setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") })
+#' @rdname column_math_functions
+#' @name NULL
+setGeneric("shiftrightunsigned", function(y, x) { standardGeneric("shiftrightunsigned") })
+
#' @rdname column_collection_functions
#' @name NULL
setGeneric("shuffle", function(x) { standardGeneric("shuffle") })
@@ -1350,10 +1412,22 @@ setGeneric("struct", function(x, ...) { standardGeneric("struct") })
#' @name NULL
setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") })
+#' @rdname column_aggregate_functions
+#' @name NULL
+setGeneric("sum_distinct", function(x) { standardGeneric("sum_distinct") })
+
#' @rdname column_aggregate_functions
#' @name NULL
setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })
+#' @rdname column_datetime_functions
+#' @name timestamp_seconds
+setGeneric("timestamp_seconds", function(x) { standardGeneric("timestamp_seconds") })
+
+#' @rdname column_avro_functions
+#' @name NULL
+setGeneric("to_avro", function(x, ...) { standardGeneric("to_avro") })
+
#' @rdname column_collection_functions
#' @name NULL
setGeneric("transform_keys", function(x, f) { standardGeneric("transform_keys") })
@@ -1438,6 +1512,10 @@ setGeneric("var_pop", function(x) { standardGeneric("var_pop") })
#' @name NULL
setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
+#' @rdname column_ml_functions
+#' @name NULL
+setGeneric("vector_to_array", function(x, ...) { standardGeneric("vector_to_array") })
+
#' @rdname column_datetime_functions
#' @name NULL
setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index ea2c0b4c0f42f..bbb9188cd083f 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -39,11 +39,11 @@
#' version number in the format of "x.y" where x and y are integer.
#' If \code{hadoopVersion = "without"}, "Hadoop free" build is installed.
#' See
-#' \href{http://spark.apache.org/docs/latest/hadoop-provided.html}{
+#' \href{https://spark.apache.org/docs/latest/hadoop-provided.html}{
#' "Hadoop Free" Build} for more information.
#' Other patched version names can also be used, e.g. \code{"cdh4"}
#' @param mirrorUrl base URL of the repositories to use. The directory layout should follow
-#' \href{http://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}.
+#' \href{https://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}.
#' @param localDir a local directory where Spark is installed. The directory contains
#' version-specific folders of Spark packages. Default is path to
#' the cache directory:
@@ -64,7 +64,7 @@
#'}
#' @note install.spark since 2.1.0
#' @seealso See available Hadoop versions:
-#' \href{http://spark.apache.org/downloads.html}{Apache Spark}
+#' \href{https://spark.apache.org/downloads.html}{Apache Spark}
install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
localDir = NULL, overwrite = FALSE) {
sparkHome <- Sys.getenv("SPARK_HOME")
@@ -289,7 +289,7 @@ sparkCachePath <- function() {
}
# Length of the Spark cache specific relative path segments for each platform
-# eg. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix
+# e.g. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix
# Must match sparkCachePath() exactly.
sparkCacheRelPathLength <- function() {
if (is_windows()) {
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
index ec83b6bd406a7..093467ecf7d28 100644
--- a/R/pkg/R/mllib_classification.R
+++ b/R/pkg/R/mllib_classification.R
@@ -425,7 +425,7 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#' Only categorical data is supported.
#' For more details, see
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html}{
#' Multilayer Perceptron}
#'
#' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
diff --git a/R/pkg/R/mllib_fpm.R b/R/pkg/R/mllib_fpm.R
index 0cc7a16c302dc..65a43514930f0 100644
--- a/R/pkg/R/mllib_fpm.R
+++ b/R/pkg/R/mllib_fpm.R
@@ -122,11 +122,12 @@ setMethod("spark.freqItemsets", signature(object = "FPGrowthModel"),
# Get association rules.
#' @return A \code{SparkDataFrame} with association rules.
-#' The \code{SparkDataFrame} contains four columns:
+#' The \code{SparkDataFrame} contains five columns:
#' \code{antecedent} (an array of the same type as the input column),
#' \code{consequent} (an array of the same type as the input column),
-#' \code{condfidence} (confidence for the rule)
-#' and \code{lift} (lift for the rule)
+#' \code{confidence} (confidence for the rule),
+#' \code{lift} (lift for the rule),
+#' and \code{support} (support for the rule).
#' @rdname spark.fpGrowth
#' @aliases associationRules,FPGrowthModel-method
#' @note spark.associationRules(FPGrowthModel) since 2.2.0
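For reference, a small sketch of inspecting the five documented columns, assuming an active SparkR session; the toy transactions and thresholds are illustrative only:

```r
# Assumes sparkR.session() has already been called
items <- selectExpr(
  createDataFrame(data.frame(raw = c("a,b", "a,b,c", "b"), stringsAsFactors = FALSE)),
  "split(raw, ',') AS items"
)
model <- spark.fpGrowth(items, minSupport = 0.5, minConfidence = 0.5)

# Columns: antecedent, consequent, confidence, lift, support
head(spark.associationRules(model))
```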
diff --git a/R/pkg/R/mllib_recommendation.R b/R/pkg/R/mllib_recommendation.R
index d238ff93ed245..87a1bc991f812 100644
--- a/R/pkg/R/mllib_recommendation.R
+++ b/R/pkg/R/mllib_recommendation.R
@@ -30,7 +30,7 @@ setClass("ALSModel", representation(jobj = "jobj"))
#' to make predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#'
#' For more details, see
-#' \href{http://spark.apache.org/docs/latest/ml-collaborative-filtering.html}{MLlib:
+#' \href{https://spark.apache.org/docs/latest/ml-collaborative-filtering.html}{MLlib:
#' Collaborative Filtering}.
#'
#' @param data a SparkDataFrame for training.
diff --git a/R/pkg/R/mllib_stat.R b/R/pkg/R/mllib_stat.R
index 6db4d5d4831dd..f82fb589bb5a5 100644
--- a/R/pkg/R/mllib_stat.R
+++ b/R/pkg/R/mllib_stat.R
@@ -49,7 +49,7 @@ setClass("KSTest", representation(jobj = "jobj"))
#' @rdname spark.kstest
#' @aliases spark.kstest,SparkDataFrame-method
#' @name spark.kstest
-#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
+#' @seealso \href{https://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
#' MLlib: Hypothesis Testing}
#' @examples
#' \dontrun{
diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R
index f6aa48f5fa04a..f3192ee9b1382 100644
--- a/R/pkg/R/mllib_tree.R
+++ b/R/pkg/R/mllib_tree.R
@@ -53,7 +53,7 @@ setClass("DecisionTreeRegressionModel", representation(jobj = "jobj"))
#' @note DecisionTreeClassificationModel since 2.3.0
setClass("DecisionTreeClassificationModel", representation(jobj = "jobj"))
-# Create the summary of a tree ensemble model (eg. Random Forest, GBT)
+# Create the summary of a tree ensemble model (e.g. Random Forest, GBT)
summary.treeEnsemble <- function(model) {
jobj <- model@jobj
formula <- callJMethod(jobj, "formula")
@@ -73,7 +73,7 @@ summary.treeEnsemble <- function(model) {
jobj = jobj)
}
-# Prints the summary of tree ensemble models (eg. Random Forest, GBT)
+# Prints the summary of tree ensemble models (e.g. Random Forest, GBT)
print.summary.treeEnsemble <- function(x) {
jobj <- x$jobj
cat("Formula: ", x$formula)
@@ -127,9 +127,9 @@ print.summary.decisionTree <- function(x) {
#' \code{write.ml}/\code{read.ml} to save/load fitted models.
#' For more details, see
# nolint start
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{
#' GBT Regression} and
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{
#' GBT Classification}
# nolint end
#'
@@ -343,9 +343,9 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara
#' save/load fitted models.
#' For more details, see
# nolint start
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{
#' Random Forest Regression} and
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{
#' Random Forest Classification}
# nolint end
#'
@@ -568,9 +568,9 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path
#' save/load fitted models.
#' For more details, see
# nolint start
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-regression}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-regression}{
#' Decision Tree Regression} and
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier}{
+#' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier}{
#' Decision Tree Classification}
# nolint end
#'
diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R
index f38f1ac3a6b4c..d943d8d0ab4c0 100644
--- a/R/pkg/R/mllib_utils.R
+++ b/R/pkg/R/mllib_utils.R
@@ -18,7 +18,7 @@
# mllib_utils.R: Utilities for MLlib integration
# Integration with R's standard functions.
-# Most of MLlib's argorithms are provided in two flavours:
+# Most of MLlib's algorithms are provided in two flavours:
# - a specialization of the default R methods (glm). These methods try to respect
# the inputs and the outputs of R's method to the largest extent, but some small differences
# may exist.
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index b29381bb900fb..41676be03e951 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -239,7 +239,7 @@ setMethod("partitionByRDD",
javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner)
# Call .values() on the result to get back the final result, the
- # shuffled acutal content key-val pairs.
+ # shuffled actual content key-val pairs.
r <- callJMethod(javaPairRDD, "values")
RDD(r, serializedMode = "byte")
@@ -411,7 +411,7 @@ setMethod("reduceByKeyLocally",
#' \itemize{
#' \item createCombiner, which turns a V into a C (e.g., creates a one-element list)
#' \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list) -
-#' \item mergeCombiners, to combine two C's into a single one (e.g., concatentates
+#' \item mergeCombiners, to combine two C's into a single one (e.g., concatenates
#' two lists).
#' }
#'
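
The three combiner functions documented above can be sketched in plain R without Spark; the list-based accumulator below only illustrates the contract (create a C from a V, fold a V into a C, merge two C's) and is not part of the patch.

```r
createCombiner <- function(v) list(v)                 # V -> C (one-element list)
mergeValue     <- function(comb, v) c(comb, list(v))  # fold a V into an existing C
mergeCombiners <- function(c1, c2) c(c1, c2)          # concatenate two partial C's

# Values for one key arriving in two partitions
part1 <- Reduce(mergeValue, list(2, 3), createCombiner(1))
part2 <- createCombiner(4)
mergeCombiners(part1, part2)  # list(1, 2, 3, 4)
```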
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 7252351ebebb2..0aabceef226e3 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -109,7 +109,8 @@ setMethod("corr",
#'
#' Finding frequent items for columns, possibly with false positives.
#' Using the frequent element count algorithm described in
-#' \url{https://doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
+#' \url{https://dl.acm.org/doi/10.1145/762471.762473}, proposed by Karp, Schenker,
+#' and Papadimitriou.
#'
#' @param x A SparkDataFrame.
#' @param cols A vector column names to search frequent items in.
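
A hedged usage sketch for freqItems as documented above; the tiny data frame and the 0.5 support threshold are assumptions for illustration only.

```r
library(SparkR)
sparkR.session()  # assumes a local Spark installation is available

df <- createDataFrame(data.frame(a = c(1, 1, 1, 2), b = c("x", "x", "y", "x")))

# Items appearing in at least 50% of rows; as noted above, the result can
# contain false positives.
collect(freqItems(df, c("a", "b"), support = 0.5))
```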
diff --git a/R/pkg/R/streaming.R b/R/pkg/R/streaming.R
index 5eccbdc9d3818..2bcfb363f9d24 100644
--- a/R/pkg/R/streaming.R
+++ b/R/pkg/R/streaming.R
@@ -93,7 +93,7 @@ setMethod("explain",
#' lastProgress
#'
-#' Prints the most recent progess update of this streaming query in JSON format.
+#' Prints the most recent progress update of this streaming query in JSON format.
#'
#' @param x a StreamingQuery.
#' @rdname lastProgress
diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
index 5d48a9eee2799..dfa83c35665ce 100644
--- a/R/pkg/R/types.R
+++ b/R/pkg/R/types.R
@@ -68,7 +68,7 @@ rToSQLTypes <- as.environment(list(
"character" = "string",
"logical" = "boolean"))
-# Helper function of coverting decimal type. When backend returns column type in the
+# Helper function of converting decimal type. When backend returns column type in the
# format of decimal(,) (e.g., decimal(10, 0)), this function coverts the column type
# as double type. This function converts backend returned types that are not the key
# of PRIMITIVE_TYPES, but should be treated as PRIMITIVE_TYPES.
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 65db9c21d9dbb..264cbfc9ba929 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -376,6 +376,7 @@ varargsToStrEnv <- function(...) {
getStorageLevel <- function(newLevel = c("DISK_ONLY",
"DISK_ONLY_2",
+ "DISK_ONLY_3",
"MEMORY_AND_DISK",
"MEMORY_AND_DISK_2",
"MEMORY_AND_DISK_SER",
@@ -390,6 +391,7 @@ getStorageLevel <- function(newLevel = c("DISK_ONLY",
storageLevel <- switch(newLevel,
"DISK_ONLY" = callJStatic(storageLevelClass, "DISK_ONLY"),
"DISK_ONLY_2" = callJStatic(storageLevelClass, "DISK_ONLY_2"),
+ "DISK_ONLY_3" = callJStatic(storageLevelClass, "DISK_ONLY_3"),
"MEMORY_AND_DISK" = callJStatic(storageLevelClass, "MEMORY_AND_DISK"),
"MEMORY_AND_DISK_2" = callJStatic(storageLevelClass, "MEMORY_AND_DISK_2"),
"MEMORY_AND_DISK_SER" = callJStatic(storageLevelClass,
@@ -415,6 +417,8 @@ storageLevelToString <- function(levelObj) {
"DISK_ONLY"
} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 2) {
"DISK_ONLY_2"
+ } else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 3) {
+ "DISK_ONLY_3"
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 1) {
"MEMORY_ONLY"
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 2) {
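
The DISK_ONLY_3 level wired up above maps to the user-facing persist API roughly as follows; this is a sketch that assumes a Spark build (3.1+) in which StorageLevel.DISK_ONLY_3 exists.

```r
library(SparkR)
sparkR.session()  # assumes a local Spark installation that has DISK_ONLY_3

df <- createDataFrame(mtcars)
df <- persist(df, "DISK_ONLY_3")  # disk only, replicated 3x
storageLevel(df)                  # rendered via storageLevelToString()
unpersist(df)
```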
@@ -529,7 +533,10 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
# Namespaces other than "SparkR" will not be searched.
if (!isNamespace(func.env) ||
(getNamespaceName(func.env) == "SparkR" &&
- !(nodeChar %in% getNamespaceExports("SparkR")))) {
+ !(nodeChar %in% getNamespaceExports("SparkR")) &&
+ # Note that generic S4 methods should not be set to the environment of the
+ # cleaned closure; this does not work with R 4.0.0+. See also SPARK-31918.
+ nodeChar != "" && !methods::isGeneric(nodeChar, func.env))) {
# Only include SparkR internals.
# Set parameter 'inherits' to FALSE since we do not need to search in
@@ -923,7 +930,7 @@ getOne <- function(x, envir, inherits = TRUE, ifnotfound = NULL) {
}
# Returns a vector of parent directories, traversing up count times, starting with a full path
-# eg. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return
+# e.g. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return
# this "/Users/user/Library/Caches/spark/spark2.2"
# and "/Users/user/Library/Caches/spark"
traverseParentDirs <- function(x, count) {
diff --git a/R/pkg/inst/profile/general.R b/R/pkg/inst/profile/general.R
index 3efb460846fc2..8c75c19ca7ac3 100644
--- a/R/pkg/inst/profile/general.R
+++ b/R/pkg/inst/profile/general.R
@@ -16,10 +16,6 @@
#
.First <- function() {
- if (utils::compareVersion(paste0(R.version$major, ".", R.version$minor), "3.4.0") == -1) {
- warning("Support for R prior to version 3.4 is deprecated since Spark 3.0.0")
- }
-
packageDir <- Sys.getenv("SPARKR_PACKAGE_DIR")
dirs <- strsplit(packageDir, ",")[[1]]
.libPaths(c(dirs, .libPaths()))
diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
index e4e0d032997de..ffedb3038fd53 100644
--- a/R/pkg/inst/profile/shell.R
+++ b/R/pkg/inst/profile/shell.R
@@ -16,10 +16,6 @@
#
.First <- function() {
- if (utils::compareVersion(paste0(R.version$major, ".", R.version$minor), "3.4.0") == -1) {
- warning("Support for R prior to version 3.4 is deprecated since Spark 3.0.0")
- }
-
home <- Sys.getenv("SPARK_HOME")
.libPaths(c(file.path(home, "R", "lib"), .libPaths()))
Sys.setenv(NOAWT = 1)
@@ -47,5 +43,7 @@
cat(" /_/", "\n")
cat("\n")
- cat("\nSparkSession available as 'spark'.\n")
+ cat("\nSparkSession Web UI available at", SparkR::sparkR.uiWebUrl())
+ cat("\nSparkSession available as 'spark'(master = ", unlist(SparkR::sparkR.conf("spark.master")),
+ ", app id = ", unlist(SparkR::sparkR.conf("spark.app.id")), ").", "\n", sep = "")
}
diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R
index fb9db63b07cd0..4589bb9c6ad1b 100644
--- a/R/pkg/inst/worker/daemon.R
+++ b/R/pkg/inst/worker/daemon.R
@@ -32,7 +32,7 @@ inputCon <- socketConnection(
SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET"))
-# Waits indefinitely for a socket connecion by default.
+# Waits indefinitely for a socket connection by default.
selectTimeout <- NULL
while (TRUE) {
@@ -72,7 +72,7 @@ while (TRUE) {
}
})
} else if (is.null(children)) {
- # If it is NULL, there are no children. Waits indefinitely for a socket connecion.
+ # If it is NULL, there are no children. Waits indefinitely for a socket connection.
selectTimeout <- NULL
}
diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R
index 1ef05ea621e83..7fc4680bad10e 100644
--- a/R/pkg/inst/worker/worker.R
+++ b/R/pkg/inst/worker/worker.R
@@ -85,7 +85,7 @@ outputResult <- function(serializer, output, outputCon) {
}
# Constants
-specialLengths <- list(END_OF_STERAM = 0L, TIMING_DATA = -1L)
+specialLengths <- list(END_OF_STREAM = 0L, TIMING_DATA = -1L)
# Timing R process boot
bootTime <- currentTimeSecs()
@@ -180,7 +180,7 @@ if (isEmpty != 0) {
} else if (deserializer == "arrow" && mode == 1) {
data <- SparkR:::readDeserializeInArrow(inputCon)
# See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html
- # rbind.fill might be an anternative to make it faster if plyr is installed.
+ # rbind.fill might be an alternative to make it faster if plyr is installed.
# Also, note that, 'dapply' applies a function to each partition.
data <- do.call("rbind", data)
}
@@ -196,7 +196,7 @@ if (isEmpty != 0) {
outputs <- list()
for (i in seq_len(length(data))) {
# Timing reading input data for execution
- inputElap <- elapsedSecs()
+ computeStart <- elapsedSecs()
output <- compute(mode, partition, serializer, deserializer, keys[[i]],
colNames, computeFunc, data[[i]])
computeElap <- elapsedSecs()
@@ -204,17 +204,18 @@ if (isEmpty != 0) {
outputs[[length(outputs) + 1L]] <- output
} else {
outputResult(serializer, output, outputCon)
+ outputComputeElapsDiff <- outputComputeElapsDiff + (elapsedSecs() - computeElap)
}
- outputElap <- elapsedSecs()
- computeInputElapsDiff <- computeInputElapsDiff + (computeElap - inputElap)
- outputComputeElapsDiff <- outputComputeElapsDiff + (outputElap - computeElap)
+ computeInputElapsDiff <- computeInputElapsDiff + (computeElap - computeStart)
}
if (serializer == "arrow") {
# See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html
- # rbind.fill might be an anternative to make it faster if plyr is installed.
+ # rbind.fill might be an alternative to make it faster if plyr is installed.
+ outputStart <- elapsedSecs()
combined <- do.call("rbind", outputs)
SparkR:::writeSerializeInArrow(outputCon, combined)
+ outputComputeElapsDiff <- elapsedSecs() - outputStart
}
}
} else {
@@ -285,7 +286,7 @@ SparkR:::writeDouble(outputCon, computeInputElapsDiff) # compute
SparkR:::writeDouble(outputCon, outputComputeElapsDiff) # output
# End of output
-SparkR:::writeInt(outputCon, specialLengths$END_OF_STERAM)
+SparkR:::writeInt(outputCon, specialLengths$END_OF_STREAM)
close(outputCon)
close(inputCon)
diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R
index e01f6ee005218..a52289e43ca5e 100644
--- a/R/pkg/tests/fulltests/test_Serde.R
+++ b/R/pkg/tests/fulltests/test_Serde.R
@@ -125,7 +125,7 @@ test_that("SerDe of list of lists", {
sparkR.session.stop()
-# Note that this test should be at the end of tests since the configruations used here are not
+# Note that this test should be at the end of tests since the configurations used here are not
# specific to sessions, and the Spark context is restarted.
test_that("createDataFrame large objects", {
for (encryptionEnabled in list("true", "false")) {
diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R
index 6be04b321e985..1add5a9fdde44 100644
--- a/R/pkg/tests/fulltests/test_context.R
+++ b/R/pkg/tests/fulltests/test_context.R
@@ -26,7 +26,9 @@ test_that("Check masked functions", {
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
"summary", "transform", "drop", "window", "as.data.frame", "union", "not")
version <- packageVersion("base")
- if (as.numeric(version$major) >= 3 && as.numeric(version$minor) >= 3) {
+ is33Above <- as.numeric(version$major) >= 3 && as.numeric(version$minor) >= 3
+ is40Above <- as.numeric(version$major) >= 4
+ if (is33Above || is40Above) {
namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
}
masked <- conflicts(detail = TRUE)$`package:SparkR`
@@ -137,7 +139,7 @@ test_that("utility function can be called", {
expect_true(TRUE)
})
-test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
+test_that("getClientModeSparkSubmitOpts() returns spark-submit args from allowList", {
e <- new.env()
e[["spark.driver.memory"]] <- "512m"
ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
diff --git a/R/pkg/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R
index 8b3b4f73de170..3bf6ae556c079 100644
--- a/R/pkg/tests/fulltests/test_jvm_api.R
+++ b/R/pkg/tests/fulltests/test_jvm_api.R
@@ -20,11 +20,11 @@ context("JVM API")
sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
test_that("Create and call methods on object", {
- jarr <- sparkR.newJObject("java.util.ArrayList")
+ jarray <- sparkR.newJObject("java.util.ArrayList")
# Add an element to the array
- sparkR.callJMethod(jarr, "add", 1L)
+ sparkR.callJMethod(jarray, "add", 1L)
# Check if get returns the same element
- expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L)
+ expect_equal(sparkR.callJMethod(jarray, "get", 0L), 1L)
})
test_that("Call static methods", {
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
index 9dd275a173089..20339c947d7bf 100644
--- a/R/pkg/tests/fulltests/test_mllib_classification.R
+++ b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -34,18 +34,18 @@ test_that("spark.svmLinear", {
summary <- summary(model)
# test summary coefficients return matrix type
- expect_true(class(summary$coefficients) == "matrix")
+ expect_true(any(class(summary$coefficients) == "matrix"))
expect_true(class(summary$coefficients[, 1]) == "numeric")
coefs <- summary$coefficients[, "Estimate"]
- expected_coefs <- c(-0.06004978, -0.1563083, -0.460648, 0.2276626, 1.055085)
+ expected_coefs <- c(-6.8823988, -0.6154984, -1.5135447, 1.9694126, 3.3736856)
expect_true(all(abs(coefs - expected_coefs) < 0.1))
# Test prediction with string label
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
- expected <- c("versicolor", "versicolor", "versicolor", "virginica", "virginica",
- "virginica", "virginica", "virginica", "virginica", "virginica")
+ expected <- c("versicolor", "versicolor", "versicolor", "versicolor", "versicolor",
+ "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
# Test model save and load
@@ -130,7 +130,7 @@ test_that("spark.logit", {
summary <- summary(model)
# test summary coefficients return matrix type
- expect_true(class(summary$coefficients) == "matrix")
+ expect_true(any(class(summary$coefficients) == "matrix"))
expect_true(class(summary$coefficients[, 1]) == "numeric")
versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
@@ -242,8 +242,8 @@ test_that("spark.logit", {
# Test binomial logistic regression against two classes with upperBoundsOnCoefficients
# and upperBoundsOnIntercepts
u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
- model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
- upperBoundsOnIntercepts = 1.0)
+ model <- suppressWarnings(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
+ upperBoundsOnIntercepts = 1.0))
summary <- summary(model)
coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
coefs <- summary$coefficients[, "Estimate"]
@@ -255,8 +255,8 @@ test_that("spark.logit", {
# Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
# and lowerBoundsOnIntercepts
l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
- model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
- lowerBoundsOnIntercepts = 0.0)
+ model <- suppressWarnings(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
+ lowerBoundsOnIntercepts = 0.0))
summary <- summary(model)
coefsR <- c(0, 0, -1, 0, 1.902192)
coefs <- summary$coefficients[, "Estimate"]
@@ -268,9 +268,9 @@ test_that("spark.logit", {
# Test multinomial logistic regression with lowerBoundsOnCoefficients
# and lowerBoundsOnIntercepts
l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
- model <- spark.logit(training, Species ~ ., family = "multinomial",
- lowerBoundsOnCoefficients = l,
- lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
+ model <- suppressWarnings(spark.logit(training, Species ~ ., family = "multinomial",
+ lowerBoundsOnCoefficients = l,
+ lowerBoundsOnIntercepts = as.array(c(0.0, 0.0))))
summary <- summary(model)
versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R
index 028ad574b8134..f180aeea28150 100644
--- a/R/pkg/tests/fulltests/test_mllib_clustering.R
+++ b/R/pkg/tests/fulltests/test_mllib_clustering.R
@@ -171,7 +171,7 @@ test_that("spark.kmeans", {
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
# test summary coefficients return matrix type
- expect_true(class(summary.model$coefficients) == "matrix")
+ expect_true(any(class(summary.model$coefficients) == "matrix"))
expect_true(class(summary.model$coefficients[1, ]) == "numeric")
# Test model save/load
diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R b/R/pkg/tests/fulltests/test_mllib_fpm.R
index bc1e17538d41a..78d26d3324473 100644
--- a/R/pkg/tests/fulltests/test_mllib_fpm.R
+++ b/R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -45,7 +45,8 @@ test_that("spark.fpGrowth", {
antecedent = I(list(list("2"), list("3"))),
consequent = I(list(list("1"), list("1"))),
confidence = c(1, 1),
- lift = c(1, 1)
+ lift = c(1, 1),
+ support = c(0.75, 0.5)
)
expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R b/R/pkg/tests/fulltests/test_mllib_regression.R
index 0f2a62b7229ca..b281cd6235ef0 100644
--- a/R/pkg/tests/fulltests/test_mllib_regression.R
+++ b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -116,7 +116,7 @@ test_that("spark.glm summary", {
rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
# test summary coefficients return matrix type
- expect_true(class(stats$coefficients) == "matrix")
+ expect_true(any(class(stats$coefficients) == "matrix"))
expect_true(class(stats$coefficients[, 1]) == "numeric")
coefs <- stats$coefficients
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 611d9057c0f13..30daa2064355d 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1397,7 +1397,8 @@ test_that("column operators", {
test_that("column functions", {
c <- column("a")
c1 <- abs(c) + acos(c) + approx_count_distinct(c) + ascii(c) + asin(c) + atan(c)
- c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
+ c2 <- avg(c) + base64(c) + bin(c) + suppressWarnings(bitwiseNOT(c)) +
+ bitwise_not(c) + cbrt(c) + ceil(c) + cos(c)
c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
@@ -1405,7 +1406,8 @@ test_that("column functions", {
c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
- c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c)
+ c10 <- suppressWarnings(sumDistinct(c)) + sum_distinct(c) + tan(c) + tanh(c) +
+ degrees(c) + radians(c)
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
c12 <- variance(c) + xxhash64(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
@@ -1424,6 +1426,15 @@ test_that("column functions", {
date_trunc("quarter", c) + current_date() + current_timestamp()
c25 <- overlay(c1, c2, c3, c3) + overlay(c1, c2, c3) + overlay(c1, c2, 1) +
overlay(c1, c2, 3, 4)
+ c26 <- timestamp_seconds(c1) + vector_to_array(c) +
+ vector_to_array(c, "float32") + vector_to_array(c, "float64") +
+ array_to_vector(c)
+ c27 <- nth_value("x", 1L) + nth_value("y", 2, TRUE) +
+ nth_value(column("v"), 3) + nth_value(column("z"), 4L, FALSE)
+ c28 <- asc_nulls_first(c1) + asc_nulls_last(c1) +
+ desc_nulls_first(c1) + desc_nulls_last(c1)
+ c29 <- acosh(c1) + asinh(c1) + atanh(c1)
+ c30 <- product(c1) + product(c1 * 0.5)
# Test if base::is.nan() is exposed
expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
@@ -1449,6 +1460,8 @@ test_that("column functions", {
expect_equal(collect(df3)[[2, 1]], FALSE)
expect_equal(collect(df3)[[3, 1]], TRUE)
+ df4 <- select(df, count_distinct(df$age, df$name))
+ expect_equal(collect(df4)[[1, 1]], 2)
df4 <- select(df, countDistinct(df$age, df$name))
expect_equal(collect(df4)[[1, 1]], 2)
@@ -1676,9 +1689,9 @@ test_that("column functions", {
df <- as.DataFrame(list(list("col" = "1")))
c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
- expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+ expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
- expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+ expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
# Test to_json(), from_json(), schema_of_json()
df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
@@ -1711,9 +1724,9 @@ test_that("column functions", {
df <- as.DataFrame(list(list("col" = "1")))
c <- collect(select(df, schema_of_json('{"name":"Bob"}')))
- expect_equal(c[[1]], "struct")
+ expect_equal(c[[1]], "STRUCT<`name`: STRING>")
c <- collect(select(df, schema_of_json(lit('{"name":"Bob"}'))))
- expect_equal(c[[1]], "struct")
+ expect_equal(c[[1]], "STRUCT<`name`: STRING>")
# Test to_json() supports arrays of primitive types and arrays
df <- sql("SELECT array(19, 42, 70) as age")
@@ -1803,6 +1816,62 @@ test_that("column functions", {
)
expect_equal(actual, expected)
+
+ # Test withField
+ lines <- c("{\"Person\": {\"name\":\"Bob\", \"age\":24, \"height\": 170}}")
+ jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPath)
+ df <- read.df(jsonPath, "json")
+ result <- collect(
+ select(
+ select(df, alias(withField(df$Person, "dummy", lit(42)), "Person")),
+ "Person.dummy"
+ )
+ )
+ expect_equal(result, data.frame(dummy = 42))
+
+ # Test dropFields
+ expect_setequal(
+ colnames(select(
+ withColumn(df, "Person", dropFields(df$Person, "age")),
+ column("Person.*")
+ )),
+ c("name", "height")
+ )
+
+ expect_equal(
+ colnames(select(
+ withColumn(df, "Person", dropFields(df$Person, "height", "name")),
+ column("Person.*")
+ )),
+ "age"
+ )
+})
+
+test_that("avro column functions", {
+ skip_if_not(
+ grepl("spark-avro", sparkR.conf("spark.jars", "")),
+ "spark-avro jar not present"
+ )
+
+ schema <- '{"namespace": "example.avro",
+ "type": "record",
+ "name": "User",
+ "fields": [
+ {"name": "name", "type": "string"},
+ {"name": "favorite_color", "type": ["string", "null"]}
+ ]
+ }'
+
+ c0 <- column("foo")
+ c1 <- from_avro(c0, schema)
+ expect_s4_class(c1, "Column")
+ c2 <- from_avro("foo", schema)
+ expect_s4_class(c2, "Column")
+ c3 <- to_avro(c1)
+ expect_s4_class(c3, "Column")
+ c4 <- to_avro(c1, schema)
+ expect_s4_class(c4, "Column")
})
test_that("column binary mathfunctions", {
@@ -1823,9 +1892,12 @@ test_that("column binary mathfunctions", {
expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
## nolint end
- expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16)
- expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
- expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, shiftleft(df$b, 1)))[4, 1], 16)
+ expect_equal(collect(select(df, shiftright(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, shiftrightunsigned(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, suppressWarnings(shiftLeft(df$b, 1))))[4, 1], 16)
+ expect_equal(collect(select(df, suppressWarnings(shiftRight(df$b, 1))))[4, 1], 4)
+ expect_equal(collect(select(df, suppressWarnings(shiftRightUnsigned(df$b, 1))))[4, 1], 4)
expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01)
expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
@@ -2030,7 +2102,7 @@ test_that("higher order functions", {
createDataFrame(data.frame(id = 1)),
expr("CAST(array(1.0, 2.0, -3.0, -4.0) AS array) xs"),
expr("CAST(array(0.0, 3.0, 48.0) AS array) ys"),
- expr("array('FAILED', 'SUCCEDED') as vs"),
+ expr("array('FAILED', 'SUCCEEDED') as vs"),
expr("map('foo', 1, 'bar', 2) as mx"),
expr("map('foo', 42, 'bar', -1, 'baz', 0) as my")
)
@@ -2089,6 +2161,20 @@ test_that("higher order functions", {
expect_error(array_transform("xs", function(...) 42))
})
+test_that("SPARK-34794: lambda vars must be resolved properly in nested higher order functions", {
+ df <- sql("SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters")
+ ret <- first(select(
+ df,
+ array_transform("numbers", function(number) {
+ array_transform("letters", function(letter) {
+ struct(alias(number, "n"), alias(letter, "l"))
+ })
+ })
+ ))
+
+ expect_equal(1, ret[[1]][[1]][[1]][[1]]$n)
+})
+
test_that("group by, agg functions", {
df <- read.json(jsonPath)
df1 <- agg(df, name = "max", age = "sum")
@@ -2113,7 +2199,7 @@ test_that("group by, agg functions", {
df3 <- agg(gd, age = "stddev")
expect_is(df3, "SparkDataFrame")
df3_local <- collect(df3)
- expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))
+ expect_true(is.na(df3_local[df3_local$name == "Andy", ][1, 2]))
df4 <- agg(gd, sumAge = sum(df$age))
expect_is(df4, "SparkDataFrame")
@@ -2144,7 +2230,7 @@ test_that("group by, agg functions", {
df7 <- agg(gd2, value = "stddev")
df7_local <- collect(df7)
expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6)
- expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2]))
+ expect_true(is.na(df7_local[df7_local$name == "ID2", ][1, 2]))
mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
"{\"name\":\"Andy\", \"age\":30}",
@@ -2696,6 +2782,19 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF
expect_error(rbind(df, df2, df3),
"Names of input data frames are different.")
+
+ df4 <- unionByName(df2, select(df2, "age"), TRUE)
+
+ expect_equal(
+ sum(collect(
+ select(df4, alias(isNull(df4$name), "missing_name")
+ ))$missing_name),
+ 3
+ )
+
+ testthat::expect_error(unionByName(df2, select(df2, "age"), FALSE))
+ testthat::expect_error(unionByName(df2, select(df2, "age")))
+
excepted <- arrange(except(df, df2), desc(df$age))
expect_is(unioned, "SparkDataFrame")
expect_equal(count(excepted), 2)
@@ -2807,6 +2906,15 @@ test_that("mutate(), transform(), rename() and names()", {
expect_equal(nrow(result), 153)
expect_equal(ncol(result), 2)
detach(airquality)
+
+ # ensure long inferred names are handled without error (SPARK-26199)
+ # test implicitly assumes eval(formals(deparse)$width.cutoff) = 60
+ # (which has always been true as of 2020-11-15)
+ newDF <- mutate(
+ df,
+ df$age + 12345678901234567890 + 12345678901234567890 + 12345678901234
+ )
+ expect_match(tail(columns(newDF), 1L), "234567890", fixed = TRUE)
})
test_that("read/write ORC files", {
@@ -3196,6 +3304,12 @@ test_that("attach() on a DataFrame", {
stat3 <- summary(df[, "age", drop = F])
expect_equal(collect(stat3)[8, "age"], "30")
expect_error(age)
+
+ # attach method uses deparse(); ensure no errors from a very long input
+ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnop <- df # nolint
+ attach(abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnop)
+ expect_true(any(grepl("abcdefghijklmnopqrstuvwxyz", search())))
+ detach("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnop")
})
test_that("with() on a DataFrame", {
@@ -3591,7 +3705,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", {
}
# Computes the arithmetic mean of the second column by grouping
- # on the first and third columns. Output the groupping value and the average.
+ # on the first and third columns. Output the grouping value and the average.
schema <- structType(structField("a", "integer"), structField("c", "string"),
structField("avg", "double"))
df3 <- gapply(
@@ -3889,15 +4003,34 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", {
paste("Error in listFunctions : analysis error - Database",
"'zxwtyswklpf_db' does not exist"))
- # recoverPartitions does not work with tempory view
+ # recoverPartitions does not work with temporary view
expect_error(recoverPartitions("cars"),
- "no such table - Table or view 'cars' not found in database 'default'")
+ paste("Error in recoverPartitions : analysis error - cars is a temp view.",
+ "'recoverPartitions()' expects a table"), fixed = TRUE)
expect_error(refreshTable("cars"), NA)
expect_error(refreshByPath("/"), NA)
dropTempView("cars")
})
+test_that("assert_true, raise_error", {
+ df <- read.json(jsonPath)
+ filtered <- filter(df, "age < 20")
+
+ expect_equal(collect(select(filtered, assert_true(filtered$age < 20)))$age, c(NULL))
+ expect_equal(collect(select(filtered, assert_true(filtered$age < 20, "error message")))$age,
+ c(NULL))
+ expect_equal(collect(select(filtered, assert_true(filtered$age < 20, filtered$name)))$age,
+ c(NULL))
+ expect_error(collect(select(df, assert_true(df$age < 20))), "is not true!")
+ expect_error(collect(select(df, assert_true(df$age < 20, "error message"))),
+ "error message")
+ expect_error(collect(select(df, assert_true(df$age < 20, df$name))), "Michael")
+
+ expect_error(collect(select(filtered, raise_error("error message"))), "error message")
+ expect_error(collect(select(filtered, raise_error(filtered$name))), "Justin")
+})
+
compare_list <- function(list1, list2) {
# get testthat to show the diff by first making the 2 lists equal in length
expect_equal(length(list1), length(list2))
@@ -3921,14 +4054,14 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki
# before creating a SparkSession with enableHiveSupport = T at the top of this test file
# (filesBefore). The test here is to compare that (filesBefore) against the list of files before
# any test is run in run-all.R (sparkRFilesBefore).
- # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs,
+ # sparkRAllowedSQLDirs is also defined in run-all.R, and should contain only 2 allowed dirs,
# here allow the first value, spark-warehouse, in the diff, everything else should be exactly the
# same as before any test is run.
- compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
+ compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRAllowedSQLDirs[[1]]))
# third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
# note: as the note above, after running all tests in this file while enableHiveSupport = T, we
- # check the list of files again. This time we allow both whitelisted dirs to be in the diff.
- compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
+ # check the list of files again. This time we allow both dirs to be in the diff.
+ compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRAllowedSQLDirs))
})
unlink(parquetPath)
diff --git a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
index 97972753a78fa..06743488fdf11 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -19,7 +19,10 @@ library(testthat)
context("SparkSQL Arrow optimization")
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sparkSession <- sparkR.session(
+ master = sparkRTestMaster,
+ enableHiveSupport = FALSE,
+ sparkConfig = list(spark.sql.execution.arrow.sparkr.enabled = "true"))
test_that("createDataFrame/collect Arrow optimization", {
skip_if_not_installed("arrow")
@@ -35,29 +38,13 @@ test_that("createDataFrame/collect Arrow optimization", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- expect_equal(collect(createDataFrame(mtcars)), expected)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ expect_equal(collect(createDataFrame(mtcars)), expected)
})
test_that("createDataFrame/collect Arrow optimization - many partitions (partition order test)", {
skip_if_not_installed("arrow")
-
- conf <- callJMethod(sparkSession, "conf")
- arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
-
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- expect_equal(collect(createDataFrame(mtcars, numPartitions = 32)),
- collect(createDataFrame(mtcars, numPartitions = 1)))
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ expect_equal(collect(createDataFrame(mtcars, numPartitions = 32)),
+ collect(createDataFrame(mtcars, numPartitions = 1)))
})
test_that("createDataFrame/collect Arrow optimization - type specification", {
@@ -81,13 +68,7 @@ test_that("createDataFrame/collect Arrow optimization - type specification", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- expect_equal(collect(createDataFrame(rdf)), expected)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ expect_equal(collect(createDataFrame(rdf)), expected)
})
test_that("dapply() Arrow optimization", {
@@ -98,36 +79,30 @@ test_that("dapply() Arrow optimization", {
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
- tryCatch({
- ret <- dapply(df,
- function(rdf) {
- stopifnot(is.data.frame(rdf))
- rdf
- },
- schema(df))
- expected <- collect(ret)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
-
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- dapply(df,
function(rdf) {
stopifnot(is.data.frame(rdf))
- # mtcars' hp is more then 50.
- stopifnot(all(rdf$hp > 50))
rdf
},
schema(df))
- actual <- collect(ret)
- expect_equal(actual, expected)
- expect_equal(count(ret), nrow(mtcars))
+ expected <- collect(ret)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
+
+ ret <- dapply(df,
+ function(rdf) {
+ stopifnot(is.data.frame(rdf))
+ # mtcars' hp is more than 50.
+ stopifnot(all(rdf$hp > 50))
+ rdf
+ },
+ schema(df))
+ actual <- collect(ret)
+ expect_equal(actual, expected)
+ expect_equal(count(ret), nrow(mtcars))
})
test_that("dapply() Arrow optimization - type specification", {
@@ -154,15 +129,9 @@ test_that("dapply() Arrow optimization - type specification", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- ret <- dapply(df, function(rdf) { rdf }, schema(df))
- actual <- collect(ret)
- expect_equal(actual, expected)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ ret <- dapply(df, function(rdf) { rdf }, schema(df))
+ actual <- collect(ret)
+ expect_equal(actual, expected)
})
test_that("dapply() Arrow optimization - type specification (date and timestamp)", {
@@ -170,18 +139,8 @@ test_that("dapply() Arrow optimization - type specification (date and timestamp)
rdf <- data.frame(list(list(a = as.Date("1990-02-24"),
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
-
- conf <- callJMethod(sparkSession, "conf")
- arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
-
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- ret <- dapply(df, function(rdf) { rdf }, schema(df))
- expect_equal(collect(ret), rdf)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ ret <- dapply(df, function(rdf) { rdf }, schema(df))
+ expect_equal(collect(ret), rdf)
})
test_that("gapply() Arrow optimization", {
@@ -209,28 +168,22 @@ test_that("gapply() Arrow optimization", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- ret <- gapply(df,
- "gear",
- function(key, grouped) {
- if (length(key) > 0) {
- stopifnot(is.numeric(key[[1]]))
- }
- stopifnot(is.data.frame(grouped))
- stopifnot(length(colnames(grouped)) == 11)
- # mtcars' hp is more then 50.
- stopifnot(all(grouped$hp > 50))
- grouped
- },
- schema(df))
- actual <- collect(ret)
- expect_equal(actual, expected)
- expect_equal(count(ret), nrow(mtcars))
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ ret <- gapply(df,
+ "gear",
+ function(key, grouped) {
+ if (length(key) > 0) {
+ stopifnot(is.numeric(key[[1]]))
+ }
+ stopifnot(is.data.frame(grouped))
+ stopifnot(length(colnames(grouped)) == 11)
+ # mtcars' hp is more then 50.
+ stopifnot(all(grouped$hp > 50))
+ grouped
+ },
+ schema(df))
+ actual <- collect(ret)
+ expect_equal(actual, expected)
+ expect_equal(count(ret), nrow(mtcars))
})
test_that("gapply() Arrow optimization - type specification", {
@@ -250,26 +203,19 @@ test_that("gapply() Arrow optimization - type specification", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
ret <- gapply(df,
- "a",
- function(key, grouped) { grouped }, schema(df))
+ "a",
+ function(key, grouped) { grouped }, schema(df))
expected <- collect(ret)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
-
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- ret <- gapply(df,
- "a",
- function(key, grouped) { grouped }, schema(df))
- actual <- collect(ret)
- expect_equal(actual, expected)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ ret <- gapply(df,
+ "a",
+ function(key, grouped) { grouped }, schema(df))
+ actual <- collect(ret)
+ expect_equal(actual, expected)
})
test_that("gapply() Arrow optimization - type specification (date and timestamp)", {
@@ -277,39 +223,30 @@ test_that("gapply() Arrow optimization - type specification (date and timestamp)
rdf <- data.frame(list(list(a = as.Date("1990-02-24"),
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
+ ret <- gapply(df,
+ "a",
+ function(key, grouped) { grouped }, schema(df))
+ expect_equal(collect(ret), rdf)
+})
- conf <- callJMethod(sparkSession, "conf")
- arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
+test_that("Arrow optimization - unsupported types", {
+ skip_if_not_installed("arrow")
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- ret <- gapply(df,
- "a",
- function(key, grouped) { grouped }, schema(df))
- expect_equal(collect(ret), rdf)
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ expect_error(checkSchemaInArrow(structType("a FLOAT")), "not support float type")
+ expect_error(checkSchemaInArrow(structType("a BINARY")), "not support binary type")
+ expect_error(checkSchemaInArrow(structType("a ARRAY")), "not support array type")
+ expect_error(checkSchemaInArrow(structType("a MAP")), "not support map type")
+ expect_error(checkSchemaInArrow(structType("a STRUCT")),
+ "not support nested struct type")
})
-test_that("Arrow optimization - unsupported types", {
+test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", {
skip_if_not_installed("arrow")
+ df <- createDataFrame(list(list(a = 1L, b = "a")))
- conf <- callJMethod(sparkSession, "conf")
- arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
- tryCatch({
- expect_error(checkSchemaInArrow(structType("a FLOAT")), "not support float type")
- expect_error(checkSchemaInArrow(structType("a BINARY")), "not support binary type")
- expect_error(checkSchemaInArrow(structType("a ARRAY")), "not support array type")
- expect_error(checkSchemaInArrow(structType("a MAP")), "not support map type")
- expect_error(checkSchemaInArrow(structType("a STRUCT")),
- "not support nested struct type")
- },
- finally = {
- callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
- })
+ expect_error(
+ count(gapply(df, "a", function(key, group) { group }, structType("a int, b int"))),
+ "expected IntegerType, IntegerType, got IntegerType, StringType")
})
sparkR.session.stop()
diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R
index c3fb9046fcda4..6c83a137cfb7b 100644
--- a/R/pkg/tests/fulltests/test_utils.R
+++ b/R/pkg/tests/fulltests/test_utils.R
@@ -116,7 +116,7 @@ test_that("cleanClosure on R functions", {
actual <- get("y", envir = env, inherits = FALSE)
expect_equal(actual, y)
- # Test for combination for nested and sequenctial functions in a closure
+ # Test for a combination of nested and sequential functions in a closure
f1 <- function(x) x + 1
f2 <- function(x) f1(x) + 2
userFunc <- function(x) { f1(x); f2(x) }
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index bf02ecdad66ff..f9e266eb4e014 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -35,8 +35,8 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
install.spark(overwrite = TRUE)
sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
- sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
- invisible(lapply(sparkRWhitelistSQLDirs,
+ sparkRAllowedSQLDirs <- c("spark-warehouse", "metastore_db")
+ invisible(lapply(sparkRAllowedSQLDirs,
function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
@@ -60,22 +60,37 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
# set random seed for predictable results. mostly for base's sample() in tree and classification
set.seed(42)
- # TODO (SPARK-30663) To be removed once testthat 1.x is removed from all builds
- if (grepl("^1\\..*", packageVersion("testthat"))) {
- # testthat 1.x
- test_runner <- testthat:::run_tests
- reporter <- "summary"
+ if (packageVersion("testthat")$major <= 1) stop("testthat 1.x is not supported")
+ test_runner <- if (packageVersion("testthat")$major == 2) {
+ # testthat >= 2.0.0, < 3.0.0
+ function(path, package, reporter, filter) {
+ testthat:::test_package_dir(
+ test_path = path,
+ package = package,
+ filter = filter,
+ reporter = reporter
+ )
+ }
} else {
- # testthat >= 2.0.0
- test_runner <- testthat:::test_package_dir
- reporter <- testthat::default_reporter()
+ # testthat >= 3.0.0
+ testthat::test_dir
}
- test_runner("SparkR",
- file.path(sparkRDir, "pkg", "tests", "fulltests"),
- NULL,
- reporter)
+ dir.create("target/test-reports", showWarnings = FALSE)
+ reporter <- MultiReporter$new(list(
+ SummaryReporter$new(),
+ JunitReporter$new(
+ file = file.path(getwd(), "target/test-reports/test-results.xml")
+ )
+ ))
+
+ test_runner(
+ path = file.path(sparkRDir, "pkg", "tests", "fulltests"),
+ package = "SparkR",
+ reporter = reporter,
+ filter = NULL
+ )
}
SparkR:::uninstallDownloadedSpark()
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 3713e6c784855..0ed0028eb5173 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -146,7 +146,7 @@ sparkR.session.stop()
Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs.
-After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html).
+After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (e.g. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html).
```{r, eval=FALSE}
install.spark()
@@ -331,7 +331,7 @@ A common flow of grouping and aggregation is
2. Feed the `GroupedData` object to `agg` or `summarize` functions, with some provided aggregation functions to compute a number within each group.
-A number of widely used functions are supported to aggregate data after grouping, including `avg`, `countDistinct`, `count`, `first`, `kurtosis`, `last`, `max`, `mean`, `min`, `sd`, `skewness`, `stddev_pop`, `stddev_samp`, `sumDistinct`, `sum`, `var_pop`, `var_samp`, `var`. See the [API doc for aggregate functions](https://spark.apache.org/docs/latest/api/R/column_aggregate_functions.html) linked there.
+A number of widely used functions are supported to aggregate data after grouping, including `avg`, `count_distinct`, `count`, `first`, `kurtosis`, `last`, `max`, `mean`, `min`, `sd`, `skewness`, `stddev_pop`, `stddev_samp`, `sum_distinct`, `sum`, `var_pop`, `var_samp`, `var`. See the [API doc for aggregate functions](https://spark.apache.org/docs/latest/api/R/column_aggregate_functions.html) linked there.
For example we can compute a histogram of the number of cylinders in the `mtcars` dataset as shown below.
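
As a hedged aside (separate from the vignette's own histogram example that follows in the full document), the renamed aggregates can be used like this; the mtcars grouping is illustrative only.

```r
library(SparkR)
sparkR.session()  # assumes a local Spark installation is available

df <- createDataFrame(mtcars)
collect(agg(
  groupBy(df, "cyl"),
  gears    = count_distinct(df$gear),  # replaces the deprecated countDistinct
  total_hp = sum_distinct(df$hp)       # replaces the deprecated sumDistinct
))
```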
@@ -1007,7 +1007,7 @@ perplexity
#### Alternating Least Squares
-`spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](https://dl.acm.org/citation.cfm?id=1608614).
+`spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](https://dl.acm.org/doi/10.1109/MC.2009.263).
There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, and `nonnegative`. For a complete list, refer to the help file.
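
A hedged sketch of spark.als using the options named in the vignette text above (rank, reg, nonnegative); the toy ratings data is an assumption for illustration, not the vignette's own example.

```r
library(SparkR)
sparkR.session()  # assumes a local Spark installation is available

ratings <- createDataFrame(data.frame(
  user = c(0, 0, 1, 1), item = c(0, 1, 1, 2), rating = c(4.0, 2.0, 3.0, 5.0)
))

model <- spark.als(ratings, "rating", "user", "item",
                   rank = 10, reg = 0.1, nonnegative = TRUE)
head(predict(model, ratings))
```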
diff --git a/R/run-tests.sh b/R/run-tests.sh
index 51ca7d600caf0..edc2b2b60b60e 100755
--- a/R/run-tests.sh
+++ b/R/run-tests.sh
@@ -23,7 +23,18 @@ FAILED=0
LOGFILE=$FWDIR/unit-tests.out
rm -f $LOGFILE
-SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+SPARK_AVRO_JAR_PATH=$(find $FWDIR/../external/avro/ -name "spark-avro*jar" -print | egrep -v "tests.jar|test-sources.jar|sources.jar|javadoc.jar")
+
+if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then
+ SPARK_JARS=$SPARK_AVRO_JAR_PATH
+fi
+
+if [ -z "$SPARK_JARS" ]; then
+ SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+else
+ SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+fi
+
FAILED=$((PIPESTATUS[0]||$FAILED))
NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)"
diff --git a/README.md b/README.md
index d7931263b0fc7..aa7d1dd338be0 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ and Structured Streaming for stream processing.
-[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3)
+[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2)
[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
[![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)
diff --git a/appveyor.yml b/appveyor.yml
index a4da5f9040ded..c40b23c8341eb 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -41,9 +41,9 @@ cache:
install:
# Install maven and dependencies
- ps: .\dev\appveyor-install-dependencies.ps1
- # Required package for R unit tests
- - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
- - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival'); packageVersion('arrow')"
+ # Required packages for R unit tests. xml2 is required to use the JUnit reporter in testthat.
+ - cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"
+ - cmd: Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]"
build_script:
# '-Djna.nosys=true' is required to avoid kernel32.dll load failure.
diff --git a/assembly/pom.xml b/assembly/pom.xml
index d17abe857ade5..d662aae96c4af 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
org.apache.sparkspark-parent_2.12
- 3.1.0-SNAPSHOT
+ 3.2.0-SNAPSHOT../pom.xml
@@ -136,6 +136,10 @@
spark-yarn_${scala.binary.version}${project.version}
+
+ org.apache.hadoop
+ hadoop-yarn-server-web-proxy
+
diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh
index 57b86254ab424..83b13b83341d0 100755
--- a/bin/docker-image-tool.sh
+++ b/bin/docker-image-tool.sh
@@ -172,6 +172,7 @@ function build {
local BASEDOCKERFILE=${BASEDOCKERFILE:-"kubernetes/dockerfiles/spark/Dockerfile"}
local PYDOCKERFILE=${PYDOCKERFILE:-false}
local RDOCKERFILE=${RDOCKERFILE:-false}
+ local ARCHS=${ARCHS:-"--platform linux/amd64,linux/arm64"}
(cd $(img_ctx_dir base) && docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
-t $(image_ref spark) \
@@ -179,6 +180,11 @@ function build {
if [ $? -ne 0 ]; then
error "Failed to build Spark JVM Docker image, please refer to Docker build output for details."
fi
+ if [ "${CROSS_BUILD}" != "false" ]; then
+ (cd $(img_ctx_dir base) && docker buildx build $ARCHS $NOCACHEARG "${BUILD_ARGS[@]}" --push \
+ -t $(image_ref spark) \
+ -f "$BASEDOCKERFILE" .)
+ fi
if [ "${PYDOCKERFILE}" != "false" ]; then
(cd $(img_ctx_dir pyspark) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
@@ -187,6 +193,11 @@ function build {
if [ $? -ne 0 ]; then
error "Failed to build PySpark Docker image, please refer to Docker build output for details."
fi
+ if [ "${CROSS_BUILD}" != "false" ]; then
+ (cd $(img_ctx_dir pyspark) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" --push \
+ -t $(image_ref spark-py) \
+ -f "$PYDOCKERFILE" .)
+ fi
fi
if [ "${RDOCKERFILE}" != "false" ]; then
@@ -196,6 +207,11 @@ function build {
if [ $? -ne 0 ]; then
error "Failed to build SparkR Docker image, please refer to Docker build output for details."
fi
+ if [ "${CROSS_BUILD}" != "false" ]; then
+ (cd $(img_ctx_dir sparkr) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" --push \
+ -t $(image_ref spark-r) \
+ -f "$RDOCKERFILE" .)
+ fi
fi
}
@@ -227,6 +243,8 @@ Options:
-n Build docker image with --no-cache
-u uid UID to use in the USER directive to set the user the main Spark process runs as inside the
resulting container
+ -X Use docker buildx to cross build. Automatically pushes.
+ See https://docs.docker.com/buildx/working-with-buildx/ for steps to setup buildx.
-b arg Build arg to build or push the image. For multiple build args, this option needs to
be used separately for each build arg.
@@ -252,6 +270,12 @@ Examples:
- Build and push JDK11-based image with tag "v3.0.0" to docker.io/myrepo
$0 -r docker.io/myrepo -t v3.0.0 -b java_image_tag=11-jre-slim build
$0 -r docker.io/myrepo -t v3.0.0 push
+
+ - Build and push JDK11-based image for multiple archs to docker.io/myrepo
+ $0 -r docker.io/myrepo -t v3.0.0 -X -b java_image_tag=11-jre-slim build
+ # Note: buildx, which does cross building, needs to do the push during build
+ # So there is no separate push step with -X
+
EOF
}
@@ -268,7 +292,8 @@ RDOCKERFILE=
NOCACHEARG=
BUILD_PARAMS=
SPARK_UID=
-while getopts f:p:R:mr:t:nb:u: option
+CROSS_BUILD="false"
+while getopts f:p:R:mr:t:Xnb:u: option
do
case "${option}"
in
@@ -279,6 +304,7 @@ do
t) TAG=${OPTARG};;
n) NOCACHEARG="--no-cache";;
b) BUILD_PARAMS=${BUILD_PARAMS}" --build-arg "${OPTARG};;
+ X) CROSS_BUILD=1;;
m)
if ! which minikube 1>/dev/null; then
error "Cannot find minikube."
diff --git a/bin/find-spark-home b/bin/find-spark-home
index 617dbaa4fff86..462b538b00a04 100755
--- a/bin/find-spark-home
+++ b/bin/find-spark-home
@@ -33,9 +33,9 @@ elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then
export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)"
else
# We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
- # Default to standard python interpreter unless told otherwise
+ # Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
- PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
+ PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python3"}"
fi
export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")
fi
diff --git a/bin/find-spark-home.cmd b/bin/find-spark-home.cmd
index 6025f67c38de4..3149d05039ba4 100644
--- a/bin/find-spark-home.cmd
+++ b/bin/find-spark-home.cmd
@@ -20,8 +20,8 @@ rem
rem Path to Python script finding SPARK_HOME
set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py
-rem Default to standard python interpreter unless told otherwise
-set PYTHON_RUNNER=python
+rem Default to standard python3 interpreter unless told otherwise
+set PYTHON_RUNNER=python3
rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version
if not "x%PYSPARK_DRIVER_PYTHON%"=="x" (
set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON%
@@ -55,6 +55,6 @@ if "x%SPARK_HOME%"=="x" (
set SPARK_HOME=%~dp0..
) else (
rem We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
- for /f "delims=" %%i in ('%PYTHON_RUNNER% %FIND_SPARK_HOME_PYTHON_SCRIPT%') do set SPARK_HOME=%%i
+ for /f "delims=" %%i in ('%PYTHON_RUNNER% "%FIND_SPARK_HOME_PYTHON_SCRIPT%"') do set SPARK_HOME=%%i
)
)
diff --git a/bin/load-spark-env.cmd b/bin/load-spark-env.cmd
index 5f98cc34b6bab..5692af529fb66 100644
--- a/bin/load-spark-env.cmd
+++ b/bin/load-spark-env.cmd
@@ -21,42 +21,42 @@ rem This script loads spark-env.cmd if it exists, and ensures it is only loaded
rem spark-env.cmd is loaded from SPARK_CONF_DIR if set, or within the current directory's
rem conf\ subdirectory.
-set SPARK_ENV_CMD=spark-env.cmd
-if [%SPARK_ENV_LOADED%] == [] (
+if not defined SPARK_ENV_LOADED (
set SPARK_ENV_LOADED=1
- if [%SPARK_CONF_DIR%] == [] (
+ if not defined SPARK_CONF_DIR (
set SPARK_CONF_DIR=%~dp0..\conf
)
- set SPARK_ENV_CMD=%SPARK_CONF_DIR%\%SPARK_ENV_CMD%
- if exist %SPARK_ENV_CMD% (
- call %SPARK_ENV_CMD%
- )
+ call :LoadSparkEnv
)
rem Setting SPARK_SCALA_VERSION if not already set.
-rem TODO: revisit for Scala 2.13 support
-set SPARK_SCALA_VERSION=2.12
-rem if [%SPARK_SCALA_VERSION%] == [] (
-rem set SCALA_VERSION_1=2.12
-rem set SCALA_VERSION_2=2.11
-rem
-rem set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
-rem set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
-rem set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
-rem if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
-rem echo "Presence of build for multiple Scala versions detected (%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%)."
-rem echo "Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in %SPARK_ENV_CMD%."
-rem echo "Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd."
-rem echo "Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd."
-rem exit 1
-rem )
-rem if exist %ASSEMBLY_DIR1% (
-rem set SPARK_SCALA_VERSION=%SCALA_VERSION_1%
-rem ) else (
-rem set SPARK_SCALA_VERSION=%SCALA_VERSION_2%
-rem )
-rem )
+set SCALA_VERSION_1=2.13
+set SCALA_VERSION_2=2.12
+
+set ASSEMBLY_DIR1="%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%"
+set ASSEMBLY_DIR2="%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%"
+set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
+
+if not defined SPARK_SCALA_VERSION (
+ if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
+ echo Presence of build for multiple Scala versions detected ^(%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%^).
+ echo Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in spark-env.cmd.
+ echo Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd.
+ echo Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd.
+ exit 1
+ )
+ if exist %ASSEMBLY_DIR1% (
+ set SPARK_SCALA_VERSION=%SCALA_VERSION_1%
+ ) else (
+ set SPARK_SCALA_VERSION=%SCALA_VERSION_2%
+ )
+)
exit /b 0
+
+:LoadSparkEnv
+if exist "%SPARK_CONF_DIR%\spark-env.cmd" (
+ call "%SPARK_CONF_DIR%\spark-env.cmd"
+)
diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh
index 107e7991c28bc..04adaeed7ac61 100644
--- a/bin/load-spark-env.sh
+++ b/bin/load-spark-env.sh
@@ -43,25 +43,23 @@ fi
# Setting SPARK_SCALA_VERSION if not already set.
-# TODO: revisit for Scala 2.13 support
-export SPARK_SCALA_VERSION=2.12
-#if [ -z "$SPARK_SCALA_VERSION" ]; then
-# SCALA_VERSION_1=2.12
-# SCALA_VERSION_2=2.11
-#
-# ASSEMBLY_DIR_1="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_1}"
-# ASSEMBLY_DIR_2="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_2}"
-# ENV_VARIABLE_DOC="https://spark.apache.org/docs/latest/configuration.html#environment-variables"
-# if [[ -d "$ASSEMBLY_DIR_1" && -d "$ASSEMBLY_DIR_2" ]]; then
-# echo "Presence of build for multiple Scala versions detected ($ASSEMBLY_DIR_1 and $ASSEMBLY_DIR_2)." 1>&2
-# echo "Remove one of them or, export SPARK_SCALA_VERSION=$SCALA_VERSION_1 in ${SPARK_ENV_SH}." 1>&2
-# echo "Visit ${ENV_VARIABLE_DOC} for more details about setting environment variables in spark-env.sh." 1>&2
-# exit 1
-# fi
-#
-# if [[ -d "$ASSEMBLY_DIR_1" ]]; then
-# export SPARK_SCALA_VERSION=${SCALA_VERSION_1}
-# else
-# export SPARK_SCALA_VERSION=${SCALA_VERSION_2}
-# fi
-#fi
+if [ -z "$SPARK_SCALA_VERSION" ]; then
+ SCALA_VERSION_1=2.13
+ SCALA_VERSION_2=2.12
+
+ ASSEMBLY_DIR_1="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_1}"
+ ASSEMBLY_DIR_2="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_2}"
+ ENV_VARIABLE_DOC="https://spark.apache.org/docs/latest/configuration.html#environment-variables"
+ if [[ -d "$ASSEMBLY_DIR_1" && -d "$ASSEMBLY_DIR_2" ]]; then
+ echo "Presence of build for multiple Scala versions detected ($ASSEMBLY_DIR_1 and $ASSEMBLY_DIR_2)." 1>&2
+ echo "Remove one of them or, export SPARK_SCALA_VERSION=$SCALA_VERSION_1 in ${SPARK_ENV_SH}." 1>&2
+ echo "Visit ${ENV_VARIABLE_DOC} for more details about setting environment variables in spark-env.sh." 1>&2
+ exit 1
+ fi
+
+ if [[ -d "$ASSEMBLY_DIR_1" ]]; then
+ export SPARK_SCALA_VERSION=${SCALA_VERSION_1}
+ else
+ export SPARK_SCALA_VERSION=${SCALA_VERSION_2}
+ fi
+fi
diff --git a/bin/pyspark b/bin/pyspark
index ad4132fb59eb0..38ebe51c8d555 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -37,9 +37,9 @@ if [[ -n "$IPYTHON" || -n "$IPYTHON_OPTS" ]]; then
exit 1
fi
-# Default to standard python interpreter unless told otherwise
+# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_PYTHON" ]]; then
- PYSPARK_PYTHON=python
+ PYSPARK_PYTHON=python3
fi
if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
PYSPARK_DRIVER_PYTHON=$PYSPARK_PYTHON
@@ -50,7 +50,7 @@ export PYSPARK_DRIVER_PYTHON_OPTS
# Add the PySpark classes to the Python path:
export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
-export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH"
+export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.2-src.zip:$PYTHONPATH"
# Load the PySpark shell.py script when ./pyspark is used interactively:
export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"
diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd
index dc34be1a41706..f5f9fad8dfb1b 100644
--- a/bin/pyspark2.cmd
+++ b/bin/pyspark2.cmd
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
)
set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
-set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9-src.zip;%PYTHONPATH%
+set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.2-src.zip;%PYTHONPATH%
set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
old mode 100644
new mode 100755
index 34d04c9856d2c..68b271d1d05d9
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -30,12 +30,12 @@ if "x%1"=="x" (
rem Find Spark jars.
if exist "%SPARK_HOME%\jars" (
- set SPARK_JARS_DIR="%SPARK_HOME%\jars"
+ set SPARK_JARS_DIR=%SPARK_HOME%\jars
) else (
- set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%\jars"
+ set SPARK_JARS_DIR=%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%\jars
)
-if not exist "%SPARK_JARS_DIR%"\ (
+if not exist "%SPARK_JARS_DIR%" (
echo Failed to find Spark jars directory.
echo You need to build Spark before running this program.
exit /b 1
diff --git a/binder/apt.txt b/binder/apt.txt
new file mode 100644
index 0000000000000..385f5b0fba754
--- /dev/null
+++ b/binder/apt.txt
@@ -0,0 +1 @@
+openjdk-8-jre
diff --git a/binder/postBuild b/binder/postBuild
new file mode 100644
index 0000000000000..42bb3514c5a2e
--- /dev/null
+++ b/binder/postBuild
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file is used for Binder integration to install PySpark available in
+# Jupyter notebook.
+
+VERSION=$(python -c "exec(open('python/pyspark/version.py').read()); print(__version__)")
+pip install "pyspark[sql,ml,mllib]<=$VERSION"
diff --git a/build/mvn b/build/mvn
index 53a77f6e81f2b..9e63cc2ff9ca9 100755
--- a/build/mvn
+++ b/build/mvn
@@ -26,36 +26,67 @@ _COMPILE_JVM_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g"
# Installs any application tarball given a URL, the expected tarball name,
# and, optionally, a checkable binary path to determine if the binary has
-# already been installed
-## Arg1 - URL
-## Arg2 - Tarball Name
-## Arg3 - Checkable Binary
+# already been installed. Arguments:
+# 1 - Mirror host
+# 2 - URL path on host
+# 3 - URL query string
+# 4 - checksum suffix
+# 5 - Tarball Name
+# 6 - Checkable Binary
install_app() {
- local remote_tarball="$1/$2"
- local local_tarball="${_DIR}/$2"
- local binary="${_DIR}/$3"
+ local mirror_host="$1"
+ local url_path="$2"
+ local url_query="$3"
+ local checksum_suffix="$4"
+ local local_tarball="${_DIR}/$5"
+ local binary="${_DIR}/$6"
+ local remote_tarball="${mirror_host}/${url_path}${url_query}"
+ local local_checksum="${local_tarball}.${checksum_suffix}"
+ local remote_checksum="https://archive.apache.org/dist/${url_path}.${checksum_suffix}"
local curl_opts="--silent --show-error -L"
local wget_opts="--no-verbose"
- if [ -z "$3" -o ! -f "$binary" ]; then
+ if [ ! -f "$binary" ]; then
# check if we already have the tarball
# check if we have curl installed
# download application
- [ ! -f "${local_tarball}" ] && [ $(command -v curl) ] && \
- echo "exec: curl ${curl_opts} ${remote_tarball}" 1>&2 && \
+ if [ ! -f "${local_tarball}" -a $(command -v curl) ]; then
+ echo "exec: curl ${curl_opts} ${remote_tarball}" 1>&2
curl ${curl_opts} "${remote_tarball}" > "${local_tarball}"
+ if [ ! -z "${checksum_suffix}" ]; then
+ echo "exec: curl ${curl_opts} ${remote_checksum}" 1>&2
+ curl ${curl_opts} "${remote_checksum}" > "${local_checksum}"
+ fi
+ fi
# if the file still doesn't exist, lets try `wget` and cross our fingers
- [ ! -f "${local_tarball}" ] && [ $(command -v wget) ] && \
- echo "exec: wget ${wget_opts} ${remote_tarball}" 1>&2 && \
+ if [ ! -f "${local_tarball}" -a $(command -v wget) ]; then
+ echo "exec: wget ${wget_opts} ${remote_tarball}" 1>&2
wget ${wget_opts} -O "${local_tarball}" "${remote_tarball}"
+ if [ ! -z "${checksum_suffix}" ]; then
+ echo "exec: wget ${wget_opts} ${remote_checksum}" 1>&2
+ wget ${wget_opts} -O "${local_checksum}" "${remote_checksum}"
+ fi
+ fi
# if both were unsuccessful, exit
- [ ! -f "${local_tarball}" ] && \
- echo -n "ERROR: Cannot download $2 with cURL or wget; " && \
- echo "please install manually and try again." && \
+ if [ ! -f "${local_tarball}" ]; then
+ echo -n "ERROR: Cannot download ${remote_tarball} with cURL or wget; please install manually and try again."
exit 2
- cd "${_DIR}" && tar -xzf "$2"
- rm -rf "$local_tarball"
+ fi
+ # Checksum may not have been specified; don't check if doesn't exist
+ if [ -f "${local_checksum}" ]; then
+ echo " ${local_tarball}" >> ${local_checksum} # two spaces + file are important!
+ # Assuming SHA512 here for now
+ echo "Veryfing checksum from ${local_checksum}" 1>&2
+ if ! shasum -a 512 -q -c "${local_checksum}" ; then
+ echo "Bad checksum from ${remote_checksum}"
+ exit 2
+ fi
+ fi
+
+ cd "${_DIR}" && tar -xzf "${local_tarball}"
+ rm -rf "${local_tarball}"
+ rm -f "${local_checksum}"
fi
}
@@ -71,47 +102,32 @@ install_mvn() {
local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')"
fi
if [ $(version $MVN_DETECTED_VERSION) -lt $(version $MVN_VERSION) ]; then
- local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='}
-
+ local MVN_TARBALL="apache-maven-${MVN_VERSION}-bin.tar.gz"
+ local FILE_PATH="maven/maven-3/${MVN_VERSION}/binaries/${MVN_TARBALL}"
+ local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua'}
+ local MIRROR_URL_QUERY="?action=download"
+
if [ $(command -v curl) ]; then
- local TEST_MIRROR_URL="${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries/apache-maven-${MVN_VERSION}-bin.tar.gz"
- if ! curl -L --output /dev/null --silent --head --fail "$TEST_MIRROR_URL" ; then
+ if ! curl -L --output /dev/null --silent --head --fail "${APACHE_MIRROR}/${FILE_PATH}${MIRROR_URL_QUERY}" ; then
# Fall back to archive.apache.org for older Maven
echo "Falling back to archive.apache.org to download Maven"
APACHE_MIRROR="https://archive.apache.org/dist"
+ MIRROR_URL_QUERY=""
fi
fi
install_app \
- "${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries" \
- "apache-maven-${MVN_VERSION}-bin.tar.gz" \
+ "${APACHE_MIRROR}" \
+ "${FILE_PATH}" \
+ "${MIRROR_URL_QUERY}" \
+ "sha512" \
+ "${MVN_TARBALL}" \
"apache-maven-${MVN_VERSION}/bin/mvn"
MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn"
fi
}
-# Install zinc under the build/ folder
-install_zinc() {
- local ZINC_VERSION=0.3.15
- ZINC_BIN="$(command -v zinc)"
- if [ "$ZINC_BIN" ]; then
- local ZINC_DETECTED_VERSION="$(zinc -version | head -n1 | awk '{print $5}')"
- fi
-
- if [ $(version $ZINC_DETECTED_VERSION) -lt $(version $ZINC_VERSION) ]; then
- local zinc_path="zinc-${ZINC_VERSION}/bin/zinc"
- [ ! -f "${_DIR}/${zinc_path}" ] && ZINC_INSTALL_FLAG=1
- local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com}
-
- install_app \
- "${TYPESAFE_MIRROR}/zinc/${ZINC_VERSION}" \
- "zinc-${ZINC_VERSION}.tgz" \
- "${zinc_path}"
- ZINC_BIN="${_DIR}/${zinc_path}"
- fi
-}
-
# Determine the Scala version from the root pom.xml file, set the Scala URL,
# and, with that, download the specific version of Scala necessary under
# the build/ folder
@@ -121,39 +137,26 @@ install_scala() {
local scala_version=`grep "scala.version" "${_DIR}/../pom.xml" | grep ${scala_binary_version} | head -n1 | awk -F '[<>]' '{print $3}'`
local scala_bin="${_DIR}/scala-${scala_version}/bin/scala"
local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com}
+ local SCALA_TARBALL="scala-${scala_version}.tgz"
install_app \
- "${TYPESAFE_MIRROR}/scala/${scala_version}" \
- "scala-${scala_version}.tgz" \
+ "${TYPESAFE_MIRROR}" \
+ "scala/${scala_version}/${SCALA_TARBALL}" \
+ "" \
+ "" \
+ ${SCALA_TARBALL} \
"scala-${scala_version}/bin/scala"
SCALA_COMPILER="$(cd "$(dirname "${scala_bin}")/../lib" && pwd)/scala-compiler.jar"
SCALA_LIBRARY="$(cd "$(dirname "${scala_bin}")/../lib" && pwd)/scala-library.jar"
}
-# Setup healthy defaults for the Zinc port if none were provided from
-# the environment
-ZINC_PORT=${ZINC_PORT:-"3030"}
-
-# Install the proper version of Scala, Zinc and Maven for the build
-install_zinc
install_scala
install_mvn
# Reset the current working directory
cd "${_CALLING_DIR}"
-# Now that zinc is ensured to be installed, check its status and, if its
-# not running or just installed, start it
-if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`"${ZINC_BIN}" -status -port ${ZINC_PORT}`" ]; then
- export ZINC_OPTS=${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"}
- "${ZINC_BIN}" -shutdown -port ${ZINC_PORT}
- "${ZINC_BIN}" -start -port ${ZINC_PORT} \
- -server 127.0.0.1 -idle-timeout 3h \
- -scala-compiler "${SCALA_COMPILER}" \
- -scala-library "${SCALA_LIBRARY}" &>/dev/null
-fi
-
# Set any `mvn` options if not already present
export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"}
@@ -161,10 +164,7 @@ echo "Using \`mvn\` from path: $MVN_BIN" 1>&2
# call the `mvn` command as usual
# SPARK-25854
-"${MVN_BIN}" -DzincPort=${ZINC_PORT} "$@"
+"${MVN_BIN}" "$@"
MVN_RETCODE=$?
-# Try to shut down zinc explicitly if the server is still running.
-"${ZINC_BIN}" -shutdown -port ${ZINC_PORT}
-
exit $MVN_RETCODE
diff --git a/build/sbt b/build/sbt
index 475dfd3b20b43..ae9ca93fc9ca9 100755
--- a/build/sbt
+++ b/build/sbt
@@ -53,6 +53,7 @@ realpath () {
declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
declare -r sbt_opts_file=".sbtopts"
declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"
+declare -r default_sbt_opts="-Xss4m"
usage() {
cat < "$SPARK_BUILD_INFO"
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 39cdc6d6d6cd3..4ade8c2032b24 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.12</artifactId>
-   <version>3.1.0-SNAPSHOT</version>
+   <version>3.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java
index 42e090bc83ed1..431c7e42774e4 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java
@@ -164,8 +164,9 @@ public void clear() {
}
/**
- * An alias class for the type "ConcurrentHashMap<Comparable<Object>, Boolean>", which is used
- * as a concurrent hashset for storing natural keys and the boolean value doesn't matter.
+ * An alias class for the type "{@literal ConcurrentHashMap<Comparable<Object>, Boolean>}",
+ * which is used as a concurrent hashset for storing natural keys
+ * and the boolean value doesn't matter.
   */
  private static class NaturalKeys extends ConcurrentHashMap<Comparable<Object>, Boolean> {}
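Editor's note: the javadoc above describes the "map used as a concurrent set" idiom behind NaturalKeys. A minimal standalone sketch of that idiom, not part of the patch; the class and method names are made up for illustration:

```java
import java.util.concurrent.ConcurrentHashMap;

// A ConcurrentHashMap used purely as a concurrent set: only the keys matter,
// the Boolean values are placeholders that are never read.
class NaturalKeySetSketch {
  private final ConcurrentHashMap<Comparable<Object>, Boolean> keys = new ConcurrentHashMap<>();

  void add(Comparable<Object> key) {
    keys.put(key, Boolean.TRUE); // the value is irrelevant
  }

  boolean contains(Comparable<Object> key) {
    return keys.containsKey(key);
  }
}
```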
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java
index 2ca4b0b2cb9f9..121dfbd4f6838 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java
@@ -19,10 +19,13 @@
import java.io.File;
import java.io.IOException;
+import java.lang.ref.SoftReference;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
import static java.nio.charset.StandardCharsets.UTF_8;
import com.google.common.annotations.VisibleForTesting;
@@ -64,6 +67,13 @@ public class LevelDB implements KVStore {
  private final ConcurrentMap<String, byte[]> typeAliases;
  private final ConcurrentMap<Class<?>, LevelDBTypeInfo> types;
+ /**
+ * Trying to close a JNI LevelDB handle with a closed DB causes JVM crashes. This is used to
+ * ensure that all iterators are correctly closed before LevelDB is closed. Use soft reference
+ * to ensure that the iterator can be GCed, when it is only referenced here.
+ */
+  private final ConcurrentLinkedQueue<SoftReference<LevelDBIterator<?>>> iteratorTracker;
+
public LevelDB(File path) throws Exception {
this(path, new KVStoreSerializer());
}
@@ -94,6 +104,8 @@ public LevelDB(File path, KVStoreSerializer serializer) throws Exception {
aliases = new HashMap<>();
}
typeAliases = new ConcurrentHashMap<>(aliases);
+
+ iteratorTracker = new ConcurrentLinkedQueue<>();
}
@Override
@@ -142,24 +154,72 @@ public void write(Object value) throws Exception {
try (WriteBatch batch = db().createWriteBatch()) {
byte[] data = serializer.serialize(value);
synchronized (ti) {
- Object existing;
- try {
- existing = get(ti.naturalIndex().entityKey(null, value), value.getClass());
- } catch (NoSuchElementException e) {
- existing = null;
- }
+ updateBatch(batch, value, data, value.getClass(), ti.naturalIndex(), ti.indices());
+ db().write(batch);
+ }
+ }
+ }
+
+  public void writeAll(List<?> values) throws Exception {
+ Preconditions.checkArgument(values != null && !values.isEmpty(),
+ "Non-empty values required.");
+
+    // Group the values by class, in case the list mixes values of different classes.
+    // The typical use case is a single class.
+    // A NullPointerException is thrown if values contains a null object.
+    for (Map.Entry<? extends Class<?>, ? extends List<?>> entry :
+ values.stream().collect(Collectors.groupingBy(Object::getClass)).entrySet()) {
+
+      final Iterator<?> valueIter = entry.getValue().iterator();
+      final Iterator<byte[]> serializedValueIter;
+
+ // Deserialize outside synchronized block
+      List<byte[]> list = new ArrayList<>(entry.getValue().size());
+ for (Object value : values) {
+ list.add(serializer.serialize(value));
+ }
+ serializedValueIter = list.iterator();
+
+      final Class<?> klass = entry.getKey();
+ final LevelDBTypeInfo ti = getTypeInfo(klass);
+
+ synchronized (ti) {
+ final LevelDBTypeInfo.Index naturalIndex = ti.naturalIndex();
+        final Collection<LevelDBTypeInfo.Index> indices = ti.indices();
- PrefixCache cache = new PrefixCache(value);
- byte[] naturalKey = ti.naturalIndex().toKey(ti.naturalIndex().getValue(value));
- for (LevelDBTypeInfo.Index idx : ti.indices()) {
- byte[] prefix = cache.getPrefix(idx);
- idx.add(batch, value, existing, data, naturalKey, prefix);
+ try (WriteBatch batch = db().createWriteBatch()) {
+ while (valueIter.hasNext()) {
+ updateBatch(batch, valueIter.next(), serializedValueIter.next(), klass,
+ naturalIndex, indices);
+ }
+ db().write(batch);
}
- db().write(batch);
}
}
}
+ private void updateBatch(
+ WriteBatch batch,
+ Object value,
+ byte[] data,
+      Class<?> klass,
+      LevelDBTypeInfo.Index naturalIndex,
+      Collection<LevelDBTypeInfo.Index> indices) throws Exception {
+ Object existing;
+ try {
+ existing = get(naturalIndex.entityKey(null, value), klass);
+ } catch (NoSuchElementException e) {
+ existing = null;
+ }
+
+ PrefixCache cache = new PrefixCache(value);
+ byte[] naturalKey = naturalIndex.toKey(naturalIndex.getValue(value));
+ for (LevelDBTypeInfo.Index idx : indices) {
+ byte[] prefix = cache.getPrefix(idx);
+ idx.add(batch, value, existing, data, naturalKey, prefix);
+ }
+ }
+
@Override
  public void delete(Class<?> type, Object naturalKey) throws Exception {
Preconditions.checkArgument(naturalKey != null, "Null keys are not allowed.");
@@ -189,7 +249,9 @@ public <T> KVStoreView<T> view(Class<T> type) throws Exception {
@Override
    public Iterator<T> iterator() {
try {
- return new LevelDBIterator<>(type, LevelDB.this, this);
+        LevelDBIterator<T> it = new LevelDBIterator<>(type, LevelDB.this, this);
+ iteratorTracker.add(new SoftReference<>(it));
+ return it;
} catch (Exception e) {
throw Throwables.propagate(e);
}
@@ -238,6 +300,14 @@ public void close() throws IOException {
}
try {
+ if (iteratorTracker != null) {
+        for (SoftReference<LevelDBIterator<?>> ref: iteratorTracker) {
+          LevelDBIterator<?> it = ref.get();
+ if (it != null) {
+ it.close();
+ }
+ }
+ }
_db.close();
} catch (IOException ioe) {
throw ioe;
@@ -252,6 +322,7 @@ public void close() throws IOException {
* with a closed DB can cause JVM crashes, so this ensures that situation does not happen.
*/
  void closeIterator(LevelDBIterator<?> it) throws IOException {
+ notifyIteratorClosed(it);
synchronized (this._db) {
DB _db = this._db.get();
if (_db != null) {
@@ -260,6 +331,14 @@ void closeIterator(LevelDBIterator> it) throws IOException {
}
}
+ /**
+ * Remove iterator from iterator tracker. `LevelDBIterator` calls it to notify
+ * iterator is closed.
+ */
+  void notifyIteratorClosed(LevelDBIterator<?> it) {
+ iteratorTracker.removeIf(ref -> it.equals(ref.get()));
+ }
+
/** Returns metadata about indices for the given type. */
  LevelDBTypeInfo getTypeInfo(Class<?> type) throws Exception {
LevelDBTypeInfo ti = types.get(type);
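Editor's note: a minimal usage sketch of the LevelDB changes above (batched writeAll and iterator tracking on close). It is not part of the patch; the Entry value type and file name here are hypothetical, chosen only to mirror the suite's CustomType1.

```java
import java.io.File;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.util.kvstore.KVIndex;
import org.apache.spark.util.kvstore.LevelDB;

public class LevelDBWriteAllSketch {
  // Hypothetical value type; @KVIndex marks the natural key, as in CustomType1.
  public static class Entry {
    @KVIndex public String key;
  }

  private static Entry entry(String key) {
    Entry e = new Entry();
    e.key = key;
    return e;
  }

  public static void main(String[] args) throws Exception {
    File path = File.createTempFile("writeall_sketch.", ".ldb");
    path.delete();
    LevelDB db = new LevelDB(path);

    // writeAll groups values by class and writes each group in a single WriteBatch.
    db.writeAll(Arrays.asList(entry("a"), entry("b"), entry("c")));

    Iterator<Entry> it = db.view(Entry.class).iterator();
    while (it.hasNext()) {
      System.out.println(it.next().key);
    }

    // close() now also closes iterators still tracked via soft references,
    // so a live LevelDBIterator no longer risks a JNI crash at shutdown.
    db.close();
  }
}
```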
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java
index 94e8c9fc5796c..e8fb4fac5ba17 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java
@@ -185,6 +185,7 @@ public boolean skip(long n) {
@Override
public synchronized void close() throws IOException {
+ db.notifyIteratorClosed(this);
if (!closed) {
it.close();
closed = true;
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java
index d7423537ddfcf..4d7f76f673865 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java
@@ -133,7 +133,7 @@ class LevelDBTypeInfo {
// First create the parent indices, then the child indices.
ti.indices().forEach(idx -> {
- // In LevelDB, there is no parent index for the NUTURAL INDEX.
+ // In LevelDB, there is no parent index for the NATURAL INDEX.
if (idx.parent().isEmpty() || idx.value().equals(KVIndex.NATURAL_INDEX_NAME)) {
indices.put(idx.value(), new Index(idx, ti.getAccessor(idx.value()), null));
}
diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java
index 0b755ba0e8000..f6566617765d4 100644
--- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java
+++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java
@@ -19,6 +19,7 @@
import java.io.File;
import java.util.Arrays;
+import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.stream.Collectors;
@@ -276,6 +277,41 @@ public void testNegativeIndexValues() throws Exception {
assertEquals(expected, results);
}
+ @Test
+ public void testCloseLevelDBIterator() throws Exception {
+ // SPARK-31929: test when LevelDB.close() is called, related LevelDBIterators
+ // are closed. And files opened by iterators are also closed.
+ File dbPathForCloseTest = File
+ .createTempFile(
+ "test_db_close.",
+ ".ldb");
+ dbPathForCloseTest.delete();
+ LevelDB dbForCloseTest = new LevelDB(dbPathForCloseTest);
+ for (int i = 0; i < 8192; i++) {
+ dbForCloseTest.write(createCustomType1(i));
+ }
+ String key = dbForCloseTest
+ .view(CustomType1.class).iterator().next().key;
+ assertEquals("key0", key);
+    Iterator<CustomType1> it0 = dbForCloseTest
+ .view(CustomType1.class).max(1).iterator();
+ while (it0.hasNext()) {
+ it0.next();
+ }
+ System.gc();
+    Iterator<CustomType1> it1 = dbForCloseTest
+ .view(CustomType1.class).iterator();
+ assertEquals("key0", it1.next().key);
+    try (KVStoreIterator<CustomType1> it2 = dbForCloseTest
+ .view(CustomType1.class).closeableIterator()) {
+ assertEquals("key0", it2.next().key);
+ }
+ dbForCloseTest.close();
+ assertTrue(dbPathForCloseTest.exists());
+ FileUtils.deleteQuietly(dbPathForCloseTest);
+ assertTrue(!dbPathForCloseTest.exists());
+ }
+
private CustomType1 createCustomType1(int i) {
CustomType1 t = new CustomType1();
t.key = "key" + i;
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 9d5bc9aae0719..0318f60d546e7 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.12</artifactId>
-   <version>3.1.0-SNAPSHOT</version>
+   <version>3.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
@@ -91,6 +91,10 @@
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-crypto</artifactId>
+    <dependency>
+      <groupId>org.roaringbitmap</groupId>
+      <artifactId>RoaringBitmap</artifactId>
+    </dependency>
diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java
index 6dcc703e92669..eb2882074d7c7 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java
@@ -303,7 +303,7 @@ public void close() {
@Override
public String toString() {
return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
- .append("remoteAdress", channel.remoteAddress())
+ .append("remoteAddress", channel.remoteAddress())
.append("clientId", clientId)
.append("isActive", isActive())
.toString();
diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
index 24c436a504fa8..43408d43e577e 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
@@ -254,7 +254,7 @@ TransportClient createClient(InetSocketAddress address)
// Disable Nagle's Algorithm since we don't want packets to wait
.option(ChannelOption.TCP_NODELAY, true)
.option(ChannelOption.SO_KEEPALIVE, true)
- .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, conf.connectionTimeoutMs())
+ .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, conf.connectionCreationTimeoutMs())
.option(ChannelOption.ALLOCATOR, pooledAllocator);
if (conf.receiveBuf() > 0) {
@@ -280,9 +280,10 @@ public void initChannel(SocketChannel ch) {
// Connect to the remote server
long preConnect = System.nanoTime();
ChannelFuture cf = bootstrap.connect(address);
- if (!cf.await(conf.connectionTimeoutMs())) {
+ if (!cf.await(conf.connectionCreationTimeoutMs())) {
throw new IOException(
- String.format("Connecting to %s timed out (%s ms)", address, conf.connectionTimeoutMs()));
+ String.format("Connecting to %s timed out (%s ms)",
+ address, conf.connectionCreationTimeoutMs()));
} else if (cf.cause() != null) {
throw new IOException(String.format("Failed to connect to %s", address), cf.cause());
}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java
index 2f143f77fa4ae..3aac2d2441d2a 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java
@@ -188,6 +188,7 @@ public void handle(ResponseMessage message) throws Exception {
if (listener == null) {
logger.warn("Ignoring response for RPC {} from {} ({} bytes) since it is not outstanding",
resp.requestId, getRemoteAddress(channel), resp.body().size());
+ resp.body().release();
} else {
outstandingRpcs.remove(resp.requestId);
try {
diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java
index 64fdb32a67ada..c2b2edc7f07d5 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java
@@ -287,7 +287,7 @@ private byte[] doCipherOp(int mode, byte[] in, boolean isFinal)
}
}
} catch (InternalError ie) {
- // SPARK-25535. The commons-cryto library will throw InternalError if something goes wrong,
+ // SPARK-25535. The commons-crypto library will throw InternalError if something goes wrong,
// and leave bad state behind in the Java wrappers, so it's not safe to use them afterwards.
if (mode == Cipher.ENCRYPT_MODE) {
this.encryptor = null;
diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/README.md b/common/network-common/src/main/java/org/apache/spark/network/crypto/README.md
index 14df703270498..7a9fa3a91d143 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/crypto/README.md
+++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/README.md
@@ -155,4 +155,4 @@ server will be able to understand. This will cause the server to close the conne
attacker tries to send any command to the server. The attacker can just hold the channel open for
some time, which will be closed when the server times out the channel. These issues could be
separately mitigated by adding a shorter timeout for the first message after authentication, and
-potentially by adding host blacklists if a possible attack is detected from a particular host.
+potentially by adding host reject-lists if a possible attack is detected from a particular host.
diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java
index 490915f6de4b3..8bab808ad6864 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java
@@ -17,9 +17,12 @@
package org.apache.spark.network.protocol;
+import java.io.IOException;
+import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import io.netty.buffer.ByteBuf;
+import org.roaringbitmap.RoaringBitmap;
/** Provides a canonical set of Encoders for simple types. */
public class Encoders {
@@ -44,6 +47,51 @@ public static String decode(ByteBuf buf) {
}
}
+ /**
+ * Bitmaps are encoded with their serialization length followed by the serialization bytes.
+ *
+ * @since 3.1.0
+ */
+ public static class Bitmaps {
+ public static int encodedLength(RoaringBitmap b) {
+ // Compress the bitmap before serializing it. Note that since BlockTransferMessage
+ // needs to invoke encodedLength first to figure out the length for the ByteBuf, it
+ // guarantees that the bitmap will always be compressed before being serialized.
+ b.trim();
+ b.runOptimize();
+ return b.serializedSizeInBytes();
+ }
+
+ /**
+ * The input ByteBuf for this encoder should have enough write capacity to fit the serialized
+ * bitmap. Other encoders which use {@link io.netty.buffer.AbstractByteBuf#writeBytes(byte[])}
+ * to write can expand the buf as writeBytes calls {@link ByteBuf#ensureWritable} internally.
+ * However, this encoder doesn't rely on netty's writeBytes and will fail if the input buf
+ * doesn't have enough write capacity.
+ */
+ public static void encode(ByteBuf buf, RoaringBitmap b) {
+ // RoaringBitmap requires nio ByteBuffer for serde. We expose the netty ByteBuf as a nio
+ // ByteBuffer. Here, we need to explicitly manage the index so we can write into the
+ // ByteBuffer, and the write is reflected in the underneath ByteBuf.
+ ByteBuffer byteBuffer = buf.nioBuffer(buf.writerIndex(), buf.writableBytes());
+ b.serialize(byteBuffer);
+ buf.writerIndex(buf.writerIndex() + byteBuffer.position());
+ }
+
+ public static RoaringBitmap decode(ByteBuf buf) {
+ RoaringBitmap bitmap = new RoaringBitmap();
+ try {
+ bitmap.deserialize(buf.nioBuffer());
+ // RoaringBitmap deserialize does not advance the reader index of the underlying ByteBuf.
+ // Manually update the index here.
+ buf.readerIndex(buf.readerIndex() + bitmap.serializedSizeInBytes());
+ } catch (IOException e) {
+ throw new RuntimeException("Exception while decoding bitmap", e);
+ }
+ return bitmap;
+ }
+ }
+
/** Byte arrays are encoded with their length followed by bytes. */
public static class ByteArrays {
public static int encodedLength(byte[] arr) {
@@ -135,4 +183,35 @@ public static long[] decode(ByteBuf buf) {
return longs;
}
}
+
+ /**
+ * Bitmap arrays are encoded with the number of bitmaps followed by per-Bitmap encoding.
+ *
+ * @since 3.1.0
+ */
+ public static class BitmapArrays {
+ public static int encodedLength(RoaringBitmap[] bitmaps) {
+ int totalLength = 4;
+ for (RoaringBitmap b : bitmaps) {
+ totalLength += Bitmaps.encodedLength(b);
+ }
+ return totalLength;
+ }
+
+ public static void encode(ByteBuf buf, RoaringBitmap[] bitmaps) {
+ buf.writeInt(bitmaps.length);
+ for (RoaringBitmap b : bitmaps) {
+ Bitmaps.encode(buf, b);
+ }
+ }
+
+ public static RoaringBitmap[] decode(ByteBuf buf) {
+ int numBitmaps = buf.readInt();
+ RoaringBitmap[] bitmaps = new RoaringBitmap[numBitmaps];
+ for (int i = 0; i < bitmaps.length; i ++) {
+ bitmaps[i] = Bitmaps.decode(buf);
+ }
+ return bitmaps;
+ }
+ }
}
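Editor's note: a short illustrative round trip for the Encoders.Bitmaps helpers added above, not part of the patch. The key point from the javadoc is that encode() does not grow the buffer, so the caller must size it from encodedLength() first; the class name below is made up.

```java
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import org.roaringbitmap.RoaringBitmap;

import org.apache.spark.network.protocol.Encoders;

public class BitmapEncodingSketch {
  public static void main(String[] args) {
    RoaringBitmap bitmap = new RoaringBitmap();
    bitmap.add(1, 5, 42);

    // encodedLength() trims and run-optimizes the bitmap, so call it first and
    // size the buffer from it: encode() will not call ensureWritable on its own.
    ByteBuf buf = Unpooled.buffer(Encoders.Bitmaps.encodedLength(bitmap));
    Encoders.Bitmaps.encode(buf, bitmap);

    RoaringBitmap decoded = Encoders.Bitmaps.decode(buf);
    System.out.println(bitmap.equals(decoded)); // true
  }
}
```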
diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java
index 82810dacdad84..9a71cf593e28c 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java
@@ -88,12 +88,14 @@ public void processFetchRequest(
logger.trace("Received req from {} to fetch block {}", getRemoteAddress(channel),
msg.streamChunkId);
}
- long chunksBeingTransferred = streamManager.chunksBeingTransferred();
- if (chunksBeingTransferred >= maxChunksBeingTransferred) {
- logger.warn("The number of chunks being transferred {} is above {}, close the connection.",
- chunksBeingTransferred, maxChunksBeingTransferred);
- channel.close();
- return;
+ if (maxChunksBeingTransferred < Long.MAX_VALUE) {
+ long chunksBeingTransferred = streamManager.chunksBeingTransferred();
+ if (chunksBeingTransferred >= maxChunksBeingTransferred) {
+ logger.warn("The number of chunks being transferred {} is above {}, close the connection.",
+ chunksBeingTransferred, maxChunksBeingTransferred);
+ channel.close();
+ return;
+ }
}
ManagedBuffer buf;
try {
diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java
index e53a0c1a0852e..2d439473bc226 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java
@@ -165,8 +165,9 @@ public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exc
if (hasInFlightRequests) {
String address = getRemoteAddress(ctx.channel());
logger.error("Connection to {} has been quiet for {} ms while there are outstanding " +
- "requests. Assuming connection is dead; please adjust spark.network.timeout if " +
- "this is wrong.", address, requestTimeoutNs / 1000 / 1000);
+ "requests. Assuming connection is dead; please adjust" +
+ " spark.{}.io.connectionTimeout if this is wrong.",
+ address, requestTimeoutNs / 1000 / 1000, transportContext.getConf().getModuleName());
client.timeOut();
ctx.close();
} else if (closeIdleConnections) {
diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java
index f178928006902..4a30f8de07827 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java
@@ -124,12 +124,14 @@ private void processStreamRequest(final StreamRequest req) {
req.streamId);
}
- long chunksBeingTransferred = streamManager.chunksBeingTransferred();
- if (chunksBeingTransferred >= maxChunksBeingTransferred) {
- logger.warn("The number of chunks being transferred {} is above {}, close the connection.",
- chunksBeingTransferred, maxChunksBeingTransferred);
- channel.close();
- return;
+ if (maxChunksBeingTransferred < Long.MAX_VALUE) {
+ long chunksBeingTransferred = streamManager.chunksBeingTransferred();
+ if (chunksBeingTransferred >= maxChunksBeingTransferred) {
+ logger.warn("The number of chunks being transferred {} is above {}, close the connection.",
+ chunksBeingTransferred, maxChunksBeingTransferred);
+ channel.close();
+ return;
+ }
}
ManagedBuffer buf;
try {
diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java b/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java
index 423cc0c70ea02..ffbc54a0e0ad8 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java
@@ -162,7 +162,6 @@ public static PooledByteBufAllocator createPooledByteBufAllocator(
Math.min(PooledByteBufAllocator.defaultNumDirectArena(), allowDirectBufs ? numCores : 0),
PooledByteBufAllocator.defaultPageSize(),
PooledByteBufAllocator.defaultMaxOrder(),
- allowCache ? PooledByteBufAllocator.defaultTinyCacheSize() : 0,
allowCache ? PooledByteBufAllocator.defaultSmallCacheSize() : 0,
allowCache ? PooledByteBufAllocator.defaultNormalCacheSize() : 0,
allowCache ? PooledByteBufAllocator.defaultUseCacheForAllThreads() : false
diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java
index 6c37f9a382376..f051042a7adb4 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java
@@ -19,6 +19,7 @@
import java.util.Locale;
import java.util.Properties;
+import java.util.concurrent.TimeUnit;
import com.google.common.primitives.Ints;
import io.netty.util.NettyRuntime;
@@ -31,6 +32,7 @@ public class TransportConf {
private final String SPARK_NETWORK_IO_MODE_KEY;
private final String SPARK_NETWORK_IO_PREFERDIRECTBUFS_KEY;
private final String SPARK_NETWORK_IO_CONNECTIONTIMEOUT_KEY;
+ private final String SPARK_NETWORK_IO_CONNECTIONCREATIONTIMEOUT_KEY;
private final String SPARK_NETWORK_IO_BACKLOG_KEY;
private final String SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY;
private final String SPARK_NETWORK_IO_SERVERTHREADS_KEY;
@@ -54,6 +56,7 @@ public TransportConf(String module, ConfigProvider conf) {
SPARK_NETWORK_IO_MODE_KEY = getConfKey("io.mode");
SPARK_NETWORK_IO_PREFERDIRECTBUFS_KEY = getConfKey("io.preferDirectBufs");
SPARK_NETWORK_IO_CONNECTIONTIMEOUT_KEY = getConfKey("io.connectionTimeout");
+ SPARK_NETWORK_IO_CONNECTIONCREATIONTIMEOUT_KEY = getConfKey("io.connectionCreationTimeout");
SPARK_NETWORK_IO_BACKLOG_KEY = getConfKey("io.backLog");
SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY = getConfKey("io.numConnectionsPerPeer");
SPARK_NETWORK_IO_SERVERTHREADS_KEY = getConfKey("io.serverThreads");
@@ -94,7 +97,7 @@ public boolean preferDirectBufs() {
return conf.getBoolean(SPARK_NETWORK_IO_PREFERDIRECTBUFS_KEY, true);
}
- /** Connect timeout in milliseconds. Default 120 secs. */
+ /** Connection idle timeout in milliseconds. Default 120 secs. */
public int connectionTimeoutMs() {
long defaultNetworkTimeoutS = JavaUtils.timeStringAsSec(
conf.get("spark.network.timeout", "120s"));
@@ -103,6 +106,14 @@ public int connectionTimeoutMs() {
return (int) defaultTimeoutMs;
}
+  /** Connection creation timeout in milliseconds. Defaults to connectionTimeoutMs (120 secs). */
+ public int connectionCreationTimeoutMs() {
+ long connectionTimeoutS = TimeUnit.MILLISECONDS.toSeconds(connectionTimeoutMs());
+ long defaultTimeoutMs = JavaUtils.timeStringAsSec(
+ conf.get(SPARK_NETWORK_IO_CONNECTIONCREATIONTIMEOUT_KEY, connectionTimeoutS + "s")) * 1000;
+ return (int) defaultTimeoutMs;
+ }
+
/** Number of concurrent connections between two nodes for fetching data. */
public int numConnectionsPerPeer() {
return conf.getInt(SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY, 1);
@@ -290,7 +301,7 @@ public boolean sharedByteBufAllocators() {
}
/**
- * If enabled then off-heap byte buffers will be prefered for the shared ByteBuf allocators.
+ * If enabled then off-heap byte buffers will be preferred for the shared ByteBuf allocators.
*/
public boolean preferDirectBufsForSharedByteBufAllocators() {
return conf.getBoolean("spark.network.io.preferDirectBufs", true);
@@ -363,4 +374,49 @@ public boolean useOldFetchProtocol() {
return conf.getBoolean("spark.shuffle.useOldFetchProtocol", false);
}
+ /**
+ * Class name of the implementation of MergedShuffleFileManager that merges the blocks
+ * pushed to it when push-based shuffle is enabled. By default, push-based shuffle is disabled at
+ * a cluster level because this configuration is set to
+ * 'org.apache.spark.network.shuffle.ExternalBlockHandler$NoOpMergedShuffleFileManager'.
+ * To turn on push-based shuffle at a cluster level, set the configuration to
+ * 'org.apache.spark.network.shuffle.RemoteBlockPushResolver'.
+ */
+ public String mergedShuffleFileManagerImpl() {
+ return conf.get("spark.shuffle.server.mergedShuffleFileManagerImpl",
+ "org.apache.spark.network.shuffle.ExternalBlockHandler$NoOpMergedShuffleFileManager");
+ }
+
+ /**
+ * The minimum size of a chunk when dividing a merged shuffle file into multiple chunks during
+ * push-based shuffle.
+ * A merged shuffle file consists of multiple small shuffle blocks. Fetching the
+ * complete merged shuffle file in a single response increases the memory requirements for the
+ * clients. Instead of serving the entire merged file, the shuffle service serves the
+   * merged file in `chunks`. A `chunk` consists of a few complete shuffle blocks, and this
+ * configuration controls how big a chunk can get. A corresponding index file for each merged
+ * shuffle file will be generated indicating chunk boundaries.
+ */
+ public int minChunkSizeInMergedShuffleFile() {
+ return Ints.checkedCast(JavaUtils.byteStringAsBytes(
+ conf.get("spark.shuffle.server.minChunkSizeInMergedShuffleFile", "2m")));
+ }
+
+ /**
+ * The size of cache in memory which is used in push-based shuffle for storing merged index files.
+ */
+ public long mergedIndexCacheSize() {
+ return JavaUtils.byteStringAsBytes(
+ conf.get("spark.shuffle.server.mergedIndexCacheSize", "100m"));
+ }
+
+ /**
+   * The threshold for the number of IOExceptions while merging shuffle blocks to a shuffle partition.
+   * When the number of IOExceptions while writing to the merged shuffle data/index/meta file exceeds
+   * this threshold, the shuffle server responds to the client to stop pushing shuffle
+   * blocks for this shuffle partition.
+ */
+ public int ioExceptionsThresholdDuringMerge() {
+ return conf.getInt("spark.shuffle.server.ioExceptionsThresholdDuringMerge", 4);
+ }
}
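Editor's note: a minimal sketch of how the new keys above resolve through TransportConf, not part of the patch. It assumes the MapConfigProvider test helper referenced elsewhere in this diff; the values and class name are illustrative only.

```java
import com.google.common.collect.ImmutableMap;

import org.apache.spark.network.util.MapConfigProvider;
import org.apache.spark.network.util.TransportConf;

public class TransportConfSketch {
  public static void main(String[] args) {
    // Module "shuffle" prefixes io.* keys as spark.shuffle.io.*; the
    // push-based shuffle keys are read verbatim.
    TransportConf conf = new TransportConf("shuffle", new MapConfigProvider(ImmutableMap.of(
        "spark.shuffle.io.connectionCreationTimeout", "30s",
        "spark.shuffle.server.minChunkSizeInMergedShuffleFile", "4m")));

    System.out.println(conf.connectionCreationTimeoutMs());      // 30000
    System.out.println(conf.minChunkSizeInMergedShuffleFile());  // 4194304
    // Unset here, so it falls back to spark.network.timeout (120s by default).
    System.out.println(conf.connectionTimeoutMs());              // 120000
  }
}
```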
diff --git a/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java b/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
index 56a2b805f154c..c2c5ffa43e0ed 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
@@ -22,7 +22,9 @@
public class TestUtils {
public static String getLocalHost() {
try {
- return InetAddress.getLocalHost().getHostAddress();
+      return System.getenv().containsKey("SPARK_LOCAL_IP") ?
+        System.getenv("SPARK_LOCAL_IP") :
+        InetAddress.getLocalHost().getHostAddress();
} catch (Exception e) {
throw new RuntimeException(e);
}
diff --git a/common/network-common/src/test/java/org/apache/spark/network/client/TransportClientFactorySuite.java b/common/network-common/src/test/java/org/apache/spark/network/client/TransportClientFactorySuite.java
index ea0ac51589dc3..277ff85db7bf5 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/client/TransportClientFactorySuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/client/TransportClientFactorySuite.java
@@ -29,9 +29,7 @@
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
-import org.junit.Rule;
import org.junit.Test;
-import org.junit.rules.ExpectedException;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotSame;
@@ -227,11 +225,8 @@ public void closeFactoryBeforeCreateClient() throws IOException, InterruptedExce
factory.createClient(TestUtils.getLocalHost(), server1.getPort());
}
- @Rule
- public ExpectedException expectedException = ExpectedException.none();
-
@Test
- public void fastFailConnectionInTimeWindow() throws IOException, InterruptedException {
+ public void fastFailConnectionInTimeWindow() {
TransportClientFactory factory = context.createClientFactory();
TransportServer server = context.createServer();
int unreachablePort = server.getPort();
@@ -241,9 +236,7 @@ public void fastFailConnectionInTimeWindow() throws IOException, InterruptedExce
} catch (Exception e) {
assert(e instanceof IOException);
}
- expectedException.expect(IOException.class);
- expectedException.expectMessage("fail this connection directly");
- factory.createClient(TestUtils.getLocalHost(), unreachablePort, true);
- expectedException = ExpectedException.none();
+ Assert.assertThrows("fail this connection directly", IOException.class,
+ () -> factory.createClient(TestUtils.getLocalHost(), unreachablePort, true));
}
}
diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java
index 0790f0079c2bd..1c2061699a128 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java
@@ -150,8 +150,8 @@ public void testEncryptedMessage() throws Exception {
ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length);
TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(buf);
- while (emsg.transfered() < emsg.count()) {
- emsg.transferTo(channel, emsg.transfered());
+ while (emsg.transferred() < emsg.count()) {
+ emsg.transferTo(channel, emsg.transferred());
}
assertEquals(data.length, channel.length());
} finally {
@@ -196,9 +196,9 @@ public Long answer(InvocationOnMock invocationOnMock) throws Throwable {
TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(region);
ByteArrayWritableChannel channel = new ByteArrayWritableChannel(testDataLength);
// "transferTo" should act correctly when the underlying FileRegion transfers 0 bytes.
- assertEquals(0L, emsg.transferTo(channel, emsg.transfered()));
- assertEquals(testDataLength, emsg.transferTo(channel, emsg.transfered()));
- assertEquals(emsg.transfered(), emsg.count());
+ assertEquals(0L, emsg.transferTo(channel, emsg.transferred()));
+ assertEquals(testDataLength, emsg.transferTo(channel, emsg.transferred()));
+ assertEquals(emsg.transferred(), emsg.count());
assertEquals(4, channel.length());
} finally {
client.close();
diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java
index 6b2186f73cd0c..e62b8cb24e0ed 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java
@@ -29,11 +29,11 @@
import org.apache.spark.network.util.MapConfigProvider;
import org.apache.spark.network.util.TransportConf;
import org.hamcrest.CoreMatchers;
+import org.hamcrest.MatcherAssert;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
@@ -81,7 +81,7 @@ CryptoInputStream createInputStream(ReadableByteChannel ch) throws IOException {
channel.writeInbound(buffer2);
fail("Should have raised an exception");
} catch (Throwable expected) {
- assertThat(expected, CoreMatchers.instanceOf(IOException.class));
+ MatcherAssert.assertThat(expected, CoreMatchers.instanceOf(IOException.class));
assertEquals(0, buffer2.refCnt());
}
diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/EncodersSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/EncodersSuite.java
new file mode 100644
index 0000000000000..6e89702c04396
--- /dev/null
+++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/EncodersSuite.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.network.protocol;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.Unpooled;
+import org.junit.Test;
+import org.roaringbitmap.RoaringBitmap;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests for {@link Encoders}.
+ */
+public class EncodersSuite {
+
+ @Test
+ public void testRoaringBitmapEncodeDecode() {
+ RoaringBitmap bitmap = new RoaringBitmap();
+ bitmap.add(1, 2, 3);
+ ByteBuf buf = Unpooled.buffer(Encoders.Bitmaps.encodedLength(bitmap));
+ Encoders.Bitmaps.encode(buf, bitmap);
+ RoaringBitmap decodedBitmap = Encoders.Bitmaps.decode(buf);
+ assertEquals(bitmap, decodedBitmap);
+ }
+
+ @Test (expected = java.nio.BufferOverflowException.class)
+ public void testRoaringBitmapEncodeShouldFailWhenBufferIsSmall() {
+ RoaringBitmap bitmap = new RoaringBitmap();
+ bitmap.add(1, 2, 3);
+ ByteBuf buf = Unpooled.buffer(4);
+ Encoders.Bitmaps.encode(buf, bitmap);
+ }
+
+ @Test
+ public void testBitmapArraysEncodeDecode() {
+ RoaringBitmap[] bitmaps = new RoaringBitmap[] {
+ new RoaringBitmap(),
+ new RoaringBitmap(),
+ new RoaringBitmap(), // empty
+ new RoaringBitmap(),
+ new RoaringBitmap()
+ };
+ bitmaps[0].add(1, 2, 3);
+ bitmaps[1].add(1, 2, 4);
+ bitmaps[3].add(7L, 9L);
+ bitmaps[4].add(1L, 100L);
+ ByteBuf buf = Unpooled.buffer(Encoders.BitmapArrays.encodedLength(bitmaps));
+ Encoders.BitmapArrays.encode(buf, bitmaps);
+ RoaringBitmap[] decodedBitmaps = Encoders.BitmapArrays.decode(buf);
+ assertArrayEquals(bitmaps, decodedBitmaps);
+ }
+}
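
For illustration (not part of the patch): the suite above pins down the contract of the new Encoders.Bitmaps helpers, namely that the destination buffer must be sized with encodedLength() before encode() is called, and that an undersized buffer surfaces as java.nio.BufferOverflowException. A minimal usage sketch, assuming only the API exercised by these tests:

```java
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import org.apache.spark.network.protocol.Encoders;
import org.roaringbitmap.RoaringBitmap;

public class BitmapEncodeExample {
  public static void main(String[] args) {
    RoaringBitmap bitmap = new RoaringBitmap();
    bitmap.add(1, 2, 3);

    // Size the buffer from encodedLength(); a smaller buffer fails at encode time.
    ByteBuf buf = Unpooled.buffer(Encoders.Bitmaps.encodedLength(bitmap));
    Encoders.Bitmaps.encode(buf, bitmap);

    RoaringBitmap roundTripped = Encoders.Bitmaps.decode(buf);
    System.out.println(bitmap.equals(roundTripped)); // true
  }
}
```
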
diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java
index 3bff34e210e3c..af1c2878672c0 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java
@@ -129,8 +129,8 @@ private void testFileRegionBody(int totalWrites, int writesPerCall) throws Excep
private ByteBuf doWrite(MessageWithHeader msg, int minExpectedWrites) throws Exception {
int writes = 0;
ByteArrayWritableChannel channel = new ByteArrayWritableChannel((int) msg.count());
- while (msg.transfered() < msg.count()) {
- msg.transferTo(channel, msg.transfered());
+ while (msg.transferred() < msg.count()) {
+ msg.transferTo(channel, msg.transferred());
writes++;
}
assertTrue("Not enough writes!", minExpectedWrites <= writes);
diff --git a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java
index ecaeec98da182..32c9acd327213 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java
@@ -191,28 +191,28 @@ public void testEncryptedMessage() throws Exception {
SaslEncryption.EncryptedMessage emsg =
new SaslEncryption.EncryptedMessage(backend, msg, 1024);
- long count = emsg.transferTo(channel, emsg.transfered());
+ long count = emsg.transferTo(channel, emsg.transferred());
assertTrue(count < data.length);
assertTrue(count > 0);
// Here, the output buffer is full so nothing should be transferred.
- assertEquals(0, emsg.transferTo(channel, emsg.transfered()));
+ assertEquals(0, emsg.transferTo(channel, emsg.transferred()));
// Now there's room in the buffer, but not enough to transfer all the remaining data,
// so the dummy count should be returned.
channel.reset();
- assertEquals(1, emsg.transferTo(channel, emsg.transfered()));
+ assertEquals(1, emsg.transferTo(channel, emsg.transferred()));
// Eventually, the whole message should be transferred.
for (int i = 0; i < data.length / 32 - 2; i++) {
channel.reset();
- assertEquals(1, emsg.transferTo(channel, emsg.transfered()));
+ assertEquals(1, emsg.transferTo(channel, emsg.transferred()));
}
channel.reset();
- count = emsg.transferTo(channel, emsg.transfered());
+ count = emsg.transferTo(channel, emsg.transferred());
assertTrue("Unexpected count: " + count, count > 1 && count < data.length);
- assertEquals(data.length, emsg.transfered());
+ assertEquals(data.length, emsg.transferred());
} finally {
msg.release();
}
@@ -237,9 +237,9 @@ public void testEncryptedMessageChunking() throws Exception {
new SaslEncryption.EncryptedMessage(backend, msg.convertToNetty(), data.length / 8);
ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length);
- while (emsg.transfered() < emsg.count()) {
+ while (emsg.transferred() < emsg.count()) {
channel.reset();
- emsg.transferTo(channel, emsg.transfered());
+ emsg.transferTo(channel, emsg.transferred());
}
verify(backend, times(8)).wrap(any(byte[].class), anyInt(), anyInt());
diff --git a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java
index 45e1836da641f..634b40ed450ee 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java
@@ -72,7 +72,7 @@ public void testMissingChunk() {
Assert.assertNotNull(getChunk(manager, streamId, 2));
manager.connectionTerminated(dummyChannel);
- // loaded buffers are not released yet as in production a MangedBuffer returned by getChunk()
+ // loaded buffers are not released yet as in production a ManagedBuffer returned by getChunk()
// would only be released by Netty after it is written to the network
Mockito.verify(buffer1, Mockito.never()).release();
Mockito.verify(buffer2, Mockito.never()).release();
diff --git a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java
index 4b67aa80351d2..163c52b023822 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java
@@ -98,7 +98,7 @@ public void testConsolidationPerf() throws Exception {
writtenBytes += pieceBytes;
}
logger.info("Writing 300MiB frame buf with consolidation of threshold " + threshold
- + " took " + totalTime + " milis");
+ + " took " + totalTime + " millis");
} finally {
for (ByteBuf buf : retained) {
release(buf);
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 00f1defbb0093..6be6df993478d 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.12</artifactId>
-    <version>3.1.0-SNAPSHOT</version>
+    <version>3.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
@@ -47,6 +47,11 @@
      <artifactId>metrics-core</artifactId>
    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-tags_${scala.binary.version}</artifactId>
+    </dependency>
+
    <dependency>
      <groupId>org.slf4j</groupId>
@@ -57,6 +62,10 @@
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
+    <dependency>
+      <groupId>org.roaringbitmap</groupId>
+      <artifactId>RoaringBitmap</artifactId>
+    </dependency>
@@ -66,11 +75,6 @@
      <type>test-jar</type>
      <scope>test</scope>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-tags_${scala.binary.version}</artifactId>
-      <scope>test</scope>
-    </dependency>
      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-client</artifactId>
+      <artifactId>${hadoop-client-api.artifact}</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>${hadoop-client-runtime.artifact}</artifactId>
+      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
index 3d14318bf90f0..cb6d5d0ca2037 100644
--- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
+++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
@@ -19,6 +19,7 @@
import java.io.File;
import java.io.IOException;
+import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;
import java.util.List;
@@ -41,6 +42,7 @@
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.server.api.*;
+import org.apache.spark.network.shuffle.MergedShuffleFileManager;
import org.apache.spark.network.util.LevelDBProvider;
import org.iq80.leveldb.DB;
import org.iq80.leveldb.DBIterator;
@@ -74,6 +76,20 @@
* is because an application running on the same Yarn cluster may choose to not use the external
* shuffle service, in which case its setting of `spark.authenticate` should be independent of
* the service's.
+ *
+ * The shuffle service will produce metrics via the YARN NodeManager's {@code metrics2} system
+ * under a namespace specified by the {@value SPARK_SHUFFLE_SERVICE_METRICS_NAMESPACE_KEY} config.
+ *
+ * By default, all configurations for the shuffle service will be taken directly from the
+ * Hadoop {@link Configuration} passed by the YARN NodeManager. It is also possible to configure
+ * the shuffle service by placing a resource named
+ * {@value SHUFFLE_SERVICE_CONF_OVERLAY_RESOURCE_NAME} into the classpath, which should be an
+ * XML file in the standard Hadoop Configuration resource format. Note that when the shuffle
+ * service is loaded in the default manner, without configuring
+ * {@code yarn.nodemanager.aux-services.<service>.classpath}, this file must be on the classpath
+ * of the NodeManager itself. When using the {@code classpath} configuration, it can be present
+ * either on the NodeManager's classpath, or specified in the classpath configuration.
+ * This {@code classpath} configuration is only supported on YARN versions >= 2.9.0.
*/
public class YarnShuffleService extends AuxiliaryService {
private static final Logger logger = LoggerFactory.getLogger(YarnShuffleService.class);
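
For illustration (not part of the patch): the configuration overlay described in the Javadoc above is plain Hadoop Configuration layering. A minimal sketch of what serviceInit does with the overlay resource, using only public Hadoop APIs; the printed key and its default come from this change:

```java
import java.net.URL;
import org.apache.hadoop.conf.Configuration;

public class ShuffleConfOverlayExample {
  public static void main(String[] args) {
    // Base config, analogous to what the YARN NodeManager passes to serviceInit().
    Configuration conf = new Configuration();

    // If spark-shuffle-site.xml is visible to the context class loader, its entries
    // override the NodeManager-provided values for the shuffle service only.
    URL overlay = Thread.currentThread().getContextClassLoader()
        .getResource("spark-shuffle-site.xml");
    if (overlay != null) {
      conf.addResource(overlay);
    }

    // Example: the metrics namespace introduced by this patch, with its default.
    System.out.println(
        conf.get("spark.yarn.shuffle.service.metrics.namespace", "sparkShuffleService"));
  }
}
```
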
@@ -82,6 +98,14 @@ public class YarnShuffleService extends AuxiliaryService {
private static final String SPARK_SHUFFLE_SERVICE_PORT_KEY = "spark.shuffle.service.port";
private static final int DEFAULT_SPARK_SHUFFLE_SERVICE_PORT = 7337;
+ /**
+ * The namespace to use for the metrics record which will contain all metrics produced by the
+ * shuffle service.
+ */
+ static final String SPARK_SHUFFLE_SERVICE_METRICS_NAMESPACE_KEY =
+ "spark.yarn.shuffle.service.metrics.namespace";
+ private static final String DEFAULT_SPARK_SHUFFLE_SERVICE_METRICS_NAME = "sparkShuffleService";
+
// Whether the shuffle server should authenticate fetch requests
private static final String SPARK_AUTHENTICATE_KEY = "spark.authenticate";
private static final boolean DEFAULT_SPARK_AUTHENTICATE = false;
@@ -102,6 +126,13 @@ public class YarnShuffleService extends AuxiliaryService {
private static final LevelDBProvider.StoreVersion CURRENT_VERSION = new LevelDBProvider
.StoreVersion(1, 0);
+ /**
+ * The name of the resource to search for on the classpath to find a shuffle service-specific
+ * configuration overlay. If found, this will be parsed as a standard Hadoop
+ * {@link Configuration config} file and will override the configs passed from the NodeManager.
+ */
+ static final String SHUFFLE_SERVICE_CONF_OVERLAY_RESOURCE_NAME = "spark-shuffle-site.xml";
+
// just for integration tests that want to look at this file -- in general not sensible as
// a static
@VisibleForTesting
@@ -138,6 +169,13 @@ public class YarnShuffleService extends AuxiliaryService {
private DB db;
public YarnShuffleService() {
+ // The name of the auxiliary service configured within the NodeManager
+ // (`yarn.nodemanager.aux-services`) is treated as the source-of-truth, so this one can be
+ // arbitrary. The NodeManager will log a warning if the configured name doesn't match this name,
+ // to inform operators of a potential misconfiguration, but this name is otherwise not used.
+ // It is hard-coded instead of using the value of the `spark.shuffle.service.name` configuration
+ // because at this point in instantiation there is no Configuration object; it is not passed
+ // until `serviceInit` is called, at which point it's too late to adjust the name.
super("spark_shuffle");
logger.info("Initializing YARN shuffle service for Spark");
instance = this;
@@ -156,10 +194,18 @@ private boolean isAuthenticationEnabled() {
* Start the shuffle server with the given configuration.
*/
@Override
- protected void serviceInit(Configuration conf) throws Exception {
- _conf = conf;
+ protected void serviceInit(Configuration externalConf) throws Exception {
+ _conf = new Configuration(externalConf);
+ URL confOverlayUrl = Thread.currentThread().getContextClassLoader()
+ .getResource(SHUFFLE_SERVICE_CONF_OVERLAY_RESOURCE_NAME);
+ if (confOverlayUrl != null) {
+ logger.info("Initializing Spark YARN shuffle service with configuration overlay from {}",
+ confOverlayUrl);
+ _conf.addResource(confOverlayUrl);
+ }
+ super.serviceInit(_conf);
- boolean stopOnFailure = conf.getBoolean(STOP_ON_FAILURE_KEY, DEFAULT_STOP_ON_FAILURE);
+ boolean stopOnFailure = _conf.getBoolean(STOP_ON_FAILURE_KEY, DEFAULT_STOP_ON_FAILURE);
try {
// In case this NM was killed while there were running spark applications, we need to restore
@@ -171,13 +217,16 @@ protected void serviceInit(Configuration conf) throws Exception {
registeredExecutorFile = initRecoveryDb(RECOVERY_FILE_NAME);
}
- TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf));
- blockHandler = new ExternalBlockHandler(transportConf, registeredExecutorFile);
+ TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(_conf));
+ MergedShuffleFileManager shuffleMergeManager = newMergedShuffleFileManagerInstance(
+ transportConf);
+ blockHandler = new ExternalBlockHandler(
+ transportConf, registeredExecutorFile, shuffleMergeManager);
// If authentication is enabled, set up the shuffle server to use a
// special RPC handler that filters out unauthenticated fetch requests
List<TransportServerBootstrap> bootstraps = Lists.newArrayList();
- boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE);
+ boolean authEnabled = _conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE);
if (authEnabled) {
secretManager = new ShuffleSecretManager();
if (_recoveryPath != null) {
@@ -186,7 +235,7 @@ protected void serviceInit(Configuration conf) throws Exception {
bootstraps.add(new AuthServerBootstrap(transportConf, secretManager));
}
- int port = conf.getInt(
+ int port = _conf.getInt(
SPARK_SHUFFLE_SERVICE_PORT_KEY, DEFAULT_SPARK_SHUFFLE_SERVICE_PORT);
transportContext = new TransportContext(transportConf, blockHandler, true);
shuffleServer = transportContext.createServer(port, bootstraps);
@@ -199,13 +248,16 @@ protected void serviceInit(Configuration conf) throws Exception {
blockHandler.getAllMetrics().getMetrics().put("numRegisteredConnections",
shuffleServer.getRegisteredConnections());
blockHandler.getAllMetrics().getMetrics().putAll(shuffleServer.getAllMetrics().getMetrics());
+ String metricsNamespace = _conf.get(SPARK_SHUFFLE_SERVICE_METRICS_NAMESPACE_KEY,
+ DEFAULT_SPARK_SHUFFLE_SERVICE_METRICS_NAME);
YarnShuffleServiceMetrics serviceMetrics =
- new YarnShuffleServiceMetrics(blockHandler.getAllMetrics());
+ new YarnShuffleServiceMetrics(metricsNamespace, blockHandler.getAllMetrics());
MetricsSystemImpl metricsSystem = (MetricsSystemImpl) DefaultMetricsSystem.instance();
metricsSystem.register(
- "sparkShuffleService", "Metrics on the Spark Shuffle Service", serviceMetrics);
- logger.info("Registered metrics with Hadoop's DefaultMetricsSystem");
+ metricsNamespace, "Metrics on the Spark Shuffle Service", serviceMetrics);
+ logger.info("Registered metrics with Hadoop's DefaultMetricsSystem using namespace '{}'",
+ metricsNamespace);
logger.info("Started YARN shuffle service for Spark on port {}. " +
"Authentication is {}. Registered executor file is {}", port, authEnabledString,
@@ -219,6 +271,23 @@ protected void serviceInit(Configuration conf) throws Exception {
}
}
+ @VisibleForTesting
+ static MergedShuffleFileManager newMergedShuffleFileManagerInstance(TransportConf conf) {
+ String mergeManagerImplClassName = conf.mergedShuffleFileManagerImpl();
+ try {
+ Class<?> mergeManagerImplClazz = Class.forName(
+ mergeManagerImplClassName, true, Thread.currentThread().getContextClassLoader());
+ Class<? extends MergedShuffleFileManager> mergeManagerSubClazz =
+ mergeManagerImplClazz.asSubclass(MergedShuffleFileManager.class);
+ // The assumption is that all custom implementations, just like the RemoteBlockPushResolver,
+ // will also need the transport configuration.
+ return mergeManagerSubClazz.getConstructor(TransportConf.class).newInstance(conf);
+ } catch (Exception e) {
+ logger.error("Unable to create an instance of {}", mergeManagerImplClassName);
+ return new ExternalBlockHandler.NoOpMergedShuffleFileManager(conf);
+ }
+ }
+
private void loadSecretsFromDb() throws IOException {
secretsFile = initRecoveryDb(SECRETS_RECOVERY_FILE_NAME);
diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java
index 81be6e8036ffe..f30abbd0f7fcd 100644
--- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java
+++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java
@@ -32,9 +32,11 @@
*/
class YarnShuffleServiceMetrics implements MetricsSource {
+ private final String metricsNamespace;
private final MetricSet metricSet;
- YarnShuffleServiceMetrics(MetricSet metricSet) {
+ YarnShuffleServiceMetrics(String metricsNamespace, MetricSet metricSet) {
+ this.metricsNamespace = metricsNamespace;
this.metricSet = metricSet;
}
@@ -46,7 +48,7 @@ class YarnShuffleServiceMetrics implements MetricsSource {
*/
@Override
public void getMetrics(MetricsCollector collector, boolean all) {
- MetricsRecordBuilder metricsRecordBuilder = collector.addRecord("sparkShuffleService");
+ MetricsRecordBuilder metricsRecordBuilder = collector.addRecord(metricsNamespace);
for (Map.Entry<String, Metric> entry : metricSet.getMetrics().entrySet()) {
collectMetric(metricsRecordBuilder, entry.getKey(), entry.getValue());
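
For context (not part of the patch): the namespace threaded through YarnShuffleServiceMetrics above ends up as the record name inside Hadoop's metrics2 system. A hedged sketch of the wiring; the source below is a stand-in, only the register call and the namespace handling mirror the change:

```java
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

public class NamespacedMetricsExample {
  public static void main(String[] args) {
    // Value of spark.yarn.shuffle.service.metrics.namespace (default shown).
    String namespace = "sparkShuffleService";

    MetricsSource source = new MetricsSource() {
      @Override
      public void getMetrics(MetricsCollector collector, boolean all) {
        // Each collection cycle emits one record under the configured namespace.
        collector.addRecord(namespace).setContext("shuffleService");
      }
    };

    DefaultMetricsSystem.initialize("NodeManager");
    DefaultMetricsSystem.instance().register(
        namespace, "Metrics on the Spark Shuffle Service", source);
  }
}
```
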
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 72a2c4ceb43b6..b5a6775366a47 100644
--- a/common/sketch/pom.xml
+++ b/common/sketch/pom.xml
@@ -22,7 +22,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.12</artifactId>
-    <version>3.1.0-SNAPSHOT</version>
+    <version>3.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
index e83b331391e39..61cd2cec1a34b 100644
--- a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
+++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
@@ -17,12 +17,16 @@
package org.apache.spark.util.sketch;
+import java.nio.ByteOrder;
+
/**
* 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction.
*/
// This class is duplicated from `org.apache.spark.unsafe.hash.Murmur3_x86_32` to make sure
// spark-sketch has no external dependencies.
final class Murmur3_x86_32 {
+ private static final boolean isBigEndian = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
+
private static final int C1 = 0xcc9e2d51;
private static final int C2 = 0x1b873593;
@@ -92,8 +96,10 @@ private static int hashBytesByInt(Object base, long offset, int lengthInBytes, i
int h1 = seed;
for (int i = 0; i < lengthInBytes; i += 4) {
int halfWord = Platform.getInt(base, offset + i);
- int k1 = mixK1(halfWord);
- h1 = mixH1(h1, k1);
+ if (isBigEndian) {
+ halfWord = Integer.reverseBytes(halfWord);
+ }
+ h1 = mixH1(h1, mixK1(halfWord));
}
return h1;
}
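
For context (not part of the patch): Platform.getInt reads four bytes in the platform's native byte order, so without the guard above the same input bytes would hash differently on big-endian machines. A small self-contained demonstration of the normalization, using only the JDK:

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class EndianNormalizeExample {
  public static void main(String[] args) {
    byte[] bytes = {0x01, 0x02, 0x03, 0x04};

    // The same four bytes produce different int values depending on byte order.
    int little = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt();
    int big = ByteBuffer.wrap(bytes).order(ByteOrder.BIG_ENDIAN).getInt();
    System.out.printf("LE=%08x BE=%08x%n", little, big); // LE=04030201 BE=01020304

    // Reversing the bytes of the big-endian read recovers the little-endian value,
    // which is what the patch does when ByteOrder.nativeOrder() is BIG_ENDIAN.
    System.out.println(Integer.reverseBytes(big) == little); // true
  }
}
```
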
diff --git a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BitArraySuite.scala b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BitArraySuite.scala
index ff728f0ebcb85..4c535a8dd0411 100644
--- a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BitArraySuite.scala
+++ b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BitArraySuite.scala
@@ -19,9 +19,9 @@ package org.apache.spark.util.sketch
import scala.util.Random
-import org.scalatest.FunSuite // scalastyle:ignore funsuite
+import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite
-class BitArraySuite extends FunSuite { // scalastyle:ignore funsuite
+class BitArraySuite extends AnyFunSuite { // scalastyle:ignore funsuite
test("error case when create BitArray") {
intercept[IllegalArgumentException](new BitArray(0))
diff --git a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala
index a0408d2da4dff..8b289fc86af0f 100644
--- a/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala
+++ b/common/sketch/src/test/scala/org/apache/spark/util/sketch/BloomFilterSuite.scala
@@ -22,9 +22,9 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import scala.reflect.ClassTag
import scala.util.Random
-import org.scalatest.FunSuite // scalastyle:ignore funsuite
+import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite
-class BloomFilterSuite extends FunSuite { // scalastyle:ignore funsuite
+class BloomFilterSuite extends AnyFunSuite { // scalastyle:ignore funsuite
private final val EPSILON = 0.01
// Serializes and deserializes a given `BloomFilter`, then checks whether the deserialized
diff --git a/common/sketch/src/test/scala/org/apache/spark/util/sketch/CountMinSketchSuite.scala b/common/sketch/src/test/scala/org/apache/spark/util/sketch/CountMinSketchSuite.scala
index 174eb01986c4f..087dae26047ef 100644
--- a/common/sketch/src/test/scala/org/apache/spark/util/sketch/CountMinSketchSuite.scala
+++ b/common/sketch/src/test/scala/org/apache/spark/util/sketch/CountMinSketchSuite.scala
@@ -22,9 +22,9 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import scala.reflect.ClassTag
import scala.util.Random
-import org.scalatest.FunSuite // scalastyle:ignore funsuite
+import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite
-class CountMinSketchSuite extends FunSuite { // scalastyle:ignore funsuite
+class CountMinSketchSuite extends AnyFunSuite { // scalastyle:ignore funsuite
private val epsOfTotalCount = 0.01
private val confidence = 0.9
diff --git a/common/tags/pom.xml b/common/tags/pom.xml
index ea16dadca40cb..e51357d97faab 100644
--- a/common/tags/pom.xml
+++ b/common/tags/pom.xml
@@ -22,7 +22,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.12</artifactId>
-    <version>3.1.0-SNAPSHOT</version>
+    <version>3.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/common/tags/src/test/java/org/apache/spark/tags/ChromeUITest.java b/common/tags/src/test/java/org/apache/spark/tags/ChromeUITest.java
new file mode 100644
index 0000000000000..e3fed3d656d20
--- /dev/null
+++ b/common/tags/src/test/java/org/apache/spark/tags/ChromeUITest.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.tags;
+
+import java.lang.annotation.*;
+
+import org.scalatest.TagAnnotation;
+
+@TagAnnotation
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.METHOD, ElementType.TYPE})
+public @interface ChromeUITest { }
diff --git a/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java b/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java
new file mode 100644
index 0000000000000..a7e6f352667d7
--- /dev/null
+++ b/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.tags;
+
+import org.scalatest.TagAnnotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+@TagAnnotation
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.METHOD, ElementType.TYPE})
+public @interface SlowHiveTest { }
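
For illustration (the suite name below is hypothetical): a ScalaTest @TagAnnotation such as SlowHiveTest or ChromeUITest is applied at the class level, and its fully qualified name becomes the tag that build tooling can include or exclude:

```java
import org.apache.spark.tags.SlowHiveTest;

// Hypothetical suite: the annotation tags every test in the class with
// "org.apache.spark.tags.SlowHiveTest", so slow Hive suites can be filtered at build time.
@SlowHiveTest
public class SomeSlowHiveSuite {
}
```
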
diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml
index 769e2518b1fd4..b22400575dd02 100644
--- a/common/unsafe/pom.xml
+++ b/common/unsafe/pom.xml
@@ -22,7 +22,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.12</artifactId>
-    <version>3.1.0-SNAPSHOT</version>
+    <version>3.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index d239de6083ad0..0b9d9ced312a1 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -17,12 +17,16 @@
package org.apache.spark.unsafe.hash;
+import java.nio.ByteOrder;
+
import org.apache.spark.unsafe.Platform;
/**
* 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction.
*/
public final class Murmur3_x86_32 {
+ private static final boolean isBigEndian = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
+
private static final int C1 = 0xcc9e2d51;
private static final int C2 = 0x1b873593;
@@ -92,8 +96,10 @@ private static int hashBytesByInt(Object base, long offset, int lengthInBytes, i
int h1 = seed;
for (int i = 0; i < lengthInBytes; i += 4) {
int halfWord = Platform.getInt(base, offset + i);
- int k1 = mixK1(halfWord);
- h1 = mixH1(h1, k1);
+ if (isBigEndian) {
+ halfWord = Integer.reverseBytes(halfWord);
+ }
+ h1 = mixH1(h1, mixK1(halfWord));
}
return h1;
}
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 186597fa64780..db52f77481761 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -29,7 +29,6 @@
import com.esotericsoftware.kryo.KryoSerializable;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
-import com.google.common.primitives.Ints;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.UTF8StringBuilder;
@@ -341,8 +340,17 @@ public UTF8String substringSQL(int pos, int length) {
// to the -ith element before the end of the sequence. If a start index i is 0, it
// refers to the first element.
int len = numChars();
+ // `len + pos` does not overflow as `len >= 0`.
int start = (pos > 0) ? pos -1 : ((pos < 0) ? len + pos : 0);
- int end = (length == Integer.MAX_VALUE) ? len : start + length;
+
+ int end;
+ if ((long) start + length > Integer.MAX_VALUE) {
+ end = Integer.MAX_VALUE;
+ } else if ((long) start + length < Integer.MIN_VALUE) {
+ end = Integer.MIN_VALUE;
+ } else {
+ end = start + length;
+ }
return substring(start, end);
}
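
A tiny standalone check (not part of the patch) of why the widening to long above matters: with a large requested length the old int addition wraps negative, while routing the sum through long arithmetic keeps the end index saturated at the int bounds:

```java
public class SaturatingEndIndexExample {
  // Mirrors the clamping added above, extracted as a helper purely for illustration.
  static int saturatedEnd(int start, int length) {
    long end = (long) start + length;
    if (end > Integer.MAX_VALUE) return Integer.MAX_VALUE;
    if (end < Integer.MIN_VALUE) return Integer.MIN_VALUE;
    return (int) end;
  }

  public static void main(String[] args) {
    int start = 1;                        // e.g. substringSQL(2, Integer.MAX_VALUE)
    int length = Integer.MAX_VALUE;
    System.out.println(start + length);              // wraps to -2147483648
    System.out.println(saturatedEnd(start, length)); // 2147483647
  }
}
```
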
@@ -554,7 +562,7 @@ public UTF8String trim() {
}
/**
- * Trims whitespaces (<= ASCII 32) from both ends of this string.
+ * Trims whitespaces ({@literal <=} ASCII 32) from both ends of this string.
*
* Note that, this method is the same as java's {@link String#trim}, and different from
* {@link UTF8String#trim()} which remove only spaces(= ASCII 32) from both ends.
@@ -566,14 +574,14 @@ public UTF8String trim() {
public UTF8String trimAll() {
int s = 0;
// skip all of the whitespaces (<=0x20) in the left side
- while (s < this.numBytes && getByte(s) <= ' ') s++;
+ while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++;
if (s == this.numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
// skip all of the whitespaces (<=0x20) in the right side
int e = this.numBytes - 1;
- while (e > s && getByte(e) <= ' ') e--;
+ while (e > s && Character.isWhitespace(getByte(e))) e--;
if (s == 0 && e == numBytes - 1) {
// Nothing trimmed
return this;
@@ -626,13 +634,13 @@ public UTF8String trimLeft() {
public UTF8String trimLeft(UTF8String trimString) {
if (trimString == null) return null;
// the searching byte position in the source string
- int srchIdx = 0;
+ int searchIdx = 0;
// the first beginning byte position of a non-matching character
int trimIdx = 0;
- while (srchIdx < numBytes) {
+ while (searchIdx < numBytes) {
UTF8String searchChar = copyUTF8String(
- srchIdx, srchIdx + numBytesForFirstByte(this.getByte(srchIdx)) - 1);
+ searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1);
int searchCharBytes = searchChar.numBytes;
// try to find the matching for the searchChar in the trimString set
if (trimString.find(searchChar, 0) >= 0) {
@@ -641,9 +649,9 @@ public UTF8String trimLeft(UTF8String trimString) {
// no matching, exit the search
break;
}
- srchIdx += searchCharBytes;
+ searchIdx += searchCharBytes;
}
- if (srchIdx == 0) {
+ if (searchIdx == 0) {
// Nothing trimmed
return this;
}
@@ -674,6 +682,17 @@ public UTF8String trimRight() {
return copyUTF8String(0, e);
}
+ /**
+ * Trims at most `numSpaces` space characters (ASCII 32) from the end of this string.
+ */
+ public UTF8String trimTrailingSpaces(int numSpaces) {
+ assert numSpaces > 0;
+ int endIdx = numBytes - 1;
+ int trimTo = numBytes - numSpaces;
+ while (endIdx >= trimTo && getByte(endIdx) == 0x20) endIdx--;
+ return copyUTF8String(0, endIdx);
+ }
+
/**
* Trims instances of the given trim string from the end of this string.
*
@@ -744,7 +763,7 @@ public UTF8String repeat(int times) {
return EMPTY_UTF8;
}
- byte[] newBytes = new byte[numBytes * times];
+ byte[] newBytes = new byte[Math.multiplyExact(numBytes, times)];
copyMemory(this.base, this.offset, newBytes, BYTE_ARRAY_OFFSET, numBytes);
int copied = 1;
@@ -887,7 +906,8 @@ public UTF8String rpad(int len, UTF8String pad) {
// the partial string of the padding
UTF8String remain = pad.substring(0, spaces - padChars * count);
- byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
+ int resultSize = Math.toIntExact((long)numBytes + pad.numBytes * count + remain.numBytes);
+ byte[] data = new byte[resultSize];
copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET, this.numBytes);
int offset = this.numBytes;
int idx = 0;
@@ -919,7 +939,8 @@ public UTF8String lpad(int len, UTF8String pad) {
// the partial string of the padding
UTF8String remain = pad.substring(0, spaces - padChars * count);
- byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
+ int resultSize = Math.toIntExact((long)numBytes + pad.numBytes * count + remain.numBytes);
+ byte[] data = new byte[resultSize];
int offset = 0;
int idx = 0;
@@ -951,7 +972,7 @@ public static UTF8String concat(UTF8String... inputs) {
}
// Allocate a new byte array, and copy the inputs one by one into it.
- final byte[] result = new byte[Ints.checkedCast(totalLength)];
+ final byte[] result = new byte[Math.toIntExact(totalLength)];
int offset = 0;
for (int i = 0; i < inputs.length; i++) {
int len = inputs[i].numBytes;
@@ -973,7 +994,7 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
return null;
}
- int numInputBytes = 0; // total number of bytes from the inputs
+ long numInputBytes = 0L; // total number of bytes from the inputs
int numInputs = 0; // number of non-null inputs
for (int i = 0; i < inputs.length; i++) {
if (inputs[i] != null) {
@@ -989,7 +1010,8 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
// Allocate a new byte array, and copy the inputs one by one into it.
// The size of the new array is the size of all inputs, plus the separators.
- final byte[] result = new byte[numInputBytes + (numInputs - 1) * separator.numBytes];
+ int resultSize = Math.toIntExact(numInputBytes + (numInputs - 1) * (long)separator.numBytes);
+ final byte[] result = new byte[resultSize];
int offset = 0;
for (int i = 0, j = 0; i < inputs.length; i++) {
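
For reference (not part of the patch): the switch to Math.multiplyExact and Math.toIntExact in repeat, rpad, lpad, concat and concatWs above trades silent int overflow, which would size the result array wrongly or negatively, for an explicit ArithmeticException. A minimal demonstration with the JDK methods involved:

```java
public class ExactArithmeticExample {
  public static void main(String[] args) {
    int numBytes = 1 << 30;  // pretend this is a string of roughly 1 GiB
    int times = 4;

    // Plain int multiplication wraps around to 0 here.
    System.out.println(numBytes * times); // 0

    // The exact variants fail loudly instead of allocating a wrong-sized array.
    try {
      Math.multiplyExact(numBytes, times);
    } catch (ArithmeticException e) {
      System.out.println("multiplyExact: " + e.getMessage()); // integer overflow
    }
    try {
      Math.toIntExact(4L * numBytes); // 2^32 does not fit in an int
    } catch (ArithmeticException e) {
      System.out.println("toIntExact: " + e.getMessage());    // integer overflow
    }
  }
}
```
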
@@ -1056,16 +1078,20 @@ public UTF8String replace(UTF8String search, UTF8String replace) {
return buf.build();
}
- // TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes
- public UTF8String translate(Map<Character, Character> dict) {
+ public UTF8String translate(Map<String, String> dict) {
String srcStr = this.toString();
StringBuilder sb = new StringBuilder();
- for(int k = 0; k< srcStr.length(); k++) {
- if (null == dict.get(srcStr.charAt(k))) {
- sb.append(srcStr.charAt(k));
- } else if ('\0' != dict.get(srcStr.charAt(k))){
- sb.append(dict.get(srcStr.charAt(k)));
+ int charCount = 0;
+ for (int k = 0; k < srcStr.length(); k += charCount) {
+ int codePoint = srcStr.codePointAt(k);
+ charCount = Character.charCount(codePoint);
+ String subStr = srcStr.substring(k, k + charCount);
+ String translated = dict.get(subStr);
+ if (null == translated) {
+ sb.append(subStr);
+ } else if (!"\0".equals(translated)) {
+ sb.append(translated);
}
}
return fromString(sb.toString());
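
For context (not part of the patch): the rewritten translate above walks the string by Unicode code point rather than by Java char, because characters outside the Basic Multilingual Plane occupy two chars (a surrogate pair) and a per-char mapping can never match them. A small sketch of the same iteration pattern with the standard library:

```java
public class CodePointIterationExample {
  public static void main(String[] args) {
    String s = "a\uD83D\uDE00b"; // 'a', U+1F600 (outside the BMP), 'b'
    System.out.println("chars      = " + s.length());                      // 4
    System.out.println("codePoints = " + s.codePointCount(0, s.length())); // 3

    // Same loop shape as the patched translate(): advance by Character.charCount().
    int charCount = 0;
    for (int k = 0; k < s.length(); k += charCount) {
      int codePoint = s.codePointAt(k);
      charCount = Character.charCount(codePoint);
      System.out.println(s.substring(k, k + charCount)
          + " -> U+" + Integer.toHexString(codePoint).toUpperCase());
    }
  }
}
```
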
@@ -1110,11 +1136,11 @@ public boolean toLong(LongWrapper toLongResult) {
private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) {
int offset = 0;
- while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
+ while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
if (offset == this.numBytes) return false;
int end = this.numBytes - 1;
- while (end > offset && getByte(end) <= ' ') end--;
+ while (end > offset && Character.isWhitespace(getByte(end))) end--;
byte b = getByte(offset);
final boolean negative = b == '-';
@@ -1207,11 +1233,11 @@ public boolean toInt(IntWrapper intWrapper) {
private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) {
int offset = 0;
- while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
+ while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
if (offset == this.numBytes) return false;
int end = this.numBytes - 1;
- while (end > offset && getByte(end) <= ' ') end--;
+ while (end > offset && Character.isWhitespace(getByte(end))) end--;
byte b = getByte(offset);
final boolean negative = b == '-';
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 8f933877f82e6..ba3e4269e9a46 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -390,6 +390,10 @@ public void substringSQL() {
assertEquals(fromString("example"), e.substringSQL(0, Integer.MAX_VALUE));
assertEquals(fromString("example"), e.substringSQL(1, Integer.MAX_VALUE));
assertEquals(fromString("xample"), e.substringSQL(2, Integer.MAX_VALUE));
+ assertEquals(EMPTY_UTF8, e.substringSQL(-100, -100));
+ assertEquals(EMPTY_UTF8, e.substringSQL(-1207959552, -1207959552));
+ assertEquals(fromString("pl"), e.substringSQL(-3, 2));
+ assertEquals(EMPTY_UTF8, e.substringSQL(Integer.MIN_VALUE, 6));
}
@Test
@@ -461,10 +465,10 @@ public void translate() {
assertEquals(
fromString("1a2s3ae"),
fromString("translate").translate(ImmutableMap.of(
- 'r', '1',
- 'n', '2',
- 'l', '3',
- 't', '\0'
+ "r", "1",
+ "n", "2",
+ "l", "3",
+ "t", "\0"
)));
assertEquals(
fromString("translate"),
@@ -472,16 +476,16 @@ public void translate() {
assertEquals(
fromString("asae"),
fromString("translate").translate(ImmutableMap.of(
- 'r', '\0',
- 'n', '\0',
- 'l', '\0',
- 't', '\0'
+ "r", "\0",
+ "n", "\0",
+ "l", "\0",
+ "t", "\0"
)));
assertEquals(
fromString("aa世b"),
fromString("花花世界").translate(ImmutableMap.of(
- '花', 'a',
- '界', 'b'
+ "花", "a",
+ "界", "b"
)));
}
diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
index 72aa682bb95bc..ab488e18ba3f4 100644
--- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
+++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
@@ -21,14 +21,15 @@ import org.apache.commons.text.similarity.LevenshteinDistance
import org.scalacheck.{Arbitrary, Gen}
import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks
// scalastyle:off
-import org.scalatest.{FunSuite, Matchers}
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.must.Matchers
import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}
/**
* This TestSuite utilize ScalaCheck to generate randomized inputs for UTF8String testing.
*/
-class UTF8StringPropertyCheckSuite extends FunSuite with ScalaCheckDrivenPropertyChecks with Matchers {
+class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenPropertyChecks with Matchers {
// scalastyle:on
test("toString") {
@@ -191,7 +192,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with ScalaCheckDrivenPropert
}
}
- val nullalbeSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString))
+ val nullableSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString))
test("concat") {
def concat(origin: Seq[String]): String =
@@ -200,7 +201,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with ScalaCheckDrivenPropert
forAll { (inputs: Seq[String]) =>
assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(inputs.mkString))
}
- forAll (nullalbeSeq) { (inputs: Seq[String]) =>
+ forAll (nullableSeq) { (inputs: Seq[String]) =>
assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(concat(inputs)))
}
}
@@ -215,7 +216,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with ScalaCheckDrivenPropert
assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) ===
toUTF8(inputs.mkString(sep)))
}
- forAll(randomString, nullalbeSeq) {(sep: String, inputs: Seq[String]) =>
+ forAll(randomString, nullableSeq) {(sep: String, inputs: Seq[String]) =>
assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) ===
toUTF8(concatWs(sep, inputs)))
}
diff --git a/conf/log4j.properties.template b/conf/log4j.properties.template
index e91595dd324b0..5db8c5c295d84 100644
--- a/conf/log4j.properties.template
+++ b/conf/log4j.properties.template
@@ -22,10 +22,12 @@ log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
-# Set the default spark-shell log level to WARN. When running the spark-shell, the
-# log level for this class is used to overwrite the root logger's log level, so that
-# the user can have different defaults for the shell and regular Spark apps.
+# Set the default spark-shell/spark-sql log level to WARN. When running the
+# spark-shell/spark-sql, the log level for these classes is used to overwrite
+# the root logger's log level, so that the user can have different defaults
+# for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN
+log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN
# Settings to quiet third party logs that are too verbose
log4j.logger.org.sparkproject.jetty=WARN
@@ -38,3 +40,9 @@ log4j.logger.parquet=ERROR
# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+# For deploying Spark ThriftServer
+# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
+log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
+log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
+log4j.appender.console.filter.1.AcceptOnMatch=false
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index df39ad8b0dcc2..3c003f45ed27a 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -62,6 +62,7 @@
# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
+# - SPARK_LOG_MAX_FILES Max number of log files that Spark daemons can rotate to. (Default: 5)
# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
diff --git a/conf/slaves.template b/conf/workers.template
similarity index 100%
rename from conf/slaves.template
rename to conf/workers.template
diff --git a/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt b/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt
index e944111ff9e93..10ac28b46091c 100644
--- a/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt
+++ b/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt
@@ -2,39 +2,39 @@
Coalesced RDD , large scale
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Coalesce Num Partitions: 100 Num Hosts: 1 344 360 14 0.3 3441.4 1.0X
-Coalesce Num Partitions: 100 Num Hosts: 5 283 301 22 0.4 2825.1 1.2X
-Coalesce Num Partitions: 100 Num Hosts: 10 270 271 2 0.4 2700.5 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 20 272 273 1 0.4 2721.1 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 40 271 272 1 0.4 2710.0 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 80 266 267 2 0.4 2656.3 1.3X
-Coalesce Num Partitions: 500 Num Hosts: 1 609 619 15 0.2 6089.0 0.6X
-Coalesce Num Partitions: 500 Num Hosts: 5 338 343 6 0.3 3383.0 1.0X
-Coalesce Num Partitions: 500 Num Hosts: 10 303 306 3 0.3 3029.4 1.1X
-Coalesce Num Partitions: 500 Num Hosts: 20 286 288 2 0.4 2855.9 1.2X
-Coalesce Num Partitions: 500 Num Hosts: 40 279 282 4 0.4 2793.3 1.2X
-Coalesce Num Partitions: 500 Num Hosts: 80 273 275 3 0.4 2725.9 1.3X
-Coalesce Num Partitions: 1000 Num Hosts: 1 951 955 4 0.1 9514.1 0.4X
-Coalesce Num Partitions: 1000 Num Hosts: 5 421 429 8 0.2 4211.3 0.8X
-Coalesce Num Partitions: 1000 Num Hosts: 10 347 352 4 0.3 3473.5 1.0X
-Coalesce Num Partitions: 1000 Num Hosts: 20 309 312 5 0.3 3087.5 1.1X
-Coalesce Num Partitions: 1000 Num Hosts: 40 290 294 6 0.3 2896.4 1.2X
-Coalesce Num Partitions: 1000 Num Hosts: 80 281 286 5 0.4 2811.3 1.2X
-Coalesce Num Partitions: 5000 Num Hosts: 1 3928 3950 27 0.0 39278.0 0.1X
-Coalesce Num Partitions: 5000 Num Hosts: 5 1373 1389 27 0.1 13725.2 0.3X
-Coalesce Num Partitions: 5000 Num Hosts: 10 812 827 13 0.1 8123.3 0.4X
-Coalesce Num Partitions: 5000 Num Hosts: 20 530 540 9 0.2 5299.1 0.6X
-Coalesce Num Partitions: 5000 Num Hosts: 40 421 425 5 0.2 4210.5 0.8X
-Coalesce Num Partitions: 5000 Num Hosts: 80 335 344 12 0.3 3353.7 1.0X
-Coalesce Num Partitions: 10000 Num Hosts: 1 7116 7120 4 0.0 71159.0 0.0X
-Coalesce Num Partitions: 10000 Num Hosts: 5 2539 2598 51 0.0 25390.1 0.1X
-Coalesce Num Partitions: 10000 Num Hosts: 10 1393 1432 34 0.1 13928.1 0.2X
-Coalesce Num Partitions: 10000 Num Hosts: 20 833 1009 303 0.1 8329.2 0.4X
-Coalesce Num Partitions: 10000 Num Hosts: 40 562 563 3 0.2 5615.2 0.6X
-Coalesce Num Partitions: 10000 Num Hosts: 80 420 426 7 0.2 4204.0 0.8X
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+----------------------------------------------------------------------------------------------------------------------------
+Coalesce Num Partitions: 100 Num Hosts: 1 365 454 104 0.3 3652.8 1.0X
+Coalesce Num Partitions: 100 Num Hosts: 5 271 374 90 0.4 2708.0 1.3X
+Coalesce Num Partitions: 100 Num Hosts: 10 265 337 88 0.4 2651.9 1.4X
+Coalesce Num Partitions: 100 Num Hosts: 20 277 338 87 0.4 2772.8 1.3X
+Coalesce Num Partitions: 100 Num Hosts: 40 313 320 6 0.3 3127.8 1.2X
+Coalesce Num Partitions: 100 Num Hosts: 80 291 330 60 0.3 2909.5 1.3X
+Coalesce Num Partitions: 500 Num Hosts: 1 974 1035 59 0.1 9741.6 0.4X
+Coalesce Num Partitions: 500 Num Hosts: 5 443 450 6 0.2 4427.7 0.8X
+Coalesce Num Partitions: 500 Num Hosts: 10 382 455 64 0.3 3816.6 1.0X
+Coalesce Num Partitions: 500 Num Hosts: 20 359 402 39 0.3 3589.9 1.0X
+Coalesce Num Partitions: 500 Num Hosts: 40 317 350 54 0.3 3165.5 1.2X
+Coalesce Num Partitions: 500 Num Hosts: 80 267 337 81 0.4 2668.5 1.4X
+Coalesce Num Partitions: 1000 Num Hosts: 1 1621 1712 84 0.1 16205.9 0.2X
+Coalesce Num Partitions: 1000 Num Hosts: 5 582 638 85 0.2 5817.7 0.6X
+Coalesce Num Partitions: 1000 Num Hosts: 10 398 446 74 0.3 3980.8 0.9X
+Coalesce Num Partitions: 1000 Num Hosts: 20 332 410 108 0.3 3315.1 1.1X
+Coalesce Num Partitions: 1000 Num Hosts: 40 345 377 54 0.3 3453.1 1.1X
+Coalesce Num Partitions: 1000 Num Hosts: 80 294 353 69 0.3 2940.5 1.2X
+Coalesce Num Partitions: 5000 Num Hosts: 1 6818 6906 151 0.0 68183.8 0.1X
+Coalesce Num Partitions: 5000 Num Hosts: 5 1606 1719 138 0.1 16061.1 0.2X
+Coalesce Num Partitions: 5000 Num Hosts: 10 965 1031 94 0.1 9653.8 0.4X
+Coalesce Num Partitions: 5000 Num Hosts: 20 643 722 98 0.2 6427.4 0.6X
+Coalesce Num Partitions: 5000 Num Hosts: 40 445 517 97 0.2 4448.8 0.8X
+Coalesce Num Partitions: 5000 Num Hosts: 80 369 432 93 0.3 3688.6 1.0X
+Coalesce Num Partitions: 10000 Num Hosts: 1 12850 12953 90 0.0 128503.9 0.0X
+Coalesce Num Partitions: 10000 Num Hosts: 5 3431 3463 49 0.0 34307.1 0.1X
+Coalesce Num Partitions: 10000 Num Hosts: 10 1687 1795 106 0.1 16874.8 0.2X
+Coalesce Num Partitions: 10000 Num Hosts: 20 1084 1125 58 0.1 10844.2 0.3X
+Coalesce Num Partitions: 10000 Num Hosts: 40 667 722 73 0.1 6674.4 0.5X
+Coalesce Num Partitions: 10000 Num Hosts: 80 488 542 82 0.2 4883.9 0.7X
diff --git a/core/benchmarks/CoalescedRDDBenchmark-results.txt b/core/benchmarks/CoalescedRDDBenchmark-results.txt
index f1b867951a074..829e43530dd5f 100644
--- a/core/benchmarks/CoalescedRDDBenchmark-results.txt
+++ b/core/benchmarks/CoalescedRDDBenchmark-results.txt
@@ -2,39 +2,39 @@
Coalesced RDD , large scale
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Coalesce Num Partitions: 100 Num Hosts: 1 395 401 9 0.3 3952.3 1.0X
-Coalesce Num Partitions: 100 Num Hosts: 5 296 344 42 0.3 2963.2 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 10 294 308 15 0.3 2941.7 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 20 316 328 13 0.3 3155.2 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 40 294 316 36 0.3 2940.3 1.3X
-Coalesce Num Partitions: 100 Num Hosts: 80 292 324 30 0.3 2922.2 1.4X
-Coalesce Num Partitions: 500 Num Hosts: 1 629 687 61 0.2 6292.4 0.6X
-Coalesce Num Partitions: 500 Num Hosts: 5 354 378 42 0.3 3541.7 1.1X
-Coalesce Num Partitions: 500 Num Hosts: 10 318 338 29 0.3 3179.8 1.2X
-Coalesce Num Partitions: 500 Num Hosts: 20 306 317 11 0.3 3059.2 1.3X
-Coalesce Num Partitions: 500 Num Hosts: 40 294 311 28 0.3 2941.6 1.3X
-Coalesce Num Partitions: 500 Num Hosts: 80 288 309 34 0.3 2883.9 1.4X
-Coalesce Num Partitions: 1000 Num Hosts: 1 956 978 20 0.1 9562.2 0.4X
-Coalesce Num Partitions: 1000 Num Hosts: 5 431 452 36 0.2 4306.2 0.9X
-Coalesce Num Partitions: 1000 Num Hosts: 10 358 379 23 0.3 3581.1 1.1X
-Coalesce Num Partitions: 1000 Num Hosts: 20 324 347 20 0.3 3236.7 1.2X
-Coalesce Num Partitions: 1000 Num Hosts: 40 312 333 20 0.3 3116.8 1.3X
-Coalesce Num Partitions: 1000 Num Hosts: 80 307 342 32 0.3 3068.4 1.3X
-Coalesce Num Partitions: 5000 Num Hosts: 1 3895 3906 12 0.0 38946.8 0.1X
-Coalesce Num Partitions: 5000 Num Hosts: 5 1388 1401 19 0.1 13881.7 0.3X
-Coalesce Num Partitions: 5000 Num Hosts: 10 806 839 57 0.1 8063.7 0.5X
-Coalesce Num Partitions: 5000 Num Hosts: 20 546 573 44 0.2 5462.6 0.7X
-Coalesce Num Partitions: 5000 Num Hosts: 40 413 418 5 0.2 4134.7 1.0X
-Coalesce Num Partitions: 5000 Num Hosts: 80 345 365 23 0.3 3448.1 1.1X
-Coalesce Num Partitions: 10000 Num Hosts: 1 6933 6966 55 0.0 69328.8 0.1X
-Coalesce Num Partitions: 10000 Num Hosts: 5 2455 2499 69 0.0 24551.7 0.2X
-Coalesce Num Partitions: 10000 Num Hosts: 10 1352 1392 34 0.1 13520.2 0.3X
-Coalesce Num Partitions: 10000 Num Hosts: 20 815 853 50 0.1 8147.5 0.5X
-Coalesce Num Partitions: 10000 Num Hosts: 40 558 581 28 0.2 5578.0 0.7X
-Coalesce Num Partitions: 10000 Num Hosts: 80 416 423 5 0.2 4163.3 0.9X
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
+Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+----------------------------------------------------------------------------------------------------------------------------
+Coalesce Num Partitions: 100 Num Hosts: 1 394 423 25 0.3 3942.7 1.0X
+Coalesce Num Partitions: 100 Num Hosts: 5 317 339 23 0.3 3171.6 1.2X
+Coalesce Num Partitions: 100 Num Hosts: 10 310 324 20 0.3 3095.9 1.3X
+Coalesce Num Partitions: 100 Num Hosts: 20 285 292 8 0.4 2849.0 1.4X
+Coalesce Num Partitions: 100 Num Hosts: 40 271 292 22 0.4 2712.7 1.5X
+Coalesce Num Partitions: 100 Num Hosts: 80 287 300 18 0.3 2874.2 1.4X
+Coalesce Num Partitions: 500 Num Hosts: 1 839 872 33 0.1 8388.1 0.5X
+Coalesce Num Partitions: 500 Num Hosts: 5 389 395 5 0.3 3892.4 1.0X
+Coalesce Num Partitions: 500 Num Hosts: 10 352 356 3 0.3 3522.0 1.1X
+Coalesce Num Partitions: 500 Num Hosts: 20 315 322 11 0.3 3145.0 1.3X
+Coalesce Num Partitions: 500 Num Hosts: 40 304 324 22 0.3 3036.4 1.3X
+Coalesce Num Partitions: 500 Num Hosts: 80 286 301 23 0.4 2855.7 1.4X
+Coalesce Num Partitions: 1000 Num Hosts: 1 1390 1422 28 0.1 13898.0 0.3X
+Coalesce Num Partitions: 1000 Num Hosts: 5 535 562 26 0.2 5347.4 0.7X
+Coalesce Num Partitions: 1000 Num Hosts: 10 419 425 8 0.2 4193.8 0.9X
+Coalesce Num Partitions: 1000 Num Hosts: 20 358 364 9 0.3 3575.8 1.1X
+Coalesce Num Partitions: 1000 Num Hosts: 40 323 329 5 0.3 3234.8 1.2X
+Coalesce Num Partitions: 1000 Num Hosts: 80 306 316 11 0.3 3060.3 1.3X
+Coalesce Num Partitions: 5000 Num Hosts: 1 5967 6744 1289 0.0 59666.6 0.1X
+Coalesce Num Partitions: 5000 Num Hosts: 5 2010 2032 19 0.0 20104.1 0.2X
+Coalesce Num Partitions: 5000 Num Hosts: 10 1132 1161 25 0.1 11324.9 0.3X
+Coalesce Num Partitions: 5000 Num Hosts: 20 687 714 39 0.1 6874.2 0.6X
+Coalesce Num Partitions: 5000 Num Hosts: 40 507 511 3 0.2 5070.1 0.8X
+Coalesce Num Partitions: 5000 Num Hosts: 80 375 383 12 0.3 3749.7 1.1X
+Coalesce Num Partitions: 10000 Num Hosts: 1 10782 10833 63 0.0 107816.6 0.0X
+Coalesce Num Partitions: 10000 Num Hosts: 5 3819 3901 72 0.0 38185.3 0.1X
+Coalesce Num Partitions: 10000 Num Hosts: 10 2021 2030 12 0.0 20212.9 0.2X
+Coalesce Num Partitions: 10000 Num Hosts: 20 1138 1168 44 0.1 11378.0 0.3X
+Coalesce Num Partitions: 10000 Num Hosts: 40 716 723 9 0.1 7157.2 0.6X
+Coalesce Num Partitions: 10000 Num Hosts: 80 504 514 9 0.2 5039.9 0.8X
diff --git a/core/benchmarks/KryoBenchmark-jdk11-results.txt b/core/benchmarks/KryoBenchmark-jdk11-results.txt
index 27f0b8f59f47a..22b776c6808c8 100644
--- a/core/benchmarks/KryoBenchmark-jdk11-results.txt
+++ b/core/benchmarks/KryoBenchmark-jdk11-results.txt
@@ -2,27 +2,27 @@
Benchmark Kryo Unsafe vs safe Serialization
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-basicTypes: Int with unsafe:true 275 288 14 3.6 275.2 1.0X
-basicTypes: Long with unsafe:true 331 336 13 3.0 330.9 0.8X
-basicTypes: Float with unsafe:true 304 305 1 3.3 304.4 0.9X
-basicTypes: Double with unsafe:true 328 332 3 3.0 328.1 0.8X
-Array: Int with unsafe:true 4 4 0 252.8 4.0 69.6X
-Array: Long with unsafe:true 6 6 0 161.5 6.2 44.5X
-Array: Float with unsafe:true 4 4 0 264.6 3.8 72.8X
-Array: Double with unsafe:true 6 7 0 160.5 6.2 44.2X
-Map of string->Double with unsafe:true 52 52 0 19.3 51.8 5.3X
-basicTypes: Int with unsafe:false 344 345 1 2.9 344.3 0.8X
-basicTypes: Long with unsafe:false 372 373 1 2.7 372.3 0.7X
-basicTypes: Float with unsafe:false 333 334 1 3.0 333.4 0.8X
-basicTypes: Double with unsafe:false 344 345 0 2.9 344.3 0.8X
-Array: Int with unsafe:false 25 25 0 40.8 24.5 11.2X
-Array: Long with unsafe:false 37 37 1 27.3 36.7 7.5X
-Array: Float with unsafe:false 11 11 0 92.1 10.9 25.4X
-Array: Double with unsafe:false 17 18 0 58.3 17.2 16.0X
-Map of string->Double with unsafe:false 51 52 1 19.4 51.5 5.3X
+---------------------------------------------------------------------------------------------------------------------------
+basicTypes: Int with unsafe:true 331 346 11 3.0 330.7 1.0X
+basicTypes: Long with unsafe:true 392 401 9 2.6 392.0 0.8X
+basicTypes: Float with unsafe:true 394 404 9 2.5 394.5 0.8X
+basicTypes: Double with unsafe:true 404 414 7 2.5 404.0 0.8X
+Array: Int with unsafe:true 5 6 1 195.3 5.1 64.6X
+Array: Long with unsafe:true 8 9 1 124.1 8.1 41.0X
+Array: Float with unsafe:true 5 6 1 186.1 5.4 61.5X
+Array: Double with unsafe:true 8 9 1 126.1 7.9 41.7X
+Map of string->Double with unsafe:true 54 59 4 18.4 54.4 6.1X
+basicTypes: Int with unsafe:false 417 428 8 2.4 416.6 0.8X
+basicTypes: Long with unsafe:false 452 466 9 2.2 451.8 0.7X
+basicTypes: Float with unsafe:false 410 421 6 2.4 410.0 0.8X
+basicTypes: Double with unsafe:false 429 444 10 2.3 429.2 0.8X
+Array: Int with unsafe:false 25 27 2 39.2 25.5 13.0X
+Array: Long with unsafe:false 40 43 2 25.0 40.0 8.3X
+Array: Float with unsafe:false 11 12 1 90.7 11.0 30.0X
+Array: Double with unsafe:false 18 20 1 54.2 18.5 17.9X
+Map of string->Double with unsafe:false 55 59 2 18.0 55.5 6.0X
diff --git a/core/benchmarks/KryoBenchmark-results.txt b/core/benchmarks/KryoBenchmark-results.txt
index 49791e6e87e3a..5c38ff9cabe0e 100644
--- a/core/benchmarks/KryoBenchmark-results.txt
+++ b/core/benchmarks/KryoBenchmark-results.txt
@@ -2,27 +2,27 @@
Benchmark Kryo Unsafe vs safe Serialization
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-basicTypes: Int with unsafe:true 269 290 23 3.7 269.0 1.0X
-basicTypes: Long with unsafe:true 294 295 1 3.4 293.8 0.9X
-basicTypes: Float with unsafe:true 300 301 1 3.3 300.4 0.9X
-basicTypes: Double with unsafe:true 304 305 1 3.3 304.0 0.9X
-Array: Int with unsafe:true 5 6 1 193.5 5.2 52.0X
-Array: Long with unsafe:true 8 9 1 131.2 7.6 35.3X
-Array: Float with unsafe:true 6 6 0 163.5 6.1 44.0X
-Array: Double with unsafe:true 9 10 0 108.8 9.2 29.3X
-Map of string->Double with unsafe:true 54 54 1 18.7 53.6 5.0X
-basicTypes: Int with unsafe:false 326 327 1 3.1 326.2 0.8X
-basicTypes: Long with unsafe:false 353 354 1 2.8 353.3 0.8X
-basicTypes: Float with unsafe:false 325 327 1 3.1 325.1 0.8X
-basicTypes: Double with unsafe:false 335 336 1 3.0 335.0 0.8X
-Array: Int with unsafe:false 27 28 1 36.7 27.2 9.9X
-Array: Long with unsafe:false 40 41 1 25.0 40.0 6.7X
-Array: Float with unsafe:false 12 13 1 80.8 12.4 21.7X
-Array: Double with unsafe:false 21 21 1 48.6 20.6 13.1X
-Map of string->Double with unsafe:false 56 57 1 17.8 56.1 4.8X
+---------------------------------------------------------------------------------------------------------------------------
+basicTypes: Int with unsafe:true 286 295 6 3.5 285.6 1.0X
+basicTypes: Long with unsafe:true 320 327 5 3.1 319.8 0.9X
+basicTypes: Float with unsafe:true 314 318 2 3.2 313.5 0.9X
+basicTypes: Double with unsafe:true 310 319 6 3.2 309.9 0.9X
+Array: Int with unsafe:true 5 6 1 217.9 4.6 62.2X
+Array: Long with unsafe:true 8 9 1 121.5 8.2 34.7X
+Array: Float with unsafe:true 5 6 0 217.5 4.6 62.1X
+Array: Double with unsafe:true 9 9 1 117.3 8.5 33.5X
+Map of string->Double with unsafe:true 51 52 1 19.7 50.9 5.6X
+basicTypes: Int with unsafe:false 373 384 12 2.7 373.3 0.8X
+basicTypes: Long with unsafe:false 373 391 9 2.7 372.6 0.8X
+basicTypes: Float with unsafe:false 349 362 11 2.9 349.2 0.8X
+basicTypes: Double with unsafe:false 358 372 10 2.8 358.0 0.8X
+Array: Int with unsafe:false 25 29 1 40.0 25.0 11.4X
+Array: Long with unsafe:false 35 39 1 28.2 35.5 8.1X
+Array: Float with unsafe:false 10 12 1 100.2 10.0 28.6X
+Array: Double with unsafe:false 17 20 1 58.3 17.1 16.7X
+Map of string->Double with unsafe:false 48 53 2 20.6 48.5 5.9X
diff --git a/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt b/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt
index 6b148bde12d36..41ac5386754a4 100644
--- a/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt
+++ b/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt
@@ -2,11 +2,11 @@
Benchmark KryoPool vs old"pool of 1" implementation
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-KryoPool:true 6208 8374 NaN 0.0 12416876.6 1.0X
-KryoPool:false 9084 11577 724 0.0 18168947.4 0.7X
+-----------------------------------------------------------------------------------------------------------------------------------
+KryoPool:true 10053 13422 983 0.0 20106810.4 1.0X
+KryoPool:false 16854 19164 NaN 0.0 33708260.0 0.6X
diff --git a/core/benchmarks/KryoSerializerBenchmark-results.txt b/core/benchmarks/KryoSerializerBenchmark-results.txt
index 609f3298cbc00..c10f7dc415982 100644
--- a/core/benchmarks/KryoSerializerBenchmark-results.txt
+++ b/core/benchmarks/KryoSerializerBenchmark-results.txt
@@ -2,11 +2,11 @@
Benchmark KryoPool vs old"pool of 1" implementation
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-KryoPool:true 6012 7586 NaN 0.0 12023020.2 1.0X
-KryoPool:false 9289 11566 909 0.0 18578683.1 0.6X
+-----------------------------------------------------------------------------------------------------------------------------------
+KryoPool:true 7893 10556 NaN 0.0 15785307.8 1.0X
+KryoPool:false 12778 15741 426 0.0 25555753.8 0.6X
diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt
index db23cf5c12ea7..29699a2fdcf2a 100644
--- a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt
+++ b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt
@@ -1,64 +1,64 @@
-OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 170 178 9 1.2 849.7 1.0X
-Deserialization 530 535 9 0.4 2651.1 0.3X
+-------------------------------------------------------------------------------------------------------------------------
+Serialization 179 194 9 1.1 897.4 1.0X
+Deserialization 254 321 74 0.8 1271.0 0.7X
-Compressed Serialized MapStatus sizes: 411 bytes
+Compressed Serialized MapStatus sizes: 409 bytes
Compressed Serialized Broadcast MapStatus sizes: 2 MB
-OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 157 165 7 1.3 785.4 1.0X
-Deserialization 495 588 79 0.4 2476.7 0.3X
+--------------------------------------------------------------------------------------------------------------------------
+Serialization 160 166 7 1.2 801.2 1.0X
+Deserialization 256 323 69 0.8 1278.9 0.6X
Compressed Serialized MapStatus sizes: 2 MB
Compressed Serialized Broadcast MapStatus sizes: 0 bytes
-OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 344 351 4 0.6 1720.4 1.0X
-Deserialization 527 579 99 0.4 2635.9 0.7X
+--------------------------------------------------------------------------------------------------------------------------
+Serialization 341 349 7 0.6 1707.3 1.0X
+Deserialization 286 370 84 0.7 1431.4 1.2X
-Compressed Serialized MapStatus sizes: 427 bytes
+Compressed Serialized MapStatus sizes: 426 bytes
Compressed Serialized Broadcast MapStatus sizes: 13 MB
-OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 317 321 4 0.6 1583.8 1.0X
-Deserialization 530 540 15 0.4 2648.3 0.6X
+---------------------------------------------------------------------------------------------------------------------------
+Serialization 309 319 11 0.6 1543.6 1.0X
+Deserialization 286 373 117 0.7 1429.5 1.1X
Compressed Serialized MapStatus sizes: 13 MB
Compressed Serialized Broadcast MapStatus sizes: 0 bytes
-OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 1738 1849 156 0.1 8692.0 1.0X
-Deserialization 946 977 33 0.2 4730.2 1.8X
+---------------------------------------------------------------------------------------------------------------------------
+Serialization 1619 1627 12 0.1 8092.6 1.0X
+Deserialization 864 883 26 0.2 4319.9 1.9X
-Compressed Serialized MapStatus sizes: 556 bytes
+Compressed Serialized MapStatus sizes: 557 bytes
Compressed Serialized Broadcast MapStatus sizes: 121 MB
-OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 1379 1432 76 0.1 6892.6 1.0X
-Deserialization 929 941 19 0.2 4645.5 1.5X
+----------------------------------------------------------------------------------------------------------------------------
+Serialization 1449 1456 9 0.1 7246.8 1.0X
+Deserialization 853 888 46 0.2 4263.7 1.7X
Compressed Serialized MapStatus sizes: 121 MB
Compressed Serialized Broadcast MapStatus sizes: 0 bytes
diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt
index 053f4bf771923..96fa3a01a8f6d 100644
--- a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt
+++ b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt
@@ -1,64 +1,64 @@
-OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 178 187 15 1.1 887.5 1.0X
-Deserialization 530 558 32 0.4 2647.5 0.3X
+-------------------------------------------------------------------------------------------------------------------------
+Serialization 135 161 56 1.5 673.9 1.0X
+Deserialization 213 235 26 0.9 1065.6 0.6X
-Compressed Serialized MapStatus sizes: 411 bytes
+Compressed Serialized MapStatus sizes: 409 bytes
Compressed Serialized Broadcast MapStatus sizes: 2 MB
-OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 167 175 7 1.2 835.7 1.0X
-Deserialization 523 537 22 0.4 2616.2 0.3X
+--------------------------------------------------------------------------------------------------------------------------
+Serialization 130 137 5 1.5 650.8 1.0X
+Deserialization 211 230 20 0.9 1056.5 0.6X
Compressed Serialized MapStatus sizes: 2 MB
Compressed Serialized Broadcast MapStatus sizes: 0 bytes
-OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 351 416 147 0.6 1754.4 1.0X
-Deserialization 546 551 8 0.4 2727.6 0.6X
+--------------------------------------------------------------------------------------------------------------------------
+Serialization 281 324 86 0.7 1406.7 1.0X
+Deserialization 240 267 32 0.8 1200.5 1.2X
-Compressed Serialized MapStatus sizes: 427 bytes
+Compressed Serialized MapStatus sizes: 426 bytes
Compressed Serialized Broadcast MapStatus sizes: 13 MB
-OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 320 321 1 0.6 1598.0 1.0X
-Deserialization 542 549 7 0.4 2709.0 0.6X
+---------------------------------------------------------------------------------------------------------------------------
+Serialization 265 273 6 0.8 1324.5 1.0X
+Deserialization 247 276 33 0.8 1236.1 1.1X
Compressed Serialized MapStatus sizes: 13 MB
Compressed Serialized Broadcast MapStatus sizes: 0 bytes
-OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 1671 1877 290 0.1 8357.3 1.0X
-Deserialization 943 970 32 0.2 4715.8 1.8X
+---------------------------------------------------------------------------------------------------------------------------
+Serialization 1333 1592 366 0.2 6666.0 1.0X
+Deserialization 560 585 22 0.4 2799.1 2.4X
-Compressed Serialized MapStatus sizes: 556 bytes
+Compressed Serialized MapStatus sizes: 558 bytes
Compressed Serialized Broadcast MapStatus sizes: 121 MB
-OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
-Serialization 1373 1436 89 0.1 6865.0 1.0X
-Deserialization 940 970 37 0.2 4699.1 1.5X
+----------------------------------------------------------------------------------------------------------------------------
+Serialization 1222 1260 54 0.2 6111.7 1.0X
+Deserialization 539 568 42 0.4 2695.3 2.3X
Compressed Serialized MapStatus sizes: 121 MB
Compressed Serialized Broadcast MapStatus sizes: 0 bytes
diff --git a/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt b/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt
index 605b856d53382..f1d6b1ff372bc 100644
--- a/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt
+++ b/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt
@@ -2,39 +2,39 @@
Properties Cloning
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 0 0 0 0.1 11539.0 1.0X
-Utils.cloneProperties 0 0 0 1.7 572.0 20.2X
+SerializationUtils.clone 0 0 0 0.2 4800.0 1.0X
+Utils.cloneProperties 0 0 0 Infinity 0.0 InfinityX
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 0 0 0 0.0 217514.0 1.0X
-Utils.cloneProperties 0 0 0 0.2 5387.0 40.4X
+SerializationUtils.clone 0 0 0 0.0 202203.0 1.0X
+Utils.cloneProperties 0 0 0 0.1 6700.0 30.2X
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 1 1 0 0.0 634574.0 1.0X
-Utils.cloneProperties 0 0 0 0.3 3082.0 205.9X
+SerializationUtils.clone 1 1 0 0.0 588099.0 1.0X
+Utils.cloneProperties 0 0 0 0.2 5699.0 103.2X
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 3 3 0 0.0 2576565.0 1.0X
-Utils.cloneProperties 0 0 0 0.1 16071.0 160.3X
+SerializationUtils.clone 2 3 0 0.0 2357927.0 1.0X
+Utils.cloneProperties 0 0 0 0.0 31901.0 73.9X
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 5 5 0 0.0 5027248.0 1.0X
-Utils.cloneProperties 0 0 0 0.0 31842.0 157.9X
+SerializationUtils.clone 5 5 0 0.0 4636068.0 1.0X
+Utils.cloneProperties 0 0 0 0.0 64701.0 71.7X
diff --git a/core/benchmarks/PropertiesCloneBenchmark-results.txt b/core/benchmarks/PropertiesCloneBenchmark-results.txt
index 5d332a147c698..81ccff6e638c8 100644
--- a/core/benchmarks/PropertiesCloneBenchmark-results.txt
+++ b/core/benchmarks/PropertiesCloneBenchmark-results.txt
@@ -2,39 +2,39 @@
Properties Cloning
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 0 0 0 0.1 13640.0 1.0X
-Utils.cloneProperties 0 0 0 1.6 608.0 22.4X
+SerializationUtils.clone 0 0 0 0.2 5599.0 1.0X
+Utils.cloneProperties 0 0 0 Infinity 0.0 InfinityX
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 0 0 0 0.0 238968.0 1.0X
-Utils.cloneProperties 0 0 0 0.4 2318.0 103.1X
+SerializationUtils.clone 0 0 0 0.0 339506.0 1.0X
+Utils.cloneProperties 0 0 0 0.5 1900.0 178.7X
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 1 1 0 0.0 725849.0 1.0X
-Utils.cloneProperties 0 0 0 0.3 2900.0 250.3X
+SerializationUtils.clone 1 1 0 0.0 604411.0 1.0X
+Utils.cloneProperties 0 0 0 0.2 5100.0 118.5X
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 3 3 0 0.0 2999676.0 1.0X
-Utils.cloneProperties 0 0 0 0.1 11734.0 255.6X
+SerializationUtils.clone 2 2 0 0.0 2378345.0 1.0X
+Utils.cloneProperties 0 0 0 0.0 25100.0 94.8X
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-SerializationUtils.clone 6 6 1 0.0 5846410.0 1.0X
-Utils.cloneProperties 0 0 0 0.0 22405.0 260.9X
+SerializationUtils.clone 4 4 0 0.0 4035677.0 1.0X
+Utils.cloneProperties 0 0 0 0.0 43300.0 93.2X
diff --git a/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt b/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt
index 9aa10e4835a2f..0b13462c59b51 100644
--- a/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt
+++ b/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt
@@ -2,43 +2,43 @@
Pseudo random
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 1362 1362 0 73.4 13.6 1.0X
-XORShiftRandom 227 227 0 440.6 2.3 6.0X
+java.util.Random 1414 1423 10 70.7 14.1 1.0X
+XORShiftRandom 234 238 3 426.9 2.3 6.0X
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 2725 2726 1 36.7 27.3 1.0X
-XORShiftRandom 694 694 1 144.1 6.9 3.9X
+java.util.Random 2669 2699 26 37.5 26.7 1.0X
+XORShiftRandom 622 633 12 160.7 6.2 4.3X
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 2727 2728 0 36.7 27.3 1.0X
-XORShiftRandom 693 694 0 144.2 6.9 3.9X
+java.util.Random 2613 2712 98 38.3 26.1 1.0X
+XORShiftRandom 667 675 7 149.9 6.7 3.9X
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 7012 7016 4 14.3 70.1 1.0X
-XORShiftRandom 6065 6067 1 16.5 60.7 1.2X
+java.util.Random 6687 6757 61 15.0 66.9 1.0X
+XORShiftRandom 4882 4892 9 20.5 48.8 1.4X
================================================================================================
hash seed
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-XORShiftRandom.hashSeed 36 37 1 276.5 3.6 1.0X
+XORShiftRandom.hashSeed 40 42 3 247.4 4.0 1.0X
diff --git a/core/benchmarks/XORShiftRandomBenchmark-results.txt b/core/benchmarks/XORShiftRandomBenchmark-results.txt
index 4b069878b2e9b..87093eaabd7ea 100644
--- a/core/benchmarks/XORShiftRandomBenchmark-results.txt
+++ b/core/benchmarks/XORShiftRandomBenchmark-results.txt
@@ -2,43 +2,43 @@
Pseudo random
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 1362 1396 59 73.4 13.6 1.0X
-XORShiftRandom 227 227 0 440.7 2.3 6.0X
+java.util.Random 1192 1217 24 83.9 11.9 1.0X
+XORShiftRandom 193 196 4 518.9 1.9 6.2X
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 2732 2732 1 36.6 27.3 1.0X
-XORShiftRandom 630 630 1 158.7 6.3 4.3X
+java.util.Random 2391 2402 10 41.8 23.9 1.0X
+XORShiftRandom 518 528 11 193.1 5.2 4.6X
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 2731 2732 1 36.6 27.3 1.0X
-XORShiftRandom 630 630 0 158.8 6.3 4.3X
+java.util.Random 2318 2363 39 43.1 23.2 1.0X
+XORShiftRandom 488 496 13 205.0 4.9 4.8X
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-java.util.Random 8895 8899 4 11.2 88.9 1.0X
-XORShiftRandom 5049 5052 5 19.8 50.5 1.8X
+java.util.Random 6102 6156 77 16.4 61.0 1.0X
+XORShiftRandom 3685 3758 84 27.1 36.9 1.7X
================================================================================================
hash seed
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-XORShiftRandom.hashSeed 67 68 1 148.8 6.7 1.0X
+XORShiftRandom.hashSeed 65 66 2 155.0 6.5 1.0X
diff --git a/core/benchmarks/ZStandardBenchmark-jdk11-results.txt b/core/benchmarks/ZStandardBenchmark-jdk11-results.txt
new file mode 100644
index 0000000000000..3895e7bb6f27d
--- /dev/null
+++ b/core/benchmarks/ZStandardBenchmark-jdk11-results.txt
@@ -0,0 +1,27 @@
+================================================================================================
+Benchmark ZStandardCompressionCodec
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+--------------------------------------------------------------------------------------------------------------------------------------
+Compression 10000 times at level 1 without buffer pool 606 614 6 0.0 60645.3 1.0X
+Compression 10000 times at level 2 without buffer pool 686 693 7 0.0 68594.9 0.9X
+Compression 10000 times at level 3 without buffer pool 906 920 14 0.0 90642.7 0.7X
+Compression 10000 times at level 1 with buffer pool 389 403 20 0.0 38901.4 1.6X
+Compression 10000 times at level 2 with buffer pool 450 466 13 0.0 45032.0 1.3X
+Compression 10000 times at level 3 with buffer pool 680 682 2 0.0 68004.2 0.9X
+
+OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------------------------
+Decompression 10000 times from level 1 without buffer pool 1209 1226 25 0.0 120862.8 1.0X
+Decompression 10000 times from level 2 without buffer pool 1191 1193 3 0.0 119064.9 1.0X
+Decompression 10000 times from level 3 without buffer pool 1188 1193 6 0.0 118843.3 1.0X
+Decompression 10000 times from level 1 with buffer pool 998 1004 9 0.0 99754.7 1.2X
+Decompression 10000 times from level 2 with buffer pool 990 1001 11 0.0 99043.8 1.2X
+Decompression 10000 times from level 3 with buffer pool 983 999 20 0.0 98269.5 1.2X
+
+
diff --git a/core/benchmarks/ZStandardBenchmark-results.txt b/core/benchmarks/ZStandardBenchmark-results.txt
new file mode 100644
index 0000000000000..6990c28690072
--- /dev/null
+++ b/core/benchmarks/ZStandardBenchmark-results.txt
@@ -0,0 +1,27 @@
+================================================================================================
+Benchmark ZStandardCompressionCodec
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+--------------------------------------------------------------------------------------------------------------------------------------
+Compression 10000 times at level 1 without buffer pool 670 681 9 0.0 67011.0 1.0X
+Compression 10000 times at level 2 without buffer pool 569 571 2 0.0 56932.0 1.2X
+Compression 10000 times at level 3 without buffer pool 748 751 2 0.0 74813.8 0.9X
+Compression 10000 times at level 1 with buffer pool 336 337 1 0.0 33630.6 2.0X
+Compression 10000 times at level 2 with buffer pool 395 397 2 0.0 39472.6 1.7X
+Compression 10000 times at level 3 with buffer pool 563 567 4 0.0 56272.8 1.2X
+
+OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------------------------
+Decompression 10000 times from level 1 without buffer pool 1029 1031 3 0.0 102887.4 1.0X
+Decompression 10000 times from level 2 without buffer pool 1028 1031 4 0.0 102847.8 1.0X
+Decompression 10000 times from level 3 without buffer pool 1029 1029 0 0.0 102941.0 1.0X
+Decompression 10000 times from level 1 with buffer pool 798 799 0 0.0 79838.0 1.3X
+Decompression 10000 times from level 2 with buffer pool 799 799 0 0.0 79852.9 1.3X
+Decompression 10000 times from level 3 with buffer pool 796 798 2 0.0 79630.5 1.3X
+
+
diff --git a/core/pom.xml b/core/pom.xml
index b0f68880f1d8a..ec80807f880e2 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.1.0-SNAPSHOT</version>
+    <version>3.2.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
@@ -35,10 +35,6 @@
-    <dependency>
-      <groupId>com.thoughtworks.paranamer</groupId>
-      <artifactId>paranamer</artifactId>
-    </dependency>
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro</artifactId>
@@ -46,7 +42,6 @@
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro-mapred</artifactId>
-      <classifier>${avro.mapred.classifier}</classifier>
     </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
@@ -66,7 +61,13 @@
     <dependency>
       <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-client</artifactId>
+      <artifactId>${hadoop-client-api.artifact}</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>${hadoop-client-runtime.artifact}</artifactId>
+      <version>${hadoop.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -161,9 +162,13 @@
       <scope>compile</scope>
     </dependency>
     <dependency>
-      <groupId>javax.servlet</groupId>
-      <artifactId>javax.servlet-api</artifactId>
-      <version>${javaxservlet.version}</version>
+      <groupId>jakarta.servlet</groupId>
+      <artifactId>jakarta.servlet-api</artifactId>
+      <version>${jakartaservlet.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
@@ -177,6 +182,14 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-text</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>commons-collections</groupId>
+      <artifactId>commons-collections</artifactId>
+    </dependency>
     <dependency>
       <groupId>com.google.code.findbugs</groupId>
       <artifactId>jsr305</artifactId>
@@ -334,7 +347,7 @@
     <dependency>
       <groupId>org.seleniumhq.selenium</groupId>
-      <artifactId>selenium-htmlunit-driver</artifactId>
+      <artifactId>htmlunit-driver</artifactId>
       <scope>test</scope>
     </dependency>
@@ -414,7 +427,7 @@
       <groupId>net.sf.py4j</groupId>
       <artifactId>py4j</artifactId>
-      <version>0.10.9</version>
+      <version>0.10.9.2</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -447,6 +460,47 @@
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-aws</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-common</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+          <artifactId>jackson-mapper-asl</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+          <artifactId>jackson-core-asl</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-databind</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-annotations</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.amazonaws</groupId>
+          <artifactId>aws-java-sdk</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-crypto</artifactId>
diff --git a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java
index 731f6fc767dfd..7cb2455affe48 100644
--- a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java
+++ b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java
@@ -17,6 +17,7 @@
package org.apache.spark;
+import org.apache.spark.annotation.DeveloperApi;
import org.apache.spark.scheduler.*;
/**
@@ -27,7 +28,11 @@
* new methods to SparkListener: forgetting to add a method will result in a compilation error (if
* this was a concrete Scala class, default implementations of new event handlers would be inherited
* from the SparkListener trait).
+ *
+ * Please note that until Spark 3.1.0 this class was missing the DeveloperApi annotation; this
+ * needs to be taken into account if changing this API before a major release.
*/
+@DeveloperApi
public class SparkFirehoseListener implements SparkListenerInterface {
public void onEvent(SparkListenerEvent event) { }
@@ -124,34 +129,67 @@ public final void onExecutorBlacklisted(SparkListenerExecutorBlacklisted executo
onEvent(executorBlacklisted);
}
+ @Override
+ public final void onExecutorExcluded(SparkListenerExecutorExcluded executorExcluded) {
+ onEvent(executorExcluded);
+ }
+
@Override
public void onExecutorBlacklistedForStage(
SparkListenerExecutorBlacklistedForStage executorBlacklistedForStage) {
onEvent(executorBlacklistedForStage);
}
+ @Override
+ public void onExecutorExcludedForStage(
+ SparkListenerExecutorExcludedForStage executorExcludedForStage) {
+ onEvent(executorExcludedForStage);
+ }
+
@Override
public void onNodeBlacklistedForStage(
SparkListenerNodeBlacklistedForStage nodeBlacklistedForStage) {
onEvent(nodeBlacklistedForStage);
}
+ @Override
+ public void onNodeExcludedForStage(
+ SparkListenerNodeExcludedForStage nodeExcludedForStage) {
+ onEvent(nodeExcludedForStage);
+ }
+
@Override
public final void onExecutorUnblacklisted(
SparkListenerExecutorUnblacklisted executorUnblacklisted) {
onEvent(executorUnblacklisted);
}
+ @Override
+ public final void onExecutorUnexcluded(
+ SparkListenerExecutorUnexcluded executorUnexcluded) {
+ onEvent(executorUnexcluded);
+ }
+
@Override
public final void onNodeBlacklisted(SparkListenerNodeBlacklisted nodeBlacklisted) {
onEvent(nodeBlacklisted);
}
+ @Override
+ public final void onNodeExcluded(SparkListenerNodeExcluded nodeExcluded) {
+ onEvent(nodeExcluded);
+ }
+
@Override
public final void onNodeUnblacklisted(SparkListenerNodeUnblacklisted nodeUnblacklisted) {
onEvent(nodeUnblacklisted);
}
+ @Override
+ public final void onNodeUnexcluded(SparkListenerNodeUnexcluded nodeUnexcluded) {
+ onEvent(nodeUnexcluded);
+ }
+
@Override
public void onBlockUpdated(SparkListenerBlockUpdated blockUpdated) {
onEvent(blockUpdated);
@@ -162,6 +200,21 @@ public void onSpeculativeTaskSubmitted(SparkListenerSpeculativeTaskSubmitted spe
onEvent(speculativeTask);
}
+ public void onUnschedulableTaskSetAdded(
+ SparkListenerUnschedulableTaskSetAdded unschedulableTaskSetAdded) {
+ onEvent(unschedulableTaskSetAdded);
+ }
+
+ public void onUnschedulableTaskSetRemoved(
+ SparkListenerUnschedulableTaskSetRemoved unschedulableTaskSetRemoved) {
+ onEvent(unschedulableTaskSetRemoved);
+ }
+
+ @Override
+ public void onResourceProfileAdded(SparkListenerResourceProfileAdded event) {
+ onEvent(event);
+ }
+
@Override
public void onOtherEvent(SparkListenerEvent event) {
onEvent(event);
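
Since SparkFirehoseListener forwards every SparkListenerInterface callback, including the new *Excluded/*Unexcluded and resource-profile events above, to the single onEvent() method, a subclass only needs to override that one hook. A minimal sketch; the EventCountingListener name and the console reporting are illustrative only, and such a listener would typically be registered via spark.extraListeners:

import org.apache.spark.SparkFirehoseListener;
import org.apache.spark.scheduler.SparkListenerEvent;
import org.apache.spark.scheduler.SparkListenerJobEnd;

public class EventCountingListener extends SparkFirehoseListener {
  private long eventCount = 0;

  @Override
  public void onEvent(SparkListenerEvent event) {
    // Every listener callback, old or new, funnels through here.
    eventCount++;
    if (event instanceof SparkListenerJobEnd) {
      System.out.println("Observed " + eventCount + " listener events so far");
    }
  }
}
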
diff --git a/core/src/main/java/org/apache/spark/api/java/StorageLevels.java b/core/src/main/java/org/apache/spark/api/java/StorageLevels.java
index 3fcb52f615834..b51cde48e632b 100644
--- a/core/src/main/java/org/apache/spark/api/java/StorageLevels.java
+++ b/core/src/main/java/org/apache/spark/api/java/StorageLevels.java
@@ -26,6 +26,7 @@ public class StorageLevels {
public static final StorageLevel NONE = create(false, false, false, false, 1);
public static final StorageLevel DISK_ONLY = create(true, false, false, false, 1);
public static final StorageLevel DISK_ONLY_2 = create(true, false, false, false, 2);
+ public static final StorageLevel DISK_ONLY_3 = create(true, false, false, false, 3);
public static final StorageLevel MEMORY_ONLY = create(false, true, false, true, 1);
public static final StorageLevel MEMORY_ONLY_2 = create(false, true, false, true, 2);
public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, false, 1);
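
DISK_ONLY_3 mirrors the existing DISK_ONLY_2 pattern but replicates the serialized, on-disk partitions to three nodes. A small sketch of persisting a JavaRDD with the new level; the class and application names are made up, and local[2] only exercises the API rather than achieving real 3x replication:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.StorageLevels;

public class DiskOnly3Example {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[2]", "disk-only-3-example");
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    // Keep the partitions on disk only, replicated to three nodes.
    rdd.persist(StorageLevels.DISK_ONLY_3);
    System.out.println(rdd.count());
    sc.stop();
  }
}
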
diff --git a/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java b/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java
index 0c0d0df8ae682..1d676ff781c70 100644
--- a/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java
+++ b/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java
@@ -41,7 +41,7 @@ public interface DriverPlugin {
* initialization.
*
* It's recommended that plugins be careful about what operations are performed in this call,
- * preferrably performing expensive operations in a separate thread, or postponing them until
+ * preferably performing expensive operations in a separate thread, or postponing them until
* the application has fully started.
*
* @param sc The SparkContext loading the plugin.
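
Following the recommendation above to keep DriverPlugin.init() cheap, a plugin can push slow warm-up work onto a background thread. A hedged sketch; WarmUpDriverPlugin and the warm-up work it performs are hypothetical:

import java.util.Collections;
import java.util.Map;

import org.apache.spark.SparkContext;
import org.apache.spark.api.plugin.DriverPlugin;
import org.apache.spark.api.plugin.PluginContext;

public class WarmUpDriverPlugin implements DriverPlugin {
  private Thread warmUpThread;

  @Override
  public Map<String, String> init(SparkContext sc, PluginContext pluginContext) {
    // Keep the init path fast; defer expensive setup to a daemon thread.
    warmUpThread = new Thread(() -> {
      // Expensive work (cache priming, connection pools, ...) would go here.
    }, "driver-plugin-warm-up");
    warmUpThread.setDaemon(true);
    warmUpThread.start();
    // The returned map is passed to the executor plugins' init() as extraConf.
    return Collections.emptyMap();
  }

  @Override
  public void shutdown() {
    if (warmUpThread != null) {
      warmUpThread.interrupt();
    }
  }
}
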
diff --git a/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java b/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java
index 4961308035163..481bf985f1c6c 100644
--- a/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java
+++ b/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java
@@ -19,6 +19,7 @@
import java.util.Map;
+import org.apache.spark.TaskFailedReason;
import org.apache.spark.annotation.DeveloperApi;
/**
@@ -54,4 +55,45 @@ default void init(PluginContext ctx, Map<String, String> extraConf) {}
*/
default void shutdown() {}
+ /**
+ * Perform any action before the task is run.
+ *
+ * This method is invoked from the same thread in which the task will be executed.
+ * Task-specific information can be accessed via {@link org.apache.spark.TaskContext#get}.
+ *
+ * Plugin authors should avoid expensive operations here, as this method will be called
+ * on every task, and doing something expensive can significantly slow down a job.
+ * It is not recommended for a user to call a remote service, for example.
+ *
+ * Exceptions thrown from this method do not propagate - they're caught,
+ * logged, and suppressed. Therefore, exceptions thrown while executing this method will not
+ * cause the job to fail.
+ *
+ * @since 3.1.0
+ */
+ default void onTaskStart() {}
+
+ /**
+ * Perform an action after a task completes without exceptions.
+ *
+ * Because exceptions from {@link #onTaskStart() onTaskStart} are suppressed, this method
+ * will still be invoked even if the corresponding {@link #onTaskStart} call for this
+ * task failed.
+ *
+ * The same warnings as for {@link #onTaskStart() onTaskStart} apply here.
+ *
+ * @since 3.1.0
+ */
+ default void onTaskSucceeded() {}
+
+ /**
+ * Perform an action after a task completes with exceptions.
+ *
* This can also close any resources and clean up temporary state if necessary.
*
- * The returned array should contain, for each partition from (0) to (numPartitions - 1), the
- * number of bytes written by the partition writer for that partition id.
+ * The returned commit message is a structure with two components:
+ *
+ * 1) An array of longs, which should contain, for each partition from (0) to
+ * (numPartitions - 1), the number of bytes written by the partition writer
+ * for that partition id.
+ *
+ * 2) An optional metadata blob that can be used by shuffle readers.
*/
- long[] commitAllPartitions() throws IOException;
+ MapOutputCommitMessage commitAllPartitions() throws IOException;
/**
* Abort all of the writes done by any writers returned by {@link #getPartitionWriter(int)}.
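
The new ExecutorPlugin hooks above (onTaskStart, onTaskSucceeded) run on the task's own thread and for every task, so any work done in them should stay cheap. A rough sketch of a plugin that only bumps in-memory counters; the TaskCountingExecutorPlugin name and what shutdown() would do with the counters are assumptions:

import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.spark.api.plugin.ExecutorPlugin;
import org.apache.spark.api.plugin.PluginContext;

public class TaskCountingExecutorPlugin implements ExecutorPlugin {
  private final AtomicLong started = new AtomicLong();
  private final AtomicLong succeeded = new AtomicLong();

  @Override
  public void init(PluginContext ctx, Map<String, String> extraConf) {
    // No expensive setup; extraConf comes from the driver plugin's init().
  }

  @Override
  public void onTaskStart() {
    // Invoked on the task's thread before the task runs; keep it cheap.
    started.incrementAndGet();
  }

  @Override
  public void onTaskSucceeded() {
    succeeded.incrementAndGet();
  }

  @Override
  public void shutdown() {
    // Counters could be reported through the plugin's metric registry here.
  }
}
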
diff --git a/core/src/main/java/org/apache/spark/shuffle/api/WritableByteChannelWrapper.java b/core/src/main/java/org/apache/spark/shuffle/api/WritableByteChannelWrapper.java
index a204903008a51..04a75dde97cb4 100644
--- a/core/src/main/java/org/apache/spark/shuffle/api/WritableByteChannelWrapper.java
+++ b/core/src/main/java/org/apache/spark/shuffle/api/WritableByteChannelWrapper.java
@@ -24,7 +24,6 @@
/**
* :: Private ::
- *
* A thin wrapper around a {@link WritableByteChannel}.
*
* This is primarily provided for the local disk shuffle implementation to provide a
diff --git a/core/src/main/java/org/apache/spark/shuffle/api/metadata/MapOutputCommitMessage.java b/core/src/main/java/org/apache/spark/shuffle/api/metadata/MapOutputCommitMessage.java
new file mode 100644
index 0000000000000..c5ded5e75a2d7
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/shuffle/api/metadata/MapOutputCommitMessage.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.api.metadata;
+
+import java.util.Optional;
+
+import org.apache.spark.annotation.Private;
+
+/**
+ * :: Private ::
+ * Represents the result of writing map outputs for a shuffle map task.
+ *
+ * Partition lengths represent the length of each block written in the map task. This can
+ * be used for downstream readers to allocate resources, such as in-memory buffers.
+ *
+ * Map output writers can choose to attach arbitrary metadata tags to register with a
+ * shuffle output tracker (a module that has not yet been built and is planned for a future
+ * iteration of the shuffle storage APIs).
+ */
+@Private
+public final class MapOutputCommitMessage {
+
+ private final long[] partitionLengths;
+ private final Optional<MapOutputMetadata> mapOutputMetadata;
+
+ private MapOutputCommitMessage(
+ long[] partitionLengths, Optional<MapOutputMetadata> mapOutputMetadata) {
+ this.partitionLengths = partitionLengths;
+ this.mapOutputMetadata = mapOutputMetadata;
+ }
+
+ public static MapOutputCommitMessage of(long[] partitionLengths) {
+ return new MapOutputCommitMessage(partitionLengths, Optional.empty());
+ }
+
+ public static MapOutputCommitMessage of(
+ long[] partitionLengths, MapOutputMetadata mapOutputMetadata) {
+ return new MapOutputCommitMessage(partitionLengths, Optional.of(mapOutputMetadata));
+ }
+
+ public long[] getPartitionLengths() {
+ return partitionLengths;
+ }
+
+ public Optional<MapOutputMetadata> getMapOutputMetadata() {
+ return mapOutputMetadata;
+ }
+}
diff --git a/core/src/main/java/org/apache/spark/shuffle/api/metadata/MapOutputMetadata.java b/core/src/main/java/org/apache/spark/shuffle/api/metadata/MapOutputMetadata.java
new file mode 100644
index 0000000000000..f509686621602
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/shuffle/api/metadata/MapOutputMetadata.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.api.metadata;
+
+import java.io.Serializable;
+
+/**
+ * :: Private ::
+ * An opaque metadata tag for registering the result of committing the output of a
+ * shuffle map task.
+ *
+ * All implementations must be serializable since this is sent from the executors to
+ * the driver.
+ */
+public interface MapOutputMetadata extends Serializable {}
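
Together, the two new files let a shuffle plugin return the per-partition lengths plus an optional, opaque metadata blob from commitAllPartitions(). A sketch of building such a commit message; RemoteBlockMetadata and the object-store path are purely hypothetical:

import org.apache.spark.shuffle.api.metadata.MapOutputCommitMessage;
import org.apache.spark.shuffle.api.metadata.MapOutputMetadata;

// Hypothetical metadata a plugin might attach, e.g. where the map output was stored.
class RemoteBlockMetadata implements MapOutputMetadata {
  private static final long serialVersionUID = 1L;
  final String objectStorePath;

  RemoteBlockMetadata(String objectStorePath) {
    this.objectStorePath = objectStorePath;
  }
}

class CommitMessageSketch {
  static MapOutputCommitMessage commit(long[] partitionLengths, String location) {
    if (location == null) {
      // Lengths only, no metadata attached.
      return MapOutputCommitMessage.of(partitionLengths);
    }
    // Lengths plus an opaque tag for a (future) shuffle output tracker.
    return MapOutputCommitMessage.of(partitionLengths, new RemoteBlockMetadata(location));
  }
}
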
diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
index dc157eaa3b253..3dbee1b13d287 100644
--- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
@@ -31,7 +31,6 @@
import scala.Tuple2;
import scala.collection.Iterator;
-import com.google.common.annotations.VisibleForTesting;
import com.google.common.io.Closeables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -130,7 +129,7 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
.createMapOutputWriter(shuffleId, mapId, numPartitions);
try {
if (!records.hasNext()) {
- partitionLengths = mapOutputWriter.commitAllPartitions();
+ partitionLengths = mapOutputWriter.commitAllPartitions().getPartitionLengths();
mapStatus = MapStatus$.MODULE$.apply(
blockManager.shuffleServerId(), partitionLengths, mapId);
return;
@@ -178,8 +177,8 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
}
}
- @VisibleForTesting
- long[] getPartitionLengths() {
+ @Override
+ public long[] getPartitionLengths() {
return partitionLengths;
}
@@ -219,7 +218,7 @@ private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) thro
}
partitionWriters = null;
}
- return mapOutputWriter.commitAllPartitions();
+ return mapOutputWriter.commitAllPartitions().getPartitionLengths();
}
private void writePartitionedDataWithChannel(
diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java
index d09282e61a9c7..e8f94ba8ffeee 100644
--- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java
@@ -18,6 +18,7 @@
package org.apache.spark.shuffle.sort;
import java.nio.channels.Channels;
+import java.util.Arrays;
import java.util.Optional;
import javax.annotation.Nullable;
import java.io.*;
@@ -87,6 +88,7 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
@Nullable private MapStatus mapStatus;
@Nullable private ShuffleExternalSorter sorter;
+ @Nullable private long[] partitionLengths;
private long peakMemoryUsedBytes = 0;
/** Subclass of ByteArrayOutputStream that exposes `buf` directly. */
@@ -218,7 +220,6 @@ void closeAndWriteOutput() throws IOException {
serOutputStream = null;
final SpillInfo[] spills = sorter.closeAndGetSpills();
sorter = null;
- final long[] partitionLengths;
try {
partitionLengths = mergeSpills(spills);
} finally {
@@ -266,7 +267,7 @@ private long[] mergeSpills(SpillInfo[] spills) throws IOException {
if (spills.length == 0) {
final ShuffleMapOutputWriter mapWriter = shuffleExecutorComponents
.createMapOutputWriter(shuffleId, mapId, partitioner.numPartitions());
- return mapWriter.commitAllPartitions();
+ return mapWriter.commitAllPartitions().getPartitionLengths();
} else if (spills.length == 1) {
Optional<SingleSpillShuffleMapOutputWriter> maybeSingleFileWriter =
shuffleExecutorComponents.createSingleFileMapOutputWriter(shuffleId, mapId);
@@ -274,6 +275,8 @@ private long[] mergeSpills(SpillInfo[] spills) throws IOException {
// Here, we don't need to perform any metrics updates because the bytes written to this
// output file would have already been counted as shuffle bytes written.
partitionLengths = spills[0].partitionLengths;
+ logger.debug("Merge shuffle spills for mapId {} with length {}", mapId,
+ partitionLengths.length);
maybeSingleFileWriter.get().transferMapSpillFile(spills[0].file, partitionLengths);
} else {
partitionLengths = mergeSpillsUsingStandardWriter(spills);
@@ -327,7 +330,7 @@ private long[] mergeSpillsUsingStandardWriter(SpillInfo[] spills) throws IOExcep
// to be counted as shuffle write, but this will lead to double-counting of the final
// SpillInfo's bytes.
writeMetrics.decBytesWritten(spills[spills.length - 1].file.length());
- partitionLengths = mapWriter.commitAllPartitions();
+ partitionLengths = mapWriter.commitAllPartitions().getPartitionLengths();
} catch (Exception e) {
try {
mapWriter.abort(e);
@@ -360,6 +363,7 @@ private void mergeSpillsWithFileStream(
SpillInfo[] spills,
ShuffleMapOutputWriter mapWriter,
@Nullable CompressionCodec compressionCodec) throws IOException {
+ logger.debug("Merge shuffle spills with FileStream for mapId {}", mapId);
final int numPartitions = partitioner.numPartitions();
final InputStream[] spillInputStreams = new InputStream[spills.length];
@@ -369,6 +373,11 @@ private void mergeSpillsWithFileStream(
spillInputStreams[i] = new NioBufferedFileInputStream(
spills[i].file,
inputBufferSizeInBytes);
+ // Only render the partitionLengths as a string when debug logging is enabled.
+ if (logger.isDebugEnabled()) {
+ logger.debug("Partition lengths for mapId {} in Spill {}: {}", mapId, i,
+ Arrays.toString(spills[i].partitionLengths));
+ }
}
for (int partition = 0; partition < numPartitions; partition++) {
boolean copyThrewException = true;
@@ -431,6 +440,7 @@ private void mergeSpillsWithFileStream(
private void mergeSpillsWithTransferTo(
SpillInfo[] spills,
ShuffleMapOutputWriter mapWriter) throws IOException {
+ logger.debug("Merge shuffle spills with TransferTo for mapId {}", mapId);
final int numPartitions = partitioner.numPartitions();
final FileChannel[] spillInputChannels = new FileChannel[spills.length];
final long[] spillInputChannelPositions = new long[spills.length];
@@ -439,6 +449,11 @@ private void mergeSpillsWithTransferTo(
try {
for (int i = 0; i < spills.length; i++) {
spillInputChannels[i] = new FileInputStream(spills[i].file).getChannel();
+ // Only render the partitionLengths as a string when debug logging is enabled.
+ if (logger.isDebugEnabled()) {
+ logger.debug("Partition lengths for mapId {} in Spill {}: {}", mapId, i,
+ Arrays.toString(spills[i].partitionLengths));
+ }
}
for (int partition = 0; partition < numPartitions; partition++) {
boolean copyThrewException = true;
@@ -528,4 +543,9 @@ public void close() throws IOException {
channel.close();
}
}
+
+ @Override
+ public long[] getPartitionLengths() {
+ return partitionLengths;
+ }
}
diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java
index a6529fd76188a..0b286264be43d 100644
--- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java
@@ -35,6 +35,7 @@
import org.apache.spark.shuffle.api.WritableByteChannelWrapper;
import org.apache.spark.internal.config.package$;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
+import org.apache.spark.shuffle.api.metadata.MapOutputCommitMessage;
import org.apache.spark.util.Utils;
/**
@@ -97,7 +98,7 @@ public ShufflePartitionWriter getPartitionWriter(int reducePartitionId) throws I
}
@Override
- public long[] commitAllPartitions() throws IOException {
+ public MapOutputCommitMessage commitAllPartitions() throws IOException {
// Check the position after transferTo loop to see if it is in the right position and raise a
// exception if it is incorrect. The position will not be increased to the expected length
// after calling transferTo in kernel version 2.6.32. This issue is described at
@@ -112,8 +113,10 @@ public long[] commitAllPartitions() throws IOException {
}
cleanUp();
File resolvedTmp = outputTempFile != null && outputTempFile.isFile() ? outputTempFile : null;
+ log.debug("Writing shuffle index file for mapId {} with length {}", mapId,
+ partitionLengths.length);
blockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, resolvedTmp);
- return partitionLengths;
+ return MapOutputCommitMessage.of(partitionLengths);
}
@Override
@@ -210,14 +213,14 @@ public long getNumBytesWritten() {
private class PartitionWriterStream extends OutputStream {
private final int partitionId;
- private int count = 0;
+ private long count = 0;
private boolean isClosed = false;
PartitionWriterStream(int partitionId) {
this.partitionId = partitionId;
}
- public int getCount() {
+ public long getCount() {
return count;
}
diff --git a/core/src/main/java/org/apache/spark/status/api/v1/TaskStatus.java b/core/src/main/java/org/apache/spark/status/api/v1/TaskStatus.java
new file mode 100644
index 0000000000000..dec9c31321839
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/status/api/v1/TaskStatus.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.status.api.v1;
+
+import org.apache.spark.util.EnumUtil;
+
+public enum TaskStatus {
+ RUNNING,
+ KILLED,
+ FAILED,
+ SUCCESS,
+ UNKNOWN;
+
+ public static TaskStatus fromString(String str) {
+ return EnumUtil.parseIgnoreCase(TaskStatus.class, str);
+ }
+}
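
The new REST-layer enum parses task states case-insensitively via `EnumUtil.parseIgnoreCase`. The sketch below only mirrors that observable behaviour on a local enumeration; it does not use the Spark-internal `EnumUtil`, and the error message is an assumption.

```scala
// A local mirror of the TaskStatus values, with a case-insensitive parse that behaves
// roughly like EnumUtil.parseIgnoreCase: unknown strings fail with a descriptive error.
object TaskStatusSketch {
  object Status extends Enumeration {
    val RUNNING, KILLED, FAILED, SUCCESS, UNKNOWN = Value
  }

  def fromString(str: String): Status.Value =
    Status.values.find(_.toString.equalsIgnoreCase(str)).getOrElse(
      throw new IllegalArgumentException(s"Illegal task status: $str"))

  def main(args: Array[String]): Unit = {
    println(fromString("failed"))  // FAILED
    println(fromString("Success")) // SUCCESS
  }
}
```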
diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index 6e028886f2318..f474c30b8b3d8 100644
--- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -393,10 +393,12 @@ public void remove() {
}
private void handleFailedDelete() {
- // remove the spill file from disk
- File file = spillWriters.removeFirst().getFile();
- if (file != null && file.exists() && !file.delete()) {
- logger.error("Was unable to delete spill file {}", file.getAbsolutePath());
+ if (spillWriters.size() > 0) {
+ // remove the spill file from disk
+ File file = spillWriters.removeFirst().getFile();
+ if (file != null && file.exists() && !file.delete()) {
+ logger.error("Was unable to delete spill file {}", file.getAbsolutePath());
+ }
}
}
}
@@ -428,6 +430,68 @@ public MapIterator destructiveIterator() {
return new MapIterator(numValues, new Location(), true);
}
+ /**
+ * Iterator for the entries of this map. This first iterates over the key indices in
+ * `longArray` and then accesses the values in `dataPages`. NOTE: this is different from
+ * `MapIterator` in that the key index is preserved here
+ * (see `UnsafeHashedRelation` for an example of usage).
+ */
+ public final class MapIteratorWithKeyIndex implements Iterator {
+
+ /**
+ * The index in `longArray` where the key is stored.
+ */
+ private int keyIndex = 0;
+
+ private int numRecords;
+ private final Location loc;
+
+ private MapIteratorWithKeyIndex() {
+ this.numRecords = numValues;
+ this.loc = new Location();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return numRecords > 0;
+ }
+
+ @Override
+ public Location next() {
+ if (!loc.isDefined() || !loc.nextValue()) {
+ while (longArray.get(keyIndex * 2) == 0) {
+ keyIndex++;
+ }
+ loc.with(keyIndex, 0, true);
+ keyIndex++;
+ }
+ numRecords--;
+ return loc;
+ }
+ }
+
+ /**
+ * Returns an iterator for iterating over the entries of this map,
+ * by first iterating over the key index inside hash map's `longArray`.
+ *
+ * For efficiency, all calls to `next()` will return the same {@link Location} object.
+ *
+ * The returned iterator is NOT thread-safe. If the map is modified while iterating over it,
+ * the behavior of the returned iterator is undefined.
+ */
+ public MapIteratorWithKeyIndex iteratorWithKeyIndex() {
+ return new MapIteratorWithKeyIndex();
+ }
+
+ /**
+ * The maximum number of allowed key indices.
+ *
+ * Allowed key index values are in the range [0, maxNumKeysIndex - 1].
+ */
+ public int maxNumKeysIndex() {
+ return (int) (longArray.size() / 2);
+ }
+
/**
* Looks up a key, and return a {@link Location} handle that can be used to test existence
* and read/write values.
@@ -601,6 +665,14 @@ public boolean isDefined() {
return isDefined;
}
+ /**
+ * Returns the index for the key.
+ */
+ public int getKeyIndex() {
+ assert (isDefined);
+ return pos;
+ }
+
/**
* Returns the base object for key.
*/
@@ -738,12 +810,21 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff
longArray.set(pos * 2 + 1, keyHashcode);
isDefined = true;
- // We use two array entries per key, so the array size is twice the capacity.
- // We should compare the current capacity of the array, instead of its size.
- if (numKeys >= growthThreshold && longArray.size() / 2 < MAX_CAPACITY) {
- try {
- growAndRehash();
- } catch (SparkOutOfMemoryError oom) {
+ // If the map has reached its growth threshold, try to grow it.
+ if (numKeys >= growthThreshold) {
+ // We use two array entries per key, so the array size is twice the capacity.
+ // We should compare the current capacity of the array, instead of its size.
+ if (longArray.size() / 2 < MAX_CAPACITY) {
+ try {
+ growAndRehash();
+ } catch (SparkOutOfMemoryError oom) {
+ canGrowArray = false;
+ }
+ } else {
+ // The map is already at MAX_CAPACITY and cannot grow. Instead, we prevent it from
+ // accepting any more new elements to make sure we don't exceed the load factor. If we
+ // need to spill later, this allows UnsafeKVExternalSorter to reuse the array for
+ // sorting.
canGrowArray = false;
}
}
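
The new `MapIteratorWithKeyIndex` walks `longArray` slot by slot, so every returned `Location` is paired with the slot it occupies. Below is a rough, self-contained analogue of that iteration pattern, using a plain array instead of the off-heap layout; names and data are illustrative only.

```scala
// Simplified analogue of MapIteratorWithKeyIndex: iterate the slots of an open-addressing
// table in slot order, skipping empty slots, so each entry is visited together with the
// slot index ("key index") it occupies.
object KeyIndexIterationSketch {
  def main(args: Array[String]): Unit = {
    // slots(i) == None models longArray.get(i * 2) == 0, i.e. an empty slot.
    val slots: Array[Option[String]] =
      Array(None, Some("a"), None, None, Some("b"), Some("c"))

    val entriesWithKeyIndex: Iterator[(Int, String)] =
      slots.iterator.zipWithIndex.collect { case (Some(value), keyIndex) => (keyIndex, value) }

    entriesWithKeyIndex.foreach { case (keyIndex, value) =>
      println(s"keyIndex=$keyIndex value=$value")
    }
    // Key indices are bounded by the number of slots, as maxNumKeysIndex() expresses.
    println(s"maxNumKeysIndex = ${slots.length}")
  }
}
```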
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
index 55e4e609c3c7b..c38327cae8ce3 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -104,11 +104,14 @@ public static UnsafeExternalSorter createWithExistingInMemorySorter(
int initialSize,
long pageSizeBytes,
int numElementsForSpillThreshold,
- UnsafeInMemorySorter inMemorySorter) throws IOException {
+ UnsafeInMemorySorter inMemorySorter,
+ long existingMemoryConsumption) throws IOException {
UnsafeExternalSorter sorter = new UnsafeExternalSorter(taskMemoryManager, blockManager,
serializerManager, taskContext, recordComparatorSupplier, prefixComparator, initialSize,
pageSizeBytes, numElementsForSpillThreshold, inMemorySorter, false /* ignored */);
sorter.spill(Long.MAX_VALUE, sorter);
+ taskContext.taskMetrics().incMemoryBytesSpilled(existingMemoryConsumption);
+ sorter.totalSpillBytes += existingMemoryConsumption;
// The external sorter will be used to insert records, in-memory sorter is not needed.
sorter.inMemSorter = null;
return sorter;
@@ -203,6 +206,10 @@ public long spill(long size, MemoryConsumer trigger) throws IOException {
}
if (inMemSorter == null || inMemSorter.numRecords() <= 0) {
+ // There could still be some memory allocated when there are no records in the in-memory
+ // sorter. We will not spill it however, to ensure that we can always process at least one
+ // record before spilling. See the comments in `allocateMemoryForRecordIfNecessary` for why
+ // this is necessary.
return 0L;
}
@@ -224,7 +231,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException {
// Note that this is more-or-less going to be a multiple of the page size, so wasted space in
// pages will currently be counted as memory spilled even though that space isn't actually
// written to disk. This also counts the space needed to store the sorter's pointer array.
- inMemSorter.reset();
+ inMemSorter.freeMemory();
// Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the
// records. Otherwise, if the task is over allocated memory, then without freeing the memory
// pages, we might not be able to get memory for the pointer array.
@@ -325,7 +332,7 @@ public void cleanupResources() {
deleteSpillFiles();
freeMemory();
if (inMemSorter != null) {
- inMemSorter.free();
+ inMemSorter.freeMemory();
inMemSorter = null;
}
}
@@ -339,40 +346,53 @@ public void cleanupResources() {
private void growPointerArrayIfNecessary() throws IOException {
assert(inMemSorter != null);
if (!inMemSorter.hasSpaceForAnotherRecord()) {
+ if (inMemSorter.numRecords() <= 0) {
+ // Spilling was triggered just before this method was called. The pointer array was freed
+ // during the spill, so a new pointer array needs to be allocated here.
+ LongArray array = allocateArray(inMemSorter.getInitialSize());
+ inMemSorter.expandPointerArray(array);
+ return;
+ }
+
long used = inMemSorter.getMemoryUsage();
- LongArray array;
+ LongArray array = null;
try {
// could trigger spilling
array = allocateArray(used / 8 * 2);
} catch (TooLargePageException e) {
// The pointer array is too big to fit in a single page, spill.
spill();
- return;
} catch (SparkOutOfMemoryError e) {
- // should have trigger spilling
- if (!inMemSorter.hasSpaceForAnotherRecord()) {
+ if (inMemSorter.numRecords() > 0) {
logger.error("Unable to grow the pointer array");
throw e;
}
- return;
+ // The new array could not be allocated, but that is not an issue as it is no longer needed,
+ // as all records were spilled.
}
- // check if spilling is triggered or not
- if (inMemSorter.hasSpaceForAnotherRecord()) {
- freeArray(array);
- } else {
- inMemSorter.expandPointerArray(array);
+
+ if (inMemSorter.numRecords() <= 0) {
+ // Spilling was triggered while trying to allocate the new array.
+ if (array != null) {
+ // We succeeded in allocating the new array, but, since all records were spilled, a
+ // smaller array would also suffice.
+ freeArray(array);
+ }
+ // The pointer array was freed during the spill, so a new pointer array needs to be
+ // allocated here.
+ array = allocateArray(inMemSorter.getInitialSize());
}
+ inMemSorter.expandPointerArray(array);
}
}
/**
- * Allocates more memory in order to insert an additional record. This will request additional
- * memory from the memory manager and spill if the requested memory can not be obtained.
+ * Allocates an additional page in order to insert an additional record. This will request
+ * additional memory from the memory manager and spill if the requested memory can not be
+ * obtained.
*
* @param required the required space in the data page, in bytes, including space for storing
- * the record size. This must be less than or equal to the page size (records
- * that exceed the page size are handled via a different code path which uses
- * special overflow pages).
+ * the record size.
*/
private void acquireNewPageIfNecessary(int required) {
if (currentPage == null ||
@@ -384,6 +404,37 @@ private void acquireNewPageIfNecessary(int required) {
}
}
+ /**
+ * Allocates more memory in order to insert an additional record. This will request additional
+ * memory from the memory manager and spill if the requested memory can not be obtained.
+ *
+ * @param required the required space in the data page, in bytes, including space for storing
+ * the record size.
+ */
+ private void allocateMemoryForRecordIfNecessary(int required) throws IOException {
+ // Step 1:
+ // Ensure that the pointer array has space for another record. This may cause a spill.
+ growPointerArrayIfNecessary();
+ // Step 2:
+ // Ensure that the last page has space for another record. This may cause a spill.
+ acquireNewPageIfNecessary(required);
+ // Step 3:
+ // The allocation in step 2 could have caused a spill, which would have freed the pointer
+ // array allocated in step 1. Therefore we need to check again whether we have to allocate
+ // a new pointer array.
+ //
+ // If the allocation in this step causes a spill event then it will not cause the page
+ // allocated in the previous step to be freed. The function `spill` only frees memory if at
+ // least one record has been inserted in the in-memory sorter. This will not be the case if
+ // we have spilled in the previous step.
+ //
+ // If we did not spill in the previous step then `growPointerArrayIfNecessary` will be a
+ // no-op that does not allocate any memory, and therefore can't cause a spill event.
+ //
+ // Thus there is no need to call `acquireNewPageIfNecessary` again after this step.
+ growPointerArrayIfNecessary();
+ }
+
/**
* Write a record to the sorter.
*/
@@ -398,11 +449,10 @@ public void insertRecord(
spill();
}
- growPointerArrayIfNecessary();
- int uaoSize = UnsafeAlignedOffset.getUaoSize();
+ final int uaoSize = UnsafeAlignedOffset.getUaoSize();
// Need 4 or 8 bytes to store the record length.
final int required = length + uaoSize;
- acquireNewPageIfNecessary(required);
+ allocateMemoryForRecordIfNecessary(required);
final Object base = currentPage.getBaseObject();
final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor);
@@ -425,10 +475,9 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen,
Object valueBase, long valueOffset, int valueLen, long prefix, boolean prefixIsNull)
throws IOException {
- growPointerArrayIfNecessary();
- int uaoSize = UnsafeAlignedOffset.getUaoSize();
+ final int uaoSize = UnsafeAlignedOffset.getUaoSize();
final int required = keyLen + valueLen + (2 * uaoSize);
- acquireNewPageIfNecessary(required);
+ allocateMemoryForRecordIfNecessary(required);
final Object base = currentPage.getBaseObject();
final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor);
@@ -450,6 +499,7 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen,
*/
public void merge(UnsafeExternalSorter other) throws IOException {
other.spill();
+ totalSpillBytes += other.totalSpillBytes;
spillWriters.addAll(other.spillWriters);
// remove them from `spillWriters`, or the files will be deleted in `cleanupResources`.
other.spillWriters.clear();
@@ -501,10 +551,14 @@ private static void spillIterator(UnsafeSorterIterator inMemIterator,
*/
class SpillableIterator extends UnsafeSorterIterator {
private UnsafeSorterIterator upstream;
- private UnsafeSorterIterator nextUpstream = null;
private MemoryBlock lastPage = null;
private boolean loaded = false;
- private int numRecords = 0;
+ private int numRecords;
+
+ private Object currentBaseObject;
+ private long currentBaseOffset;
+ private int currentRecordLength;
+ private long currentKeyPrefix;
SpillableIterator(UnsafeSorterIterator inMemIterator) {
this.upstream = inMemIterator;
@@ -516,23 +570,32 @@ public int getNumRecords() {
return numRecords;
}
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
+
public long spill() throws IOException {
synchronized (this) {
- if (!(upstream instanceof UnsafeInMemorySorter.SortedIterator && nextUpstream == null
- && numRecords > 0)) {
+ if (inMemSorter == null) {
return 0L;
}
- UnsafeInMemorySorter.SortedIterator inMemIterator =
- ((UnsafeInMemorySorter.SortedIterator) upstream).clone();
+ long currentPageNumber = upstream.getCurrentPageNumber();
- ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics();
- // Iterate over the records that have not been returned and spill them.
- final UnsafeSorterSpillWriter spillWriter =
- new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics, numRecords);
- spillIterator(inMemIterator, spillWriter);
- spillWriters.add(spillWriter);
- nextUpstream = spillWriter.getReader(serializerManager);
+ ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics();
+ if (numRecords > 0) {
+ // Iterate over the records that have not been returned and spill them.
+ final UnsafeSorterSpillWriter spillWriter = new UnsafeSorterSpillWriter(
+ blockManager, fileBufferSizeBytes, writeMetrics, numRecords);
+ spillIterator(upstream, spillWriter);
+ spillWriters.add(spillWriter);
+ upstream = spillWriter.getReader(serializerManager);
+ } else {
+ // Nothing to spill as all records have been read already, but do not return yet, as the
+ // memory still has to be freed.
+ upstream = null;
+ }
long released = 0L;
synchronized (UnsafeExternalSorter.this) {
@@ -540,8 +603,7 @@ public long spill() throws IOException {
// is accessing the current record. We free this page in that caller's next loadNext()
// call.
for (MemoryBlock page : allocatedPages) {
- if (!loaded || page.pageNumber !=
- ((UnsafeInMemorySorter.SortedIterator)upstream).getCurrentPageNumber()) {
+ if (!loaded || page.pageNumber != currentPageNumber) {
released += page.size();
freePage(page);
} else {
@@ -549,13 +611,18 @@ public long spill() throws IOException {
}
}
allocatedPages.clear();
+ if (lastPage != null) {
+ // Add the last page back to the list of allocated pages to make sure it gets freed in
+ // case loadNext() never gets called again.
+ allocatedPages.add(lastPage);
+ }
}
// in-memory sorter will not be used after spilling
assert(inMemSorter != null);
released += inMemSorter.getMemoryUsage();
totalSortTimeNanos += inMemSorter.getSortTimeNanos();
- inMemSorter.free();
+ inMemSorter.freeMemory();
inMemSorter = null;
taskContext.taskMetrics().incMemoryBytesSpilled(released);
taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten());
@@ -571,26 +638,32 @@ public boolean hasNext() {
@Override
public void loadNext() throws IOException {
+ assert upstream != null;
MemoryBlock pageToFree = null;
try {
synchronized (this) {
loaded = true;
- if (nextUpstream != null) {
- // Just consumed the last record from in memory iterator
- if(lastPage != null) {
- // Do not free the page here, while we are locking `SpillableIterator`. The `freePage`
- // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in
- // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and
- // `SpillableIterator` in sequence, which may happen in
- // `TaskMemoryManager.acquireExecutionMemory`.
- pageToFree = lastPage;
- lastPage = null;
- }
- upstream = nextUpstream;
- nextUpstream = null;
+ // Just consumed the last record from the in-memory iterator.
+ if (lastPage != null) {
+ // Do not free the page here, while we are locking `SpillableIterator`. The `freePage`
+ // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in
+ // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and
+ // `SpillableIterator` in sequence, which may happen in
+ // `TaskMemoryManager.acquireExecutionMemory`.
+ pageToFree = lastPage;
+ allocatedPages.clear();
+ lastPage = null;
}
numRecords--;
upstream.loadNext();
+
+ // Keep track of the current base object, base offset, record length, and key prefix,
+ // so that the current record can still be read in case a spill is triggered and we
+ // switch to the spill writer's iterator.
+ currentBaseObject = upstream.getBaseObject();
+ currentBaseOffset = upstream.getBaseOffset();
+ currentRecordLength = upstream.getRecordLength();
+ currentKeyPrefix = upstream.getKeyPrefix();
}
} finally {
if (pageToFree != null) {
@@ -601,22 +674,22 @@ public void loadNext() throws IOException {
@Override
public Object getBaseObject() {
- return upstream.getBaseObject();
+ return currentBaseObject;
}
@Override
public long getBaseOffset() {
- return upstream.getBaseOffset();
+ return currentBaseOffset;
}
@Override
public int getRecordLength() {
- return upstream.getRecordLength();
+ return currentRecordLength;
}
@Override
public long getKeyPrefix() {
- return upstream.getKeyPrefix();
+ return currentKeyPrefix;
}
}
@@ -646,7 +719,7 @@ public UnsafeSorterIterator getIterator(int startIndex) throws IOException {
}
i += spillWriter.recordsSpilled();
}
- if (inMemSorter != null) {
+ if (inMemSorter != null && inMemSorter.numRecords() > 0) {
UnsafeSorterIterator iter = inMemSorter.getSortedIterator();
moveOver(iter, startIndex - i);
queue.add(iter);
@@ -693,6 +766,11 @@ public int getNumRecords() {
return numRecords;
}
+ @Override
+ public long getCurrentPageNumber() {
+ return current.getCurrentPageNumber();
+ }
+
@Override
public boolean hasNext() {
while (!current.hasNext() && !iterators.isEmpty()) {
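
The three-step comment in `allocateMemoryForRecordIfNecessary` is the heart of this change: acquiring a data page can spill and free the pointer array that was just grown, so the array has to be re-checked afterwards. The following schematic sketch illustrates that ordering with hypothetical stand-in types, not the Spark classes; a real spill also writes records to disk, which is omitted here.

```scala
// Schematic sketch of the grow / acquire / re-check ordering.
object RecordAllocationSketch {
  final class Sorter {
    private var pointerArray: Option[Array[Long]] = None
    private var currentPage: Option[Array[Byte]] = None

    private def spill(): Unit = pointerArray = None // spilling frees the pointer array

    def growPointerArrayIfNecessary(): Unit =
      if (pointerArray.isEmpty) pointerArray = Some(new Array[Long](16))

    def acquireNewPageIfNecessary(required: Int): Unit =
      if (currentPage.isEmpty) {
        spill() // acquiring a page may trigger a spill under memory pressure
        currentPage = Some(new Array[Byte](required))
      }

    def allocateMemoryForRecordIfNecessary(required: Int): Unit = {
      growPointerArrayIfNecessary()       // step 1: may spill
      acquireNewPageIfNecessary(required) // step 2: may spill and free the pointer array
      growPointerArrayIfNecessary()       // step 3: re-check after a possible spill
      assert(pointerArray.isDefined && currentPage.isDefined)
    }
  }

  def main(args: Array[String]): Unit = {
    new Sorter().allocateMemoryForRecordIfNecessary(64)
    println("pointer array and data page are both available for the next record")
  }
}
```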
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
index 660eb790a550b..765ee035855d6 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
@@ -20,7 +20,7 @@
import java.util.Comparator;
import java.util.LinkedList;
-import org.apache.avro.reflect.Nullable;
+import javax.annotation.Nullable;
import org.apache.spark.TaskContext;
import org.apache.spark.memory.MemoryConsumer;
@@ -159,32 +159,26 @@ private int getUsableCapacity() {
return (int) (array.size() / (radixSortSupport != null ? 2 : 1.5));
}
+ public long getInitialSize() {
+ return initialSize;
+ }
+
/**
* Free the memory used by pointer array.
*/
- public void free() {
+ public void freeMemory() {
if (consumer != null) {
if (array != null) {
consumer.freeArray(array);
}
- array = null;
- }
- }
- public void reset() {
- if (consumer != null) {
- consumer.freeArray(array);
- // the call to consumer.allocateArray may trigger a spill which in turn access this instance
- // and eventually re-enter this method and try to free the array again. by setting the array
- // to null and its length to 0 we effectively make the spill code-path a no-op. setting the
- // array to null also indicates that it has already been de-allocated which prevents a double
- // de-allocation in free().
+ // Set the array to null instead of allocating a new array. Allocating an array could have
+ // triggered another spill and this method is already called from UnsafeExternalSorter when
+ // spilling. Attempting to allocate while spilling is dangerous, as we could be holding onto
+ // a large partially complete allocation, which may prevent other memory from being allocated.
+ // Instead we will allocate the new array when it is necessary.
array = null;
usableCapacity = 0;
- pos = 0;
- nullBoundaryPos = 0;
- array = consumer.allocateArray(initialSize);
- usableCapacity = getUsableCapacity();
}
pos = 0;
nullBoundaryPos = 0;
@@ -217,18 +211,20 @@ public boolean hasSpaceForAnotherRecord() {
}
public void expandPointerArray(LongArray newArray) {
- if (newArray.size() < array.size()) {
- // checkstyle.off: RegexpSinglelineJava
- throw new SparkOutOfMemoryError("Not enough memory to grow pointer array");
- // checkstyle.on: RegexpSinglelineJava
+ if (array != null) {
+ if (newArray.size() < array.size()) {
+ // checkstyle.off: RegexpSinglelineJava
+ throw new SparkOutOfMemoryError("Not enough memory to grow pointer array");
+ // checkstyle.on: RegexpSinglelineJava
+ }
+ Platform.copyMemory(
+ array.getBaseObject(),
+ array.getBaseOffset(),
+ newArray.getBaseObject(),
+ newArray.getBaseOffset(),
+ pos * 8L);
+ consumer.freeArray(array);
}
- Platform.copyMemory(
- array.getBaseObject(),
- array.getBaseOffset(),
- newArray.getBaseObject(),
- newArray.getBaseOffset(),
- pos * 8L);
- consumer.freeArray(array);
array = newArray;
usableCapacity = getUsableCapacity();
}
@@ -330,6 +326,7 @@ public void loadNext() {
@Override
public long getBaseOffset() { return baseOffset; }
+ @Override
public long getCurrentPageNumber() {
return currentPageNumber;
}
@@ -346,6 +343,11 @@ public long getCurrentPageNumber() {
* {@code next()} will return the same mutable object.
*/
public UnsafeSorterIterator getSortedIterator() {
+ if (numRecords() == 0) {
+ // `array` might be null, so make sure that it is not accessed by returning early.
+ return new SortedIterator(0, 0);
+ }
+
int offset = 0;
long start = System.nanoTime();
if (sortComparator != null) {
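
With `reset()` gone, the in-memory sorter frees its pointer array during a spill and leaves reallocation to whoever needs it next, which is why `expandPointerArray` now has to tolerate a null `array`. A minimal sketch of that "free now, reallocate lazily" pattern follows, with simplified types rather than the Spark class.

```scala
// The backing array is dropped while spilling and is only copied from, when a replacement
// arrives, if it still exists.
object LazyPointerArraySketch {
  final class InMemorySorter(initialSize: Int) {
    private var array: Array[Long] = new Array[Long](initialSize)
    private var pos: Int = 0

    // Called while spilling: do not allocate here, another allocation could spill again.
    def freeMemory(): Unit = { array = null; pos = 0 }

    def expandPointerArray(newArray: Array[Long]): Unit = {
      if (array != null) {
        require(newArray.length >= array.length, "Not enough memory to grow pointer array")
        System.arraycopy(array, 0, newArray, 0, pos)
      }
      array = newArray
    }
  }

  def main(args: Array[String]): Unit = {
    val sorter = new InMemorySorter(8)
    sorter.freeMemory()                           // e.g. triggered by a spill
    sorter.expandPointerArray(new Array[Long](8)) // safe even though the old array is gone
    println("pointer array restored after the spill")
  }
}
```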
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java
index 1b3167fcc250c..d9f22311d07c2 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java
@@ -34,4 +34,6 @@ public abstract class UnsafeSorterIterator {
public abstract long getKeyPrefix();
public abstract int getNumRecords();
+
+ public abstract long getCurrentPageNumber();
}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java
index ab800288dcb43..f8603c5799e9b 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java
@@ -70,6 +70,11 @@ public int getNumRecords() {
return numRecords;
}
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
+
@Override
public boolean hasNext() {
return !priorityQueue.isEmpty() || (spillReader != null && spillReader.hasNext());
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
index a524c4790407d..db79efd008530 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
@@ -89,6 +89,11 @@ public int getNumRecords() {
return numRecords;
}
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
+
@Override
public boolean hasNext() {
return (numRecordsRemaining > 0);
diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
index 0b26bfc5b2d82..37d56a06ded7f 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
@@ -56,8 +56,8 @@
Summary
- Blacklisted
+ title="Number of executors excluded by the scheduler due to task failures.">
+ Excluded
@@ -86,9 +86,26 @@
Executors
Off Heap Storage Memory
+
+
+ Peak JVM Memory OnHeap / OffHeap
+
+
+ Peak Execution Memory OnHeap / OffHeap
+
+
+ Peak Storage Memory OnHeap / OffHeap
+
+
+ Peak Pool Memory Direct / Mapped
Disk Used
Cores
-
Resources
+
Resources
+
Resource Profile Id
Active Tasks
Failed Tasks
Complete Tasks
@@ -111,10 +128,28 @@
Executors
Shuffle Write
Logs
Thread Dump
+
Exec Loss Reason
+
+
+
Miscellaneous Process
+
+
+
+
Process ID
+
Address
+
Status
+
Cores
+
Logs
+
+
+
+
+
+
diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
index ec57797ba0909..ab412a8589a28 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
@@ -15,607 +15,759 @@
* limitations under the License.
*/
+/* global $, Mustache, createRESTEndPointForExecutorsPage, createRESTEndPointForMiscellaneousProcess, */
+/* global createTemplateURI, formatBytes, formatDuration, formatLogsCells, getStandAloneAppId, */
+/* global jQuery, setDataTableDefaults */
+
var threadDumpEnabled = false;
+/* eslint-disable no-unused-vars */
function setThreadDumpEnabled(val) {
- threadDumpEnabled = val;
+ threadDumpEnabled = val;
}
+/* eslint-enable no-unused-vars */
function getThreadDumpEnabled() {
- return threadDumpEnabled;
+ return threadDumpEnabled;
+}
+
+function formatLossReason(removeReason) {
+ if (removeReason) {
+ return removeReason
+ } else {
+ return ""
+ }
}
function formatStatus(status, type, row) {
- if (row.isBlacklisted) {
- return "Blacklisted";
- }
+ if (row.isExcluded) {
+ return "Excluded";
+ }
- if (status) {
- if (row.blacklistedInStages.length == 0) {
- return "Active"
- }
- return "Active (Blacklisted in Stages: [" + row.blacklistedInStages.join(", ") + "])";
+ if (status) {
+ if (row.excludedInStages.length == 0) {
+ return "Active"
}
- return "Dead"
+ return "Active (Excluded in Stages: [" + row.excludedInStages.join(", ") + "])";
+ }
+ return "Dead"
+}
+
+function formatProcessStatus(activeStatus) {
+ if (activeStatus) {
+ return "Active"
+ }
+ return "Dead"
}
function formatResourceCells(resources) {
- var result = ""
- var count = 0
- $.each(resources, function (name, resInfo) {
- if (count > 0) {
- result += ", "
- }
- result += name + ': [' + resInfo.addresses.join(", ") + ']'
- count += 1
- });
- return result
+ var result = ""
+ var count = 0
+ $.each(resources, function (name, resInfo) {
+ if (count > 0) {
+ result += ", "
+ }
+ result += name + ': [' + resInfo.addresses.join(", ") + ']';
+ count += 1
+ });
+ return result
}
jQuery.extend(jQuery.fn.dataTableExt.oSort, {
- "title-numeric-pre": function (a) {
- var x = a.match(/title="*(-?[0-9\.]+)/)[1];
- return parseFloat(x);
- },
-
- "title-numeric-asc": function (a, b) {
- return ((a < b) ? -1 : ((a > b) ? 1 : 0));
- },
-
- "title-numeric-desc": function (a, b) {
- return ((a < b) ? 1 : ((a > b) ? -1 : 0));
- }
+ "title-numeric-pre": function (a) {
+ var x = a.match(/title="*(-?[0-9.]+)/)[1];
+ return parseFloat(x);
+ },
+
+ "title-numeric-asc": function (a, b) {
+ return ((a < b) ? -1 : ((a > b) ? 1 : 0));
+ },
+
+ "title-numeric-desc": function (a, b) {
+ return ((a < b) ? 1 : ((a > b) ? -1 : 0));
+ }
});
$(document).ajaxStop($.unblockUI);
$(document).ajaxStart(function () {
- $.blockUI({message: '
{headerRow}
@@ -459,13 +461,14 @@ private[spark] object UIUtils extends Logging {
skipped: Int,
reasonToNumKilled: Map[String, Int],
total: Int): Seq[Node] = {
- val ratio = if (total == 0) 100.0 else (completed.toDouble/total)*100
+ val ratio = if (total == 0) 100.0 else (completed.toDouble / total) * 100
val completeWidth = "width: %s%%".format(ratio)
// started + completed can be > total when there are speculative tasks
val boundedStarted = math.min(started, total - completed)
- val startWidth = "width: %s%%".format((boundedStarted.toDouble/total)*100)
+ val startRatio = if (total == 0) 0.0 else (boundedStarted.toDouble / total) * 100
+ val startWidth = "width: %s%%".format(startRatio)
-
- }
- }
- }
- {headerRow}
+ headerRow(blockHeaders, desc, pageSize, sortColumn, basePath, rddTag, "block")
}
override def row(block: BlockTableRowData): Seq[Node] = {
diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
index d5b3ce36e742a..1453840b834f2 100644
--- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
+++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
@@ -19,7 +19,7 @@ package org.apache.spark.util
import java.{lang => jl}
import java.io.ObjectInputStream
-import java.util.{ArrayList, Collections}
+import java.util.ArrayList
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicLong
@@ -449,39 +449,46 @@ class DoubleAccumulator extends AccumulatorV2[jl.Double, jl.Double] {
* @since 2.0.0
*/
class CollectionAccumulator[T] extends AccumulatorV2[T, java.util.List[T]] {
- private val _list: java.util.List[T] = Collections.synchronizedList(new ArrayList[T]())
+ private var _list: java.util.List[T] = _
+
+ private def getOrCreate = {
+ _list = Option(_list).getOrElse(new java.util.ArrayList[T]())
+ _list
+ }
/**
* Returns false if this accumulator instance has any values in it.
*/
- override def isZero: Boolean = _list.isEmpty
+ override def isZero: Boolean = this.synchronized(getOrCreate.isEmpty)
override def copyAndReset(): CollectionAccumulator[T] = new CollectionAccumulator
override def copy(): CollectionAccumulator[T] = {
val newAcc = new CollectionAccumulator[T]
- _list.synchronized {
- newAcc._list.addAll(_list)
+ this.synchronized {
+ newAcc.getOrCreate.addAll(getOrCreate)
}
newAcc
}
- override def reset(): Unit = _list.clear()
+ override def reset(): Unit = this.synchronized {
+ _list = null
+ }
- override def add(v: T): Unit = _list.add(v)
+ override def add(v: T): Unit = this.synchronized(getOrCreate.add(v))
override def merge(other: AccumulatorV2[T, java.util.List[T]]): Unit = other match {
- case o: CollectionAccumulator[T] => _list.addAll(o.value)
+ case o: CollectionAccumulator[T] => this.synchronized(getOrCreate.addAll(o.value))
case _ => throw new UnsupportedOperationException(
s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
}
- override def value: java.util.List[T] = _list.synchronized {
- java.util.Collections.unmodifiableList(new ArrayList[T](_list))
+ override def value: java.util.List[T] = this.synchronized {
+ java.util.Collections.unmodifiableList(new ArrayList[T](getOrCreate))
}
- private[spark] def setValue(newValue: java.util.List[T]): Unit = {
- _list.clear()
- _list.addAll(newValue)
+ private[spark] def setValue(newValue: java.util.List[T]): Unit = this.synchronized {
+ _list = null
+ getOrCreate.addAll(newValue)
}
}
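
The accumulator's public behaviour is unchanged by the lazily created list; a typical usage sketch (local-mode app, names and data are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object CollectionAccumulatorExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("acc-demo").setMaster("local[2]"))

    // Driver side: create and name the accumulator.
    val badRecords = sc.collectionAccumulator[String]("badRecords")

    // Executor side: tasks only add(); merging back into the driver copy is handled by Spark.
    sc.parallelize(Seq("1", "x", "3")).foreach { s =>
      if (!s.forall(_.isDigit)) badRecords.add(s)
    }

    // Driver side: value returns an unmodifiable snapshot of the merged list.
    println(badRecords.value) // e.g. [x]
    sc.stop()
  }
}
```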
diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
index 6ffd6605f75b8..7e2b9c72ad91b 100644
--- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
+++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
@@ -285,7 +285,7 @@ private[spark] object ClosureCleaner extends Logging {
logDebug(s" + outermost object is a closure, so we clone it: ${outermostClass}")
} else if (outermostClass.getName.startsWith("$line")) {
// SPARK-14558: if the outermost object is a REPL line object, we should clone
- // and clean it as it may carray a lot of unnecessary information,
+ // and clean it as it may carry a lot of unnecessary information,
// e.g. hadoop conf, spark conf, etc.
logDebug(s" + outermost object is a REPL line object, so we clone it:" +
s" ${outermostClass}")
diff --git a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
new file mode 100644
index 0000000000000..f7135edd2129d
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.io.File
+import java.net.URI
+
+import org.apache.commons.lang3.StringUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
+import org.apache.spark.internal.Logging
+
+case class IvyProperties(
+ packagesExclusions: String,
+ packages: String,
+ repositories: String,
+ ivyRepoPath: String,
+ ivySettingsPath: String)
+
+private[spark] object DependencyUtils extends Logging {
+
+ def getIvyProperties(): IvyProperties = {
+ val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) = Seq(
+ "spark.jars.excludes",
+ "spark.jars.packages",
+ "spark.jars.repositories",
+ "spark.jars.ivy",
+ "spark.jars.ivySettings"
+ ).map(sys.props.get(_).orNull)
+ IvyProperties(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath)
+ }
+
+ private def isInvalidQueryString(tokens: Array[String]): Boolean = {
+ tokens.length != 2 || StringUtils.isBlank(tokens(0)) || StringUtils.isBlank(tokens(1))
+ }
+
+ /**
+ * Parse the `transitive` and `exclude` parameter values from the URI query string.
+ * Other invalid parameters will be ignored.
+ *
+ * @param uri The Ivy URI to be downloaded.
+ * @return Tuple value of parameter `transitive` and `exclude` value.
+ *
+ * 1. transitive: whether to download the dependency jars of the Ivy URI; the default value
+ * is true and the parameter value is case-insensitive. This mimics Hive's behaviour for
+ * parsing the transitive parameter. Invalid values will be treated as false.
+ * Example: Input: exclude=org.mortbay.jetty:jetty&transitive=true
+ * Output: true
+ *
+ * 2. exclude: comma separated exclusions to apply when resolving transitive dependencies,
+ * consisting of `group:module` pairs separated by commas.
+ * Example: Input: exclude=org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http
+ * Output: [org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http]
+ */
+ private def parseQueryParams(uri: URI): (Boolean, String) = {
+ val uriQuery = uri.getQuery
+ if (uriQuery == null) {
+ (true, "")
+ } else {
+ val mapTokens = uriQuery.split("&").map(_.split("="))
+ if (mapTokens.exists(isInvalidQueryString)) {
+ throw new IllegalArgumentException(
+ s"Invalid query string in Ivy URI ${uri.toString}: $uriQuery")
+ }
+ val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1)
+
+ // Parse transitive parameters (e.g., transitive=true) in an Ivy URI, default value is true
+ val transitiveParams = groupedParams.get("transitive")
+ if (transitiveParams.map(_.size).getOrElse(0) > 1) {
+ logWarning("It's best to specify `transitive` parameter in ivy URI query only once." +
+ " If there are multiple `transitive` parameter, we will select the last one")
+ }
+ val transitive =
+ transitiveParams.flatMap(_.takeRight(1).map(_._2.equalsIgnoreCase("true")).headOption)
+ .getOrElse(true)
+
+ // Parse an excluded list (e.g., exclude=org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http)
+ // in an Ivy URI. When downloading the Ivy URI jar, Spark won't download transitive jars
+ // that appear in the excluded list.
+ val exclusionList = groupedParams.get("exclude").map { params =>
+ params.map(_._2).flatMap { excludeString =>
+ val excludes = excludeString.split(",")
+ if (excludes.map(_.split(":")).exists(isInvalidQueryString)) {
+ throw new IllegalArgumentException(
+ s"Invalid exclude string in Ivy URI ${uri.toString}:" +
+ " expected 'org:module,org:module,..', found " + excludeString)
+ }
+ excludes
+ }.mkString(",")
+ }.getOrElse("")
+
+ val validParams = Set("transitive", "exclude")
+ val invalidParams = groupedParams.keys.filterNot(validParams.contains).toSeq
+ if (invalidParams.nonEmpty) {
+ logWarning(s"Invalid parameters `${invalidParams.sorted.mkString(",")}` found " +
+ s"in Ivy URI query `$uriQuery`.")
+ }
+
+ (transitive, exclusionList)
+ }
+ }
+
+ /**
+ * Download Ivy URI's dependency jars.
+ *
+ * @param uri The Ivy URI to be downloaded. The URI format should be:
+ * `ivy://group:module:version[?query]`
+ * Ivy URI query part format should be:
+ * `parameter=value&parameter=value...`
+ * Note that currently the Ivy URI query part supports two parameters:
+ * 1. transitive: whether to download dependent jars related to your Ivy URI.
+ * `transitive=false` or `transitive=true`; if not set, the default value is true.
+ * 2. exclude: exclusion list applied when downloading the Ivy URI jar and its dependency jars.
+ * The `exclude` parameter content is a ',' separated `group:module` pair string:
+ * `exclude=group:module,group:module...`
+ * @return List of jars downloaded.
+ */
+ def resolveMavenDependencies(uri: URI): Seq[String] = {
+ val ivyProperties = DependencyUtils.getIvyProperties()
+ val authority = uri.getAuthority
+ if (authority == null) {
+ throw new IllegalArgumentException(
+ s"Invalid Ivy URI authority in uri ${uri.toString}:" +
+ " Expected 'org:module:version', found null.")
+ }
+ if (authority.split(":").length != 3) {
+ throw new IllegalArgumentException(
+ s"Invalid Ivy URI authority in uri ${uri.toString}:" +
+ s" Expected 'org:module:version', found $authority.")
+ }
+
+ val (transitive, exclusionList) = parseQueryParams(uri)
+
+ resolveMavenDependencies(
+ transitive,
+ exclusionList,
+ authority,
+ ivyProperties.repositories,
+ ivyProperties.ivyRepoPath,
+ Option(ivyProperties.ivySettingsPath)
+ )
+ }
+
+ def resolveMavenDependencies(
+ packagesTransitive: Boolean,
+ packagesExclusions: String,
+ packages: String,
+ repositories: String,
+ ivyRepoPath: String,
+ ivySettingsPath: Option[String]): Seq[String] = {
+ val exclusions: Seq[String] =
+ if (!StringUtils.isBlank(packagesExclusions)) {
+ packagesExclusions.split(",")
+ } else {
+ Nil
+ }
+ // Create the IvySettings, either load from file or build defaults
+ val ivySettings = ivySettingsPath match {
+ case Some(path) =>
+ SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath))
+
+ case None =>
+ SparkSubmitUtils.buildIvySettings(Option(repositories), Option(ivyRepoPath))
+ }
+
+ SparkSubmitUtils.resolveMavenCoordinates(packages, ivySettings,
+ transitive = packagesTransitive, exclusions = exclusions)
+ }
+
+ def resolveAndDownloadJars(
+ jars: String,
+ userJar: String,
+ sparkConf: SparkConf,
+ hadoopConf: Configuration): String = {
+ val targetDir = Utils.createTempDir()
+ val userJarName = userJar.split(File.separatorChar).last
+ Option(jars)
+ .map {
+ resolveGlobPaths(_, hadoopConf)
+ .split(",")
+ .filterNot(_.contains(userJarName))
+ .mkString(",")
+ }
+ .filterNot(_ == "")
+ .map(downloadFileList(_, targetDir, sparkConf, hadoopConf))
+ .orNull
+ }
+
+ def addJarsToClassPath(jars: String, loader: MutableURLClassLoader): Unit = {
+ if (jars != null) {
+ for (jar <- jars.split(",")) {
+ addJarToClasspath(jar, loader)
+ }
+ }
+ }
+
+ /**
+ * Download a list of remote files to temp local files. If the file is local, the original file
+ * will be returned.
+ *
+ * @param fileList A comma separated file list.
+ * @param targetDir A temporary directory into which the files are downloaded.
+ * @param sparkConf Spark configuration.
+ * @param hadoopConf Hadoop configuration.
+ * @return A comma separated local files list.
+ */
+ def downloadFileList(
+ fileList: String,
+ targetDir: File,
+ sparkConf: SparkConf,
+ hadoopConf: Configuration): String = {
+ require(fileList != null, "fileList cannot be null.")
+ Utils.stringToSeq(fileList)
+ .map(downloadFile(_, targetDir, sparkConf, hadoopConf))
+ .mkString(",")
+ }
+
+ /**
+ * Download a file from the remote to a local temporary directory. If the input path points to
+ * a local path, returns it with no operation.
+ *
+ * @param path A file path from where the files will be downloaded.
+ * @param targetDir A temporary directory into which the file is downloaded.
+ * @param sparkConf Spark configuration.
+ * @param hadoopConf Hadoop configuration.
+ * @return Path to the local file.
+ */
+ def downloadFile(
+ path: String,
+ targetDir: File,
+ sparkConf: SparkConf,
+ hadoopConf: Configuration): String = {
+ require(path != null, "path cannot be null.")
+ val uri = Utils.resolveURI(path)
+
+ uri.getScheme match {
+ case "file" | "local" => path
+ case "http" | "https" | "ftp" if Utils.isTesting =>
+ // This is only used for SparkSubmitSuite unit test. Instead of downloading file remotely,
+ // return a dummy local path instead.
+ val file = new File(uri.getPath)
+ new File(targetDir, file.getName).toURI.toString
+ case _ =>
+ val fname = new Path(uri).getName()
+ val localFile = Utils.doFetchFile(uri.toString(), targetDir, fname, sparkConf, hadoopConf)
+ localFile.toURI().toString()
+ }
+ }
+
+ def resolveGlobPaths(paths: String, hadoopConf: Configuration): String = {
+ require(paths != null, "paths cannot be null.")
+ Utils.stringToSeq(paths).flatMap { path =>
+ val (base, fragment) = splitOnFragment(path)
+ (resolveGlobPath(base, hadoopConf), fragment) match {
+ case (resolved, Some(_)) if resolved.length > 1 => throw new SparkException(
+ s"${base.toString} resolves ambiguously to multiple files: ${resolved.mkString(",")}")
+ case (resolved, Some(namedAs)) => resolved.map(_ + "#" + namedAs)
+ case (resolved, _) => resolved
+ }
+ }.mkString(",")
+ }
+
+ def addJarToClasspath(localJar: String, loader: MutableURLClassLoader): Unit = {
+ val uri = Utils.resolveURI(localJar)
+ uri.getScheme match {
+ case "file" | "local" =>
+ val file = new File(uri.getPath)
+ if (file.exists()) {
+ loader.addURL(file.toURI.toURL)
+ } else {
+ logWarning(s"Local jar $file does not exist, skipping.")
+ }
+ case _ =>
+ logWarning(s"Skip remote jar $uri.")
+ }
+ }
+
+ /**
+ * Merge a sequence of comma-separated file lists, some of which may be null to indicate
+ * no files, into a single comma-separated string.
+ */
+ def mergeFileLists(lists: String*): String = {
+ val merged = lists.filterNot(StringUtils.isBlank)
+ .flatMap(Utils.stringToSeq)
+ if (merged.nonEmpty) merged.mkString(",") else null
+ }
+
+ private def splitOnFragment(path: String): (URI, Option[String]) = {
+ val uri = Utils.resolveURI(path)
+ val withoutFragment = new URI(uri.getScheme, uri.getSchemeSpecificPart, null)
+ (withoutFragment, Option(uri.getFragment))
+ }
+
+ private def resolveGlobPath(uri: URI, hadoopConf: Configuration): Array[String] = {
+ uri.getScheme match {
+ case "local" | "http" | "https" | "ftp" => Array(uri.toString)
+ case _ =>
+ val fs = FileSystem.get(uri, hadoopConf)
+ Option(fs.globStatus(new Path(uri))).map { status =>
+ status.filter(_.isFile).map(_.getPath.toUri.toString)
+ }.getOrElse(Array(uri.toString))
+ }
+ }
+
+}
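
To see the query-string rules in one place, here is a condensed, self-contained sketch that mirrors what `parseQueryParams` does with `transitive` and `exclude`. It does not call the Spark-internal method, omits the validation and warnings, and the example coordinates are illustrative only.

```scala
// Mirror of the ivy:// query-parameter rules: last `transitive` wins and anything other
// than "true" (case-insensitive) means false; `exclude` values are comma-split pairs.
import java.net.URI

object IvyUriQuerySketch {
  def parse(uri: URI): (Boolean, Seq[String]) = {
    val query = Option(uri.getQuery).getOrElse("")
    val params = query.split("&").filter(_.nonEmpty).map(_.split("=", 2)).collect {
      case Array(k, v) => (k, v)
    }
    val transitive = params.filter(_._1 == "transitive").lastOption
      .forall(_._2.equalsIgnoreCase("true"))
    val excludes = params.filter(_._1 == "exclude").flatMap(_._2.split(",")).toSeq
    (transitive, excludes)
  }

  def main(args: Array[String]): Unit = {
    val uri = new URI("ivy://org.apache.logging.log4j:log4j-core:2.17.1" +
      "?exclude=org.slf4j:slf4j-api&transitive=true")
    val (transitive, excludes) = parse(uri)
    println(s"transitive=$transitive excludes=${excludes.mkString(",")}")
  }
}
```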
diff --git a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala
new file mode 100644
index 0000000000000..60a73adc8582e
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala
@@ -0,0 +1,370 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.io.FileNotFoundException
+
+import scala.collection.mutable
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
+import org.apache.hadoop.fs.viewfs.ViewFileSystem
+import org.apache.hadoop.hdfs.DistributedFileSystem
+
+import org.apache.spark._
+import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
+
+/**
+ * Utility functions to simplify and speed-up file listing.
+ */
+private[spark] object HadoopFSUtils extends Logging {
+ /**
+ * Lists a collection of paths recursively. Picks the listing strategy adaptively depending
+ * on the number of paths to list.
+ *
+ * This may only be called on the driver.
+ *
+ * @param sc Spark context used to run parallel listing.
+ * @param paths Input paths to list
+ * @param hadoopConf Hadoop configuration
+ * @param filter Path filter used to exclude leaf files from result
+ * @param ignoreMissingFiles Ignore missing files that happen during recursive listing
+ * (e.g., due to race conditions)
+ * @param ignoreLocality Whether to fetch data locality info when listing leaf files. If false,
+ * this will return `FileStatus` without `BlockLocation` info.
+ * @param parallelismThreshold The threshold to enable parallelism. If the number of input paths
+ * is smaller than this value, this will fallback to use
+ * sequential listing.
+ * @param parallelismMax The maximum parallelism for listing. If the number of input paths is
+ * larger than this value, parallelism will be throttled to this value
+ * to avoid generating too many tasks.
+ * @return for each input path, the set of discovered files for the path
+ */
+ def parallelListLeafFiles(
+ sc: SparkContext,
+ paths: Seq[Path],
+ hadoopConf: Configuration,
+ filter: PathFilter,
+ ignoreMissingFiles: Boolean,
+ ignoreLocality: Boolean,
+ parallelismThreshold: Int,
+ parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
+ parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = true,
+ ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax)
+ }
+
+ private def parallelListLeafFilesInternal(
+ sc: SparkContext,
+ paths: Seq[Path],
+ hadoopConf: Configuration,
+ filter: PathFilter,
+ isRootLevel: Boolean,
+ ignoreMissingFiles: Boolean,
+ ignoreLocality: Boolean,
+ parallelismThreshold: Int,
+ parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
+
+ // Short-circuits parallel listing when serial listing is likely to be faster.
+ if (paths.size <= parallelismThreshold) {
+ return paths.map { path =>
+ val leafFiles = listLeafFiles(
+ path,
+ hadoopConf,
+ filter,
+ Some(sc),
+ ignoreMissingFiles = ignoreMissingFiles,
+ ignoreLocality = ignoreLocality,
+ isRootPath = isRootLevel,
+ parallelismThreshold = parallelismThreshold,
+ parallelismMax = parallelismMax)
+ (path, leafFiles)
+ }
+ }
+
+ logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." +
+ s" The first several paths are: ${paths.take(10).mkString(", ")}.")
+ HiveCatalogMetrics.incrementParallelListingJobCount(1)
+
+ val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+ val serializedPaths = paths.map(_.toString)
+
+ // Set the parallelism to prevent the following file listing from generating many tasks
+ // in case of a large #defaultParallelism.
+ val numParallelism = Math.min(paths.size, parallelismMax)
+
+ val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION)
+ val statusMap = try {
+ val description = paths.size match {
+ case 0 =>
+ "Listing leaf files and directories 0 paths"
+ case 1 =>
+ s"Listing leaf files and directories for 1 path: ${paths(0)}"
+ case s =>
+ s"Listing leaf files and directories for $s paths: ${paths(0)}, ..."
+ }
+ sc.setJobDescription(description)
+ sc
+ .parallelize(serializedPaths, numParallelism)
+ .mapPartitions { pathStrings =>
+ val hadoopConf = serializableConfiguration.value
+ pathStrings.map(new Path(_)).toSeq.map { path =>
+ val leafFiles = listLeafFiles(
+ path = path,
+ hadoopConf = hadoopConf,
+ filter = filter,
+ contextOpt = None, // Can't execute parallel scans on workers
+ ignoreMissingFiles = ignoreMissingFiles,
+ ignoreLocality = ignoreLocality,
+ isRootPath = isRootLevel,
+ parallelismThreshold = Int.MaxValue,
+ parallelismMax = 0)
+ (path, leafFiles)
+ }.iterator
+ }.map { case (path, statuses) =>
+ val serializableStatuses = statuses.map { status =>
+ // Turn FileStatus into SerializableFileStatus so we can send it back to the driver
+ val blockLocations = status match {
+ case f: LocatedFileStatus =>
+ f.getBlockLocations.map { loc =>
+ SerializableBlockLocation(
+ loc.getNames,
+ loc.getHosts,
+ loc.getOffset,
+ loc.getLength)
+ }
+
+ case _ =>
+ Array.empty[SerializableBlockLocation]
+ }
+
+ SerializableFileStatus(
+ status.getPath.toString,
+ status.getLen,
+ status.isDirectory,
+ status.getReplication,
+ status.getBlockSize,
+ status.getModificationTime,
+ status.getAccessTime,
+ blockLocations)
+ }
+ (path.toString, serializableStatuses)
+ }.collect()
+ } finally {
+ sc.setJobDescription(previousJobDescription)
+ }
+
+ // turn SerializableFileStatus back to Status
+ statusMap.map { case (path, serializableStatuses) =>
+ val statuses = serializableStatuses.map { f =>
+ val blockLocations = f.blockLocations.map { loc =>
+ new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+ }
+ new LocatedFileStatus(
+ new FileStatus(
+ f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime,
+ new Path(f.path)),
+ blockLocations)
+ }
+ (new Path(path), statuses)
+ }
+ }
+
+ // scalastyle:off argcount
+ /**
+ * Lists a single filesystem path recursively. If a `SparkContext` object is specified, this
+ * function may launch Spark jobs to parallelize listing based on `parallelismThreshold`.
+ *
+ * If contextOpt is None, this may be called on executors.
+ *
+ * @return all children of path that match the specified filter.
+ */
+ private def listLeafFiles(
+ path: Path,
+ hadoopConf: Configuration,
+ filter: PathFilter,
+ contextOpt: Option[SparkContext],
+ ignoreMissingFiles: Boolean,
+ ignoreLocality: Boolean,
+ isRootPath: Boolean,
+ parallelismThreshold: Int,
+ parallelismMax: Int): Seq[FileStatus] = {
+
+ logTrace(s"Listing $path")
+ val fs = path.getFileSystem(hadoopConf)
+
+ // Note that statuses only include FileStatus for the files and dirs directly under path,
+ // and do not include anything else recursively.
+ val statuses: Array[FileStatus] = try {
+ fs match {
+ // DistributedFileSystem overrides listLocatedStatus to make a single call to the namenode
+ // to retrieve the file status with the file block location. The reason to still fall back
+ // to listStatus is that the default implementation would potentially throw a
+ // FileNotFoundException which is better handled by doing the lookups manually below.
+ case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality =>
+ val remoteIter = fs.listLocatedStatus(path)
+ new Iterator[LocatedFileStatus]() {
+ def next(): LocatedFileStatus = remoteIter.next
+ def hasNext(): Boolean = remoteIter.hasNext
+ }.toArray
+ case _ => fs.listStatus(path)
+ }
+ } catch {
+ // If we are listing a root path for SQL (e.g. a top level directory of a table), we need to
+ // ignore FileNotFoundExceptions during this root level of the listing because
+ //
+ // (a) certain code paths might construct an InMemoryFileIndex with root paths that
+ // might not exist (i.e. not all callers are guaranteed to have checked
+ // path existence prior to constructing InMemoryFileIndex) and,
+ // (b) we need to ignore deleted root paths during REFRESH TABLE, otherwise we break
+ // existing behavior and break the ability to drop SessionCatalog tables when tables'
+ // root directories have been deleted (which breaks a number of Spark's own tests).
+ //
+ // If we are NOT listing a root path then a FileNotFoundException here means that the
+ // directory was present in a previous level of file listing but is absent in this
+ // listing, likely indicating a race condition (e.g. concurrent table overwrite or S3
+ // list inconsistency).
+ //
+ // The trade-off in supporting existing behaviors / use-cases is that we won't be
+ // able to detect race conditions involving root paths being deleted during
+ // InMemoryFileIndex construction. However, it's still a net improvement to detect and
+ // fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion.
+ case _: FileNotFoundException if isRootPath || ignoreMissingFiles =>
+ logWarning(s"The directory $path was not found. Was it deleted very recently?")
+ Array.empty[FileStatus]
+ }
+
+ val filteredStatuses =
+ statuses.filterNot(status => shouldFilterOutPathName(status.getPath.getName))
+
+ val allLeafStatuses = {
+ val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
+ val nestedFiles: Seq[FileStatus] = contextOpt match {
+ case Some(context) if dirs.size > parallelismThreshold =>
+ parallelListLeafFilesInternal(
+ context,
+ dirs.map(_.getPath),
+ hadoopConf = hadoopConf,
+ filter = filter,
+ isRootLevel = false,
+ ignoreMissingFiles = ignoreMissingFiles,
+ ignoreLocality = ignoreLocality,
+ parallelismThreshold = parallelismThreshold,
+ parallelismMax = parallelismMax
+ ).flatMap(_._2)
+ case _ =>
+ dirs.flatMap { dir =>
+ listLeafFiles(
+ path = dir.getPath,
+ hadoopConf = hadoopConf,
+ filter = filter,
+ contextOpt = contextOpt,
+ ignoreMissingFiles = ignoreMissingFiles,
+ ignoreLocality = ignoreLocality,
+ isRootPath = false,
+ parallelismThreshold = parallelismThreshold,
+ parallelismMax = parallelismMax)
+ }
+ }
+ val allFiles = topLevelFiles ++ nestedFiles
+ if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles
+ }
+
+ val missingFiles = mutable.ArrayBuffer.empty[String]
+ val resolvedLeafStatuses = allLeafStatuses.flatMap {
+ case f: LocatedFileStatus =>
+ Some(f)
+
+ // NOTE:
+ //
+ // - Although the S3/S3A/S3N file systems can be quite slow for remote file metadata
+ // operations, calling `getFileBlockLocations` does no harm here since these file system
+ // implementations don't actually issue RPCs for this method.
+ //
+ // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
+ // be a big deal since we always use `parallelListLeafFiles` when the number of
+ // paths exceeds the threshold.
+ case f if !ignoreLocality =>
+ // The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
+ // which is very slow on some file systems (RawLocalFileSystem, which launches a
+ // subprocess and parses the stdout).
+ try {
+ val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc =>
+ // Store BlockLocation objects to consume less memory
+ if (loc.getClass == classOf[BlockLocation]) {
+ loc
+ } else {
+ new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
+ }
+ }
+ val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
+ f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
+ if (f.isSymlink) {
+ lfs.setSymlink(f.getSymlink)
+ }
+ Some(lfs)
+ } catch {
+ case _: FileNotFoundException if ignoreMissingFiles =>
+ missingFiles += f.getPath.toString
+ None
+ }
+
+ case f => Some(f)
+ }
+
+ if (missingFiles.nonEmpty) {
+ logWarning(
+ s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}")
+ }
+
+ resolvedLeafStatuses
+ }
+ // scalastyle:on argcount
+
+ /** A serializable variant of HDFS's BlockLocation. This is required by Hadoop 2.7. */
+ private case class SerializableBlockLocation(
+ names: Array[String],
+ hosts: Array[String],
+ offset: Long,
+ length: Long)
+
+ /** A serializable variant of HDFS's FileStatus. This is required by Hadoop 2.7. */
+ private case class SerializableFileStatus(
+ path: String,
+ length: Long,
+ isDir: Boolean,
+ blockReplication: Short,
+ blockSize: Long,
+ modificationTime: Long,
+ accessTime: Long,
+ blockLocations: Array[SerializableBlockLocation])
+
+ /** Checks if we should filter out this path name. */
+ def shouldFilterOutPathName(pathName: String): Boolean = {
+ // We filter out the following paths:
+ // 1. everything that starts with _ and ., except _common_metadata and _metadata
+ // because Parquet needs to find those metadata files from leaf files returned by this method.
+ // We should refactor this logic to not mix metadata files with data files.
+ // 2. everything that ends with `._COPYING_`, because this is an intermediate state of a file. We
+ // should skip it to avoid reading the same data twice.
+ val exclude = (pathName.startsWith("_") && !pathName.contains("=")) ||
+ pathName.startsWith(".") || pathName.endsWith("._COPYING_")
+ val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata")
+ exclude && !include
+ }
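+
+ // Informal examples (hypothetical path names) of how the rules above apply:
+ //   shouldFilterOutPathName(".hidden")            // true  (hidden file)
+ //   shouldFilterOutPathName("_SUCCESS")           // true  (starts with "_" and has no "=")
+ //   shouldFilterOutPathName("part-0._COPYING_")   // true  (in-flight copy)
+ //   shouldFilterOutPathName("_metadata")          // false (needed by Parquet)
+ //   shouldFilterOutPathName("year=2020")          // false (partition directory)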
+}
diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
index 9254ac94005f1..c74cca9b93b89 100644
--- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -32,8 +32,8 @@ import org.json4s.jackson.JsonMethods._
import org.apache.spark._
import org.apache.spark.executor._
import org.apache.spark.metrics.ExecutorMetricType
-import org.apache.spark.rdd.RDDOperationScope
-import org.apache.spark.resource.{ResourceInformation, ResourceProfile}
+import org.apache.spark.rdd.{DeterministicLevel, RDDOperationScope}
+import org.apache.spark.resource.{ExecutorResourceRequest, ResourceInformation, ResourceProfile, TaskResourceRequest}
import org.apache.spark.scheduler._
import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.storage._
@@ -105,6 +105,8 @@ private[spark] object JsonProtocol {
stageExecutorMetricsToJson(stageExecutorMetrics)
case blockUpdate: SparkListenerBlockUpdated =>
blockUpdateToJson(blockUpdate)
+ case resourceProfileAdded: SparkListenerResourceProfileAdded =>
+ resourceProfileAddedToJson(resourceProfileAdded)
case _ => parse(mapper.writeValueAsString(event))
}
}
@@ -224,6 +226,15 @@ private[spark] object JsonProtocol {
("Timestamp" -> applicationEnd.time)
}
+ def resourceProfileAddedToJson(profileAdded: SparkListenerResourceProfileAdded): JValue = {
+ ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.resourceProfileAdded) ~
+ ("Resource Profile Id" -> profileAdded.resourceProfile.id) ~
+ ("Executor Resource Requests" ->
+ executorResourceRequestMapToJson(profileAdded.resourceProfile.executorResources)) ~
+ ("Task Resource Requests" ->
+ taskResourceRequestMapToJson(profileAdded.resourceProfile.taskResources))
+ }
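+
+ // For illustration, the resulting JSON has roughly this shape (field values are examples):
+ //   {"Event": "SparkListenerResourceProfileAdded", "Resource Profile Id": 1,
+ //    "Executor Resource Requests": {"gpu": {"Resource Name": "gpu", "Amount": 2,
+ //      "Discovery Script": "", "Vendor": ""}},
+ //    "Task Resource Requests": {"gpu": {"Resource Name": "gpu", "Amount": 1}}}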
+
def executorAddedToJson(executorAdded: SparkListenerExecutorAdded): JValue = {
("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.executorAdded) ~
("Timestamp" -> executorAdded.time) ~
@@ -297,7 +308,8 @@ private[spark] object JsonProtocol {
("Submission Time" -> submissionTime) ~
("Completion Time" -> completionTime) ~
("Failure Reason" -> failureReason) ~
- ("Accumulables" -> accumulablesToJson(stageInfo.accumulables.values))
+ ("Accumulables" -> accumulablesToJson(stageInfo.accumulables.values)) ~
+ ("Resource Profile Id" -> stageInfo.resourceProfileId)
}
def taskInfoToJson(taskInfo: TaskInfo): JValue = {
@@ -316,12 +328,12 @@ private[spark] object JsonProtocol {
("Accumulables" -> accumulablesToJson(taskInfo.accumulables))
}
- private lazy val accumulableBlacklist = Set("internal.metrics.updatedBlockStatuses")
+ private lazy val accumulableExcludeList = Set("internal.metrics.updatedBlockStatuses")
def accumulablesToJson(accumulables: Iterable[AccumulableInfo]): JArray = {
JArray(accumulables
- .filterNot(_.name.exists(accumulableBlacklist.contains))
- .toList.map(accumulableInfoToJson))
+ .filterNot(_.name.exists(accumulableExcludeList.contains))
+ .toList.sortBy(_.id).map(accumulableInfoToJson))
}
def accumulableInfoToJson(accumulableInfo: AccumulableInfo): JValue = {
@@ -351,12 +363,22 @@ private[spark] object JsonProtocol {
case v: Long => JInt(v)
// We only have 3 kinds of internal accumulator types, so if it's not int or long, it must be
// the blocks accumulator, whose type is `java.util.List[(BlockId, BlockStatus)]`
- case v =>
- JArray(v.asInstanceOf[java.util.List[(BlockId, BlockStatus)]].asScala.toList.map {
- case (id, status) =>
- ("Block ID" -> id.toString) ~
- ("Status" -> blockStatusToJson(status))
+ case v: java.util.List[_] =>
+ JArray(v.asScala.toList.flatMap {
+ case (id: BlockId, status: BlockStatus) =>
+ Some(
+ ("Block ID" -> id.toString) ~
+ ("Status" -> blockStatusToJson(status))
+ )
+ case _ =>
+ // Ignore unsupported types. A user may put `METRICS_PREFIX` in the name. We should
+ // not crash.
+ None
})
+ case _ =>
+ // Ignore unsupported types. A user may put `METRICS_PREFIX` in the name. We should not
+ // crash.
+ JNothing
}
} else {
// For all external accumulators, just use strings
@@ -475,6 +497,8 @@ private[spark] object JsonProtocol {
("Callsite" -> rddInfo.callSite) ~
("Parent IDs" -> parentIds) ~
("Storage Level" -> storageLevel) ~
+ ("Barrier" -> rddInfo.isBarrier) ~
+ ("DeterministicLevel" -> rddInfo.outputDeterministicLevel.toString) ~
("Number of Partitions" -> rddInfo.numPartitions) ~
("Number of Cached Partitions" -> rddInfo.numCachedPartitions) ~
("Memory Size" -> rddInfo.memSize) ~
@@ -500,7 +524,8 @@ private[spark] object JsonProtocol {
("Total Cores" -> executorInfo.totalCores) ~
("Log Urls" -> mapToJson(executorInfo.logUrlMap)) ~
("Attributes" -> mapToJson(executorInfo.attributes)) ~
- ("Resources" -> resourcesMapToJson(executorInfo.resourcesInfo))
+ ("Resources" -> resourcesMapToJson(executorInfo.resourcesInfo)) ~
+ ("Resource Profile Id" -> executorInfo.resourceProfileId)
}
def resourcesMapToJson(m: Map[String, ResourceInformation]): JValue = {
@@ -518,6 +543,34 @@ private[spark] object JsonProtocol {
("Disk Size" -> blockUpdatedInfo.diskSize)
}
+ def executorResourceRequestToJson(execReq: ExecutorResourceRequest): JValue = {
+ ("Resource Name" -> execReq.resourceName) ~
+ ("Amount" -> execReq.amount) ~
+ ("Discovery Script" -> execReq.discoveryScript) ~
+ ("Vendor" -> execReq.vendor)
+ }
+
+ def executorResourceRequestMapToJson(m: Map[String, ExecutorResourceRequest]): JValue = {
+ val jsonFields = m.map {
+ case (k, execReq) =>
+ JField(k, executorResourceRequestToJson(execReq))
+ }
+ JObject(jsonFields.toList)
+ }
+
+ def taskResourceRequestToJson(taskReq: TaskResourceRequest): JValue = {
+ ("Resource Name" -> taskReq.resourceName) ~
+ ("Amount" -> taskReq.amount)
+ }
+
+ def taskResourceRequestMapToJson(m: Map[String, TaskResourceRequest]): JValue = {
+ val jsonFields = m.map {
+ case (k, taskReq) =>
+ JField(k, taskResourceRequestToJson(taskReq))
+ }
+ JObject(jsonFields.toList)
+ }
+
/** ------------------------------ *
* Util JSON serialization methods |
* ------------------------------- */
@@ -577,6 +630,7 @@ private[spark] object JsonProtocol {
val metricsUpdate = Utils.getFormattedClassName(SparkListenerExecutorMetricsUpdate)
val stageExecutorMetrics = Utils.getFormattedClassName(SparkListenerStageExecutorMetrics)
val blockUpdate = Utils.getFormattedClassName(SparkListenerBlockUpdated)
+ val resourceProfileAdded = Utils.getFormattedClassName(SparkListenerResourceProfileAdded)
}
def sparkEventFromJson(json: JValue): SparkListenerEvent = {
@@ -602,6 +656,7 @@ private[spark] object JsonProtocol {
case `metricsUpdate` => executorMetricsUpdateFromJson(json)
case `stageExecutorMetrics` => stageExecutorMetricsFromJson(json)
case `blockUpdate` => blockUpdateFromJson(json)
+ case `resourceProfileAdded` => resourceProfileAddedFromJson(json)
case other => mapper.readValue(compact(render(json)), Utils.classForName(other))
.asInstanceOf[SparkListenerEvent]
}
@@ -678,6 +733,45 @@ private[spark] object JsonProtocol {
SparkListenerJobEnd(jobId, completionTime, jobResult)
}
+ def resourceProfileAddedFromJson(json: JValue): SparkListenerResourceProfileAdded = {
+ val profId = (json \ "Resource Profile Id").extract[Int]
+ val executorReqs = executorResourceRequestMapFromJson(json \ "Executor Resource Requests")
+ val taskReqs = taskResourceRequestMapFromJson(json \ "Task Resource Requests")
+ val rp = new ResourceProfile(executorReqs.toMap, taskReqs.toMap)
+ rp.setResourceProfileId(profId)
+ SparkListenerResourceProfileAdded(rp)
+ }
+
+ def executorResourceRequestFromJson(json: JValue): ExecutorResourceRequest = {
+ val rName = (json \ "Resource Name").extract[String]
+ val amount = (json \ "Amount").extract[Int]
+ val discoveryScript = (json \ "Discovery Script").extract[String]
+ val vendor = (json \ "Vendor").extract[String]
+ new ExecutorResourceRequest(rName, amount, discoveryScript, vendor)
+ }
+
+ def taskResourceRequestFromJson(json: JValue): TaskResourceRequest = {
+ val rName = (json \ "Resource Name").extract[String]
+ val amount = (json \ "Amount").extract[Int]
+ new TaskResourceRequest(rName, amount)
+ }
+
+ def taskResourceRequestMapFromJson(json: JValue): Map[String, TaskResourceRequest] = {
+ val jsonFields = json.asInstanceOf[JObject].obj
+ jsonFields.collect { case JField(k, v) =>
+ val req = taskResourceRequestFromJson(v)
+ (k, req)
+ }.toMap
+ }
+
+ def executorResourceRequestMapFromJson(json: JValue): Map[String, ExecutorResourceRequest] = {
+ val jsonFields = json.asInstanceOf[JObject].obj
+ jsonFields.collect { case JField(k, v) =>
+ val req = executorResourceRequestFromJson(v)
+ (k, req)
+ }.toMap
+ }
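+
+ // For illustration, these parsers invert the *MapToJson helpers above; e.g. parsing
+ //   {"gpu": {"Resource Name": "gpu", "Amount": 2, "Discovery Script": "", "Vendor": ""}}
+ // yields Map("gpu" -> new ExecutorResourceRequest("gpu", 2, "", "")).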
+
def environmentUpdateFromJson(json: JValue): SparkListenerEnvironmentUpdate = {
// For compatibility with previous event logs
val hadoopProperties = jsonOption(json \ "Hadoop Properties").map(mapFromJson(_).toSeq)
@@ -804,9 +898,10 @@ private[spark] object JsonProtocol {
}
}
- val stageInfo = new StageInfo(
- stageId, attemptId, stageName, numTasks, rddInfos, parentIds, details,
- resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)
+ val rpId = jsonOption(json \ "Resource Profile Id").map(_.extract[Int])
+ val stageProf = rpId.getOrElse(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)
+ val stageInfo = new StageInfo(stageId, attemptId, stageName, numTasks, rddInfos,
+ parentIds, details, resourceProfileId = stageProf)
stageInfo.submissionTime = submissionTime
stageInfo.completionTime = completionTime
stageInfo.failureReason = failureReason
@@ -984,7 +1079,14 @@ private[spark] object JsonProtocol {
val blockManagerAddress = blockManagerIdFromJson(json \ "Block Manager Address")
val shuffleId = (json \ "Shuffle ID").extract[Int]
val mapId = (json \ "Map ID").extract[Long]
- val mapIndex = (json \ "Map Index").extract[Int]
+ val mapIndex = json \ "Map Index" match {
+ case JNothing =>
+ // Note, we use the invalid value Int.MinValue here to fill the map index for backward
+ // compatibility. Otherwise, the fetch failed event will be dropped when the history
+ // server loads an event log written by a Spark version before 3.0.
+ Int.MinValue
+ case x => x.extract[Int]
+ }
val reduceId = (json \ "Reduce ID").extract[Int]
val message = jsonOption(json \ "Message").map(_.extract[String])
new FetchFailed(blockManagerAddress, shuffleId, mapId, mapIndex, reduceId,
@@ -1074,8 +1176,12 @@ private[spark] object JsonProtocol {
val memSize = (json \ "Memory Size").extract[Long]
val diskSize = (json \ "Disk Size").extract[Long]
+ val outputDeterministicLevel = DeterministicLevel.withName(
+ jsonOption(json \ "DeterministicLevel").map(_.extract[String]).getOrElse("DETERMINATE"))
+
val rddInfo =
- new RDDInfo(rddId, name, numPartitions, storageLevel, isBarrier, parentIds, callsite, scope)
+ new RDDInfo(rddId, name, numPartitions, storageLevel, isBarrier, parentIds, callsite, scope,
+ outputDeterministicLevel)
rddInfo.numCachedPartitions = numCachedPartitions
rddInfo.memSize = memSize
rddInfo.diskSize = diskSize
@@ -1109,7 +1215,12 @@ private[spark] object JsonProtocol {
case Some(resources) => resourcesMapFromJson(resources).toMap
case None => Map.empty[String, ResourceInformation]
}
- new ExecutorInfo(executorHost, totalCores, logUrls, attributes, resources)
+ val resourceProfileId = jsonOption(json \ "Resource Profile Id") match {
+ case Some(id) => id.extract[Int]
+ case None => ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID
+ }
+ new ExecutorInfo(executorHost, totalCores, logUrls, attributes.toMap, resources.toMap,
+ resourceProfileId)
}
def blockUpdatedInfoFromJson(json: JValue): BlockUpdatedInfo = {
@@ -1123,7 +1234,7 @@ private[spark] object JsonProtocol {
def resourcesMapFromJson(json: JValue): Map[String, ResourceInformation] = {
val jsonFields = json.asInstanceOf[JObject].obj
- jsonFields.map { case JField(k, v) =>
+ jsonFields.collect { case JField(k, v) =>
val resourceInfo = ResourceInformation.parseJson(v)
(k, resourceInfo)
}.toMap
@@ -1135,7 +1246,7 @@ private[spark] object JsonProtocol {
def mapFromJson(json: JValue): Map[String, String] = {
val jsonFields = json.asInstanceOf[JObject].obj
- jsonFields.map { case JField(k, JString(v)) => (k, v) }.toMap
+ jsonFields.collect { case JField(k, JString(v)) => (k, v) }.toMap
}
def propertiesFromJson(json: JValue): Properties = {
diff --git a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala
index 51cd7d1284ff3..3520fa870c91b 100644
--- a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala
@@ -27,6 +27,8 @@ import com.codahale.metrics.Timer
import org.apache.spark.SparkEnv
import org.apache.spark.internal.{config, Logging}
+import org.apache.spark.scheduler.EventLoggingListener
+import org.apache.spark.scheduler.SparkListenerEnvironmentUpdate
/**
* An event bus which posts events to its listeners.
@@ -128,7 +130,7 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging {
if (maybeTimerContext != null) {
val elapsed = maybeTimerContext.stop()
if (logSlowEventEnabled && elapsed > logSlowEventThreshold) {
- logInfo(s"Process of event ${event} by listener ${listenerName} took " +
+ logInfo(s"Process of event ${redactEvent(event)} by listener ${listenerName} took " +
s"${elapsed / 1000000000d}s.")
}
}
@@ -150,4 +152,12 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging {
listeners.asScala.filter(_.getClass == c).map(_.asInstanceOf[T]).toSeq
}
+ private def redactEvent(e: E): E = {
+ e match {
+ case event: SparkListenerEnvironmentUpdate =>
+ EventLoggingListener.redactEvent(env.conf, event).asInstanceOf[E]
+ case _ => e
+ }
+ }
+
}
diff --git a/core/src/main/scala/org/apache/spark/util/RpcUtils.scala b/core/src/main/scala/org/apache/spark/util/RpcUtils.scala
index 7272b375e5388..0e4debc595345 100644
--- a/core/src/main/scala/org/apache/spark/util/RpcUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/RpcUtils.scala
@@ -17,6 +17,8 @@
package org.apache.spark.util
+import scala.concurrent.duration._
+
import org.apache.spark.SparkConf
import org.apache.spark.internal.config
import org.apache.spark.internal.config.Network._
@@ -54,6 +56,14 @@ private[spark] object RpcUtils {
RpcTimeout(conf, Seq(RPC_LOOKUP_TIMEOUT.key, NETWORK_TIMEOUT.key), "120s")
}
+ /**
+ * An infinite timeout is used internally, so there is no configuration property that
+ * controls it. We simply use "infinite" as the name of its timeout property; that property
+ * should never be accessed, since an infinite timeout never times out.
+ */
+ val INFINITE_TIMEOUT = new RpcTimeout(Long.MaxValue.nanos, "infinite")
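+
+ // Illustrative use (endpointRef and msg are placeholder names):
+ //   endpointRef.askSync[Boolean](msg, RpcUtils.INFINITE_TIMEOUT)
+ // blocks until the endpoint replies, with no timeout applied.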
+
private val MAX_MESSAGE_SIZE_IN_MB = Int.MaxValue / 1024 / 1024
/** Returns the configured max message size for messages in bytes. */
diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
index 78206c51c1028..d45dc937910d9 100644
--- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
@@ -23,7 +23,6 @@ import java.util.concurrent.locks.ReentrantLock
import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration.{Duration, FiniteDuration}
-import scala.language.higherKinds
import scala.util.control.NonFatal
import com.google.common.util.concurrent.ThreadFactoryBuilder
diff --git a/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala b/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala
index 6a58ec142dd7f..24788d69121b2 100644
--- a/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala
+++ b/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala
@@ -31,7 +31,7 @@ private[spark] class UninterruptibleThread(
target: Runnable,
name: String) extends Thread(target, name) {
- def this(name: String) {
+ def this(name: String) = {
this(null, name)
}
diff --git a/core/src/main/scala/org/apache/spark/util/UninterruptibleThreadRunner.scala b/core/src/main/scala/org/apache/spark/util/UninterruptibleThreadRunner.scala
new file mode 100644
index 0000000000000..18108aa819db9
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/UninterruptibleThreadRunner.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Executors
+
+import scala.concurrent.{ExecutionContext, Future}
+import scala.concurrent.duration.Duration
+
+/**
+ * [[UninterruptibleThreadRunner]] ensures that all tasks are running in an
+ * [[UninterruptibleThread]]. A good example is Kafka consumer usage.
+ */
+private[spark] class UninterruptibleThreadRunner(threadName: String) {
+ private val thread = Executors.newSingleThreadExecutor((r: Runnable) => {
+ val t = new UninterruptibleThread(threadName) {
+ override def run(): Unit = {
+ r.run()
+ }
+ }
+ t.setDaemon(true)
+ t
+ })
+ private val execContext = ExecutionContext.fromExecutorService(thread)
+
+ def runUninterruptibly[T](body: => T): T = {
+ if (!Thread.currentThread.isInstanceOf[UninterruptibleThread]) {
+ val future = Future {
+ body
+ }(execContext)
+ ThreadUtils.awaitResult(future, Duration.Inf)
+ } else {
+ body
+ }
+ }
+
+ def shutdown(): Unit = {
+ thread.shutdown()
+ }
+}
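+
+// Illustrative usage sketch (names below are placeholders):
+//   val runner = new UninterruptibleThreadRunner("Kafka Offset Reader")
+//   try {
+//     runner.runUninterruptibly { /* e.g. consumer.poll(timeout) */ }
+//   } finally {
+//     runner.shutdown()
+//   }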
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index c7db2127a6f04..e27666b284b53 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -28,7 +28,7 @@ import java.nio.channels.{Channels, FileChannel, WritableByteChannel}
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.security.SecureRandom
-import java.util.{Arrays, Locale, Properties, Random, UUID}
+import java.util.{Locale, Properties, Random, UUID}
import java.util.concurrent._
import java.util.concurrent.TimeUnit.NANOSECONDS
import java.util.zip.GZIPInputStream
@@ -53,6 +53,7 @@ import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec}
import org.apache.hadoop.security.UserGroupInformation
+import org.apache.hadoop.util.{RunJar, StringUtils}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.eclipse.jetty.util.MultiException
import org.slf4j.Logger
@@ -376,7 +377,7 @@ private[spark] object Utils extends Logging {
* This returns a new InputStream which contains the same data as the original input stream.
* It may be entirely on in-memory buffer, or it may be a combination of in-memory data, and then
* continue to read from the original stream. The only real use of this is if the original input
- * stream will potentially detect corruption while the data is being read (eg. from compression).
+ * stream will potentially detect corruption while the data is being read (e.g. from compression).
* This allows for an eager check of corruption in the first maxSize bytes of data.
*
* @return An InputStream which includes all data from the original stream (combining buffered
@@ -486,15 +487,19 @@ private[spark] object Utils extends Logging {
*
* Throws SparkException if the target file already exists and has different contents than
* the requested file.
+ *
+ * If `shouldUntar` is true, the fetched file is untarred into `targetDir` when the given URL
+ * points to a .tar.gz or .tgz archive. This is legacy behavior; users should prefer the
+ * `spark.archives` configuration or `SparkContext.addArchive` instead.
*/
def fetchFile(
url: String,
targetDir: File,
conf: SparkConf,
- securityMgr: SecurityManager,
hadoopConf: Configuration,
timestamp: Long,
- useCache: Boolean): File = {
+ useCache: Boolean,
+ shouldUntar: Boolean = true): File = {
val fileName = decodeFileNameInURI(new URI(url))
val targetFile = new File(targetDir, fileName)
val fetchCacheEnabled = conf.getBoolean("spark.files.useFetchCache", defaultValue = true)
@@ -519,7 +524,7 @@ private[spark] object Utils extends Logging {
val cachedFile = new File(localDir, cachedFileName)
try {
if (!cachedFile.exists()) {
- doFetchFile(url, localDir, cachedFileName, conf, securityMgr, hadoopConf)
+ doFetchFile(url, localDir, cachedFileName, conf, hadoopConf)
}
} finally {
lock.release()
@@ -532,16 +537,26 @@ private[spark] object Utils extends Logging {
conf.getBoolean("spark.files.overwrite", false)
)
} else {
- doFetchFile(url, targetDir, fileName, conf, securityMgr, hadoopConf)
- }
-
- // Decompress the file if it's a .tar or .tar.gz
- if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
- logInfo("Untarring " + fileName)
- executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir)
- } else if (fileName.endsWith(".tar")) {
- logInfo("Untarring " + fileName)
- executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir)
+ doFetchFile(url, targetDir, fileName, conf, hadoopConf)
+ }
+
+ if (shouldUntar) {
+ // Decompress the file if it's a .tar or .tar.gz
+ if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
+ logWarning(
+ "Untarring behavior will be deprecated at spark.files and " +
+ "SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive " +
+ "instead.")
+ logInfo("Untarring " + fileName)
+ executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir)
+ } else if (fileName.endsWith(".tar")) {
+ logWarning(
+ "Untarring behavior will be deprecated at spark.files and " +
+ "SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive " +
+ "instead.")
+ logInfo("Untarring " + fileName)
+ executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir)
+ }
}
// Make the file executable - That's necessary for scripts
FileUtil.chmod(targetFile.getAbsolutePath, "a+x")
@@ -555,6 +570,26 @@ private[spark] object Utils extends Logging {
targetFile
}
+ /**
+ * Unpacks an archive file into the specified directory. It expects .jar, .zip, .tar.gz, .tgz
+ * and .tar files. This behaves the same as Hadoop's archive handling in the distributed cache.
+ * This method is
+ * basically copied from `org.apache.hadoop.yarn.util.FSDownload.unpack`.
+ */
+ def unpack(source: File, dest: File): Unit = {
+ val lowerSrc = StringUtils.toLowerCase(source.getName)
+ if (lowerSrc.endsWith(".jar")) {
+ RunJar.unJar(source, dest, RunJar.MATCH_ANY)
+ } else if (lowerSrc.endsWith(".zip")) {
+ FileUtil.unZip(source, dest)
+ } else if (
+ lowerSrc.endsWith(".tar.gz") || lowerSrc.endsWith(".tgz") || lowerSrc.endsWith(".tar")) {
+ FileUtil.unTar(source, dest)
+ } else {
+ logWarning(s"Cannot unpack $source, just copying it to $dest.")
+ copyRecursive(source, dest)
+ }
+ }
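+
+ // Illustrative use (paths are placeholders):
+ //   Utils.unpack(new File("/tmp/deps.tar.gz"), new File("/tmp/deps"))
+ // extracts the archive, while a file with an unrecognized extension is simply copied to the
+ // destination.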
+
/** Records the duration of running `body`. */
def timeTakenMs[T](body: => T): (T, Long) = {
val startTime = System.nanoTime()
@@ -705,7 +740,6 @@ private[spark] object Utils extends Logging {
targetDir: File,
filename: String,
conf: SparkConf,
- securityMgr: SecurityManager,
hadoopConf: Configuration): File = {
val targetFile = new File(targetDir, filename)
val uri = new URI(url)
@@ -1026,13 +1060,27 @@ private[spark] object Utils extends Logging {
customHostname.getOrElse(InetAddresses.toUriString(localIpAddress))
}
+ /**
+ * Checks that the host contains only a valid hostname or IP address, without a port.
+ * NOTE: an IPv6 address must be enclosed in [].
+ */
def checkHost(host: String): Unit = {
- assert(host != null && host.indexOf(':') == -1, s"Expected hostname (not IP) but got $host")
+ if (host != null && host.split(":").length > 2) {
+ assert(host.startsWith("[") && host.endsWith("]"),
+ s"Expected hostname or IPv6 IP enclosed in [] but got $host")
+ } else {
+ assert(host != null && host.indexOf(':') == -1, s"Expected hostname or IP but got $host")
+ }
}
def checkHostPort(hostPort: String): Unit = {
- assert(hostPort != null && hostPort.indexOf(':') != -1,
- s"Expected host and port but got $hostPort")
+ if (hostPort != null && hostPort.split(":").length > 2) {
+ assert(hostPort != null && hostPort.indexOf("]:") != -1,
+ s"Expected host and port but got $hostPort")
+ } else {
+ assert(hostPort != null && hostPort.indexOf(':') != -1,
+ s"Expected host and port but got $hostPort")
+ }
}
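+
+ // For illustration: checkHost("example.com") and checkHost("[2001:db8::1]") pass, while
+ // checkHost("example.com:8080") fails; checkHostPort("example.com:8080") and
+ // checkHostPort("[2001:db8::1]:8080") both pass.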
// Typically, this will be of order of number of nodes in cluster
@@ -1046,18 +1094,30 @@ private[spark] object Utils extends Logging {
return cached
}
- val indx: Int = hostPort.lastIndexOf(':')
- // This is potentially broken - when dealing with ipv6 addresses for example, sigh ...
- // but then hadoop does not support ipv6 right now.
- // For now, we assume that if port exists, then it is valid - not check if it is an int > 0
- if (-1 == indx) {
+ def setDefaultPortValue: (String, Int) = {
val retval = (hostPort, 0)
hostPortParseResults.put(hostPort, retval)
- return retval
+ retval
+ }
+ // Check whether the hostPort contains an IPv6 address, and parse the host and port accordingly.
+ if (hostPort != null && hostPort.split(":").length > 2) {
+ val index: Int = hostPort.lastIndexOf("]:")
+ if (-1 == index) {
+ return setDefaultPortValue
+ }
+ val port = hostPort.substring(index + 2).trim()
+ val retval = (hostPort.substring(0, index + 1).trim(), if (port.isEmpty) 0 else port.toInt)
+ hostPortParseResults.putIfAbsent(hostPort, retval)
+ } else {
+ val index: Int = hostPort.lastIndexOf(':')
+ if (-1 == index) {
+ return setDefaultPortValue
+ }
+ val port = hostPort.substring(index + 1).trim()
+ val retval = (hostPort.substring(0, index).trim(), if (port.isEmpty) 0 else port.toInt)
+ hostPortParseResults.putIfAbsent(hostPort, retval)
}
- val retval = (hostPort.substring(0, indx).trim(), hostPort.substring(indx + 1).trim().toInt)
- hostPortParseResults.putIfAbsent(hostPort, retval)
hostPortParseResults.get(hostPort)
}
@@ -1716,7 +1776,7 @@ private[spark] object Utils extends Logging {
if (inWord || inDoubleQuote || inSingleQuote) {
endWord()
}
- buf
+ buf.toSeq
}
/* Calculates 'x' modulo 'mod', takes to consideration sign of x,
@@ -1879,7 +1939,9 @@ private[spark] object Utils extends Logging {
* Indicates whether Spark is currently running unit tests.
*/
def isTesting: Boolean = {
- sys.env.contains("SPARK_TESTING") || sys.props.contains(IS_TESTING.key)
+ // Scala's `sys.env` creates a ton of garbage by constructing Scala immutable maps, so
+ // we directly use the Java APIs instead.
+ System.getenv("SPARK_TESTING") != null || System.getProperty(IS_TESTING.key) != null
}
/**
@@ -2001,6 +2063,17 @@ private[spark] object Utils extends Logging {
}
}
+ /** Check whether a path is an absolute URI. */
+ def isAbsoluteURI(path: String): Boolean = {
+ try {
+ val uri = new URI(path: String)
+ uri.isAbsolute
+ } catch {
+ case _: URISyntaxException =>
+ false
+ }
+ }
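+
+ // For illustration: isAbsoluteURI("hdfs:///tmp/data") is true, while isAbsoluteURI("tmp/data")
+ // is false; a plain local path such as "/tmp/data" has no scheme and is thus not absolute.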
+
/** Return all non-local paths from a comma-separated list of paths. */
def nonLocalPaths(paths: String, testWindows: Boolean = false): Array[String] = {
val windows = isWindows || testWindows
@@ -2492,6 +2565,14 @@ private[spark] object Utils extends Logging {
master == "local" || master.startsWith("local[")
}
+ /**
+ * Push-based shuffle can only be enabled when the external shuffle service is enabled.
+ */
+ def isPushBasedShuffleEnabled(conf: SparkConf): Boolean = {
+ conf.get(PUSH_BASED_SHUFFLE_ENABLED) &&
+ (conf.get(IS_TESTING).getOrElse(false) || conf.get(SHUFFLE_SERVICE_ENABLED))
+ }
+
/**
* Return whether dynamic allocation is enabled in the given conf.
*/
@@ -2788,6 +2869,34 @@ private[spark] object Utils extends Logging {
Hex.encodeHexString(secretBytes)
}
+ /**
+ * Returns true if and only if the underlying class is a member class.
+ *
+ * Note: jdk8u throws a "Malformed class name" error if a given class is a deeply-nested
+ * inner class (See SPARK-34607 for details). This issue has already been fixed in jdk9+, so
+ * we can safely remove this helper method if we drop support for jdk8u.
+ */
+ def isMemberClass(cls: Class[_]): Boolean = {
+ try {
+ cls.isMemberClass
+ } catch {
+ case _: InternalError =>
+ // We emulate jdk8u `Class.isMemberClass` below:
+ // public boolean isMemberClass() {
+ // return getSimpleBinaryName() != null && !isLocalOrAnonymousClass();
+ // }
+ // `getSimpleBinaryName()` returns null if a given class is a top-level class,
+ // so we replace it with `cls.getEnclosingClass != null`. The second condition checks
+ // if a given class is not a local or an anonymous class, so we replace it with
+ // `cls.getEnclosingMethod == null` because `cls.getEnclosingMethod()` return a value
+ // only in either case (JVM Spec 4.8.6).
+ //
+ // Note: The newer jdk evaluates `!isLocalOrAnonymousClass()` first,
+ // we reorder the conditions to follow it.
+ cls.getEnclosingMethod == null && cls.getEnclosingClass != null
+ }
+ }
+
/**
* Safer than Class obj's getSimpleName which may throw Malformed class name error in scala.
* This method mimics scalatest's getSimpleNameOfAnObjectsClass.
@@ -2820,11 +2929,11 @@ private[spark] object Utils extends Logging {
if (lastDollarIndex < s.length - 1) {
// The last char is not a dollar sign
if (lastDollarIndex == -1 || !s.contains("$iw")) {
- // The name does not have dollar sign or is not an intepreter
+ // The name does not have dollar sign or is not an interpreter
// generated class, so we should return the full string
s
} else {
- // The class name is intepreter generated,
+ // The class name is interpreter generated,
// return the part after the last dollar sign
// This is the same behavior as getClass.getSimpleName
s.substring(lastDollarIndex + 1)
@@ -2857,14 +2966,14 @@ private[spark] object Utils extends Logging {
*/
private val fullWidthRegex = ("""[""" +
// scalastyle:off nonascii
- """\u1100-\u115F""" +
- """\u2E80-\uA4CF""" +
- """\uAC00-\uD7A3""" +
- """\uF900-\uFAFF""" +
- """\uFE10-\uFE19""" +
- """\uFE30-\uFE6F""" +
- """\uFF00-\uFF60""" +
- """\uFFE0-\uFFE6""" +
+ "\u1100-\u115F" +
+ "\u2E80-\uA4CF" +
+ "\uAC00-\uD7A3" +
+ "\uF900-\uFAFF" +
+ "\uFE10-\uFE19" +
+ "\uFE30-\uFE6F" +
+ "\uFF00-\uFF60" +
+ "\uFFE0-\uFFE6" +
// scalastyle:on nonascii
"""]""").r
@@ -2900,10 +3009,58 @@ private[spark] object Utils extends Logging {
/** Create a new properties object with the same values as `props` */
def cloneProperties(props: Properties): Properties = {
+ if (props == null) {
+ return props
+ }
val resultProps = new Properties()
props.forEach((k, v) => resultProps.put(k, v))
resultProps
}
+
+ /**
+ * Convert a sequence of `Path`s to a metadata string. When the length of the metadata string
+ * exceeds `stopAppendingThreshold`, stop appending paths to save memory.
+ */
+ def buildLocationMetadata(paths: Seq[Path], stopAppendingThreshold: Int): String = {
+ val metadata = new StringBuilder(s"(${paths.length} paths)[")
+ var index: Int = 0
+ while (index < paths.length && metadata.length < stopAppendingThreshold) {
+ if (index > 0) {
+ metadata.append(", ")
+ }
+ metadata.append(paths(index).toString)
+ index += 1
+ }
+ if (paths.length > index) {
+ if (index > 0) {
+ metadata.append(", ")
+ }
+ metadata.append("...")
+ }
+ metadata.append("]")
+ metadata.toString
+ }
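+
+ // Worked example (informal): with paths /a, /b and /c and stopAppendingThreshold = 15, the
+ // builder appends "/a" and "/b", exceeds the threshold, and returns "(3 paths)[/a, /b, ...]".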
+
+ /**
+ * Convert MEMORY_OFFHEAP_SIZE to MB, returning 0 if MEMORY_OFFHEAP_ENABLED is false.
+ */
+ def executorOffHeapMemorySizeAsMb(sparkConf: SparkConf): Int = {
+ val sizeInMB = Utils.memoryStringToMb(sparkConf.get(MEMORY_OFFHEAP_SIZE).toString)
+ checkOffHeapEnabled(sparkConf, sizeInMB).toInt
+ }
+
+ /**
+ * Return `offHeapSize` if MEMORY_OFFHEAP_ENABLED is true (requiring it to be positive), or 0 otherwise.
+ */
+ def checkOffHeapEnabled(sparkConf: SparkConf, offHeapSize: Long): Long = {
+ if (sparkConf.get(MEMORY_OFFHEAP_ENABLED)) {
+ require(offHeapSize > 0,
+ s"${MEMORY_OFFHEAP_SIZE.key} must be > 0 when ${MEMORY_OFFHEAP_ENABLED.key} == true")
+ offHeapSize
+ } else {
+ 0
+ }
+ }
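+
+ // For illustration: with spark.memory.offHeap.enabled=true and spark.memory.offHeap.size=1g,
+ // executorOffHeapMemorySizeAsMb returns 1024; with off-heap memory disabled it returns 0.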
}
private[util] object CallerContext extends Logging {
diff --git a/core/src/main/scala/org/apache/spark/util/VersionUtils.scala b/core/src/main/scala/org/apache/spark/util/VersionUtils.scala
index c0f8866dd58dc..e97d1c9393701 100644
--- a/core/src/main/scala/org/apache/spark/util/VersionUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/VersionUtils.scala
@@ -24,6 +24,7 @@ private[spark] object VersionUtils {
private val majorMinorRegex = """^(\d+)\.(\d+)(\..*)?$""".r
private val shortVersionRegex = """^(\d+\.\d+\.\d+)(.*)?$""".r
+ private val majorMinorPatchRegex = """^(\d+)(?:\.(\d+)(?:\.(\d+)(?:[.-].*)?)?)?$""".r
/**
* Given a Spark version string, return the major version number.
@@ -63,4 +64,36 @@ private[spark] object VersionUtils {
s" version string, but it could not find the major and minor version numbers.")
}
}
+
+ /**
+ * Extracts the major, minor and patch parts from the input `version`. Note that if minor or patch
+ * version is missing from the input, this will return 0 for these parts. Returns `None` if the
+ * input is not of a valid format.
+ *
+ * Examples of valid version:
+ * - 1 (extracts to (1, 0, 0))
+ * - 2.4 (extracts to (2, 4, 0))
+ * - 3.2.2 (extracts to (3, 2, 2))
+ * - 3.2.2.4 (extracts to (3, 2, 2))
+ * - 3.3.1-SNAPSHOT (extracts to (3, 3, 1))
+ * - 3.2.2.4SNAPSHOT (extracts to (3, 2, 2), only the first 3 components)
+ *
+ * Examples of invalid version:
+ * - ABC
+ * - 1X
+ * - 2.4XYZ
+ * - 2.4-SNAPSHOT
+ * - 3.4.5ABC
+ *
+ * @return A non-empty option containing a 3-value tuple (major, minor, patch) iff the
+ * input is a valid version. `None` otherwise.
+ */
+ def majorMinorPatchVersion(version: String): Option[(Int, Int, Int)] = {
+ majorMinorPatchRegex.findFirstMatchIn(version).map { m =>
+ val major = m.group(1).toInt
+ val minor = Option(m.group(2)).map(_.toInt).getOrElse(0)
+ val patch = Option(m.group(3)).map(_.toInt).getOrElse(0)
+ (major, minor, patch)
+ }
+ }
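+
+ // For illustration: majorMinorPatchVersion("3.2.1") returns Some((3, 2, 1)),
+ // majorMinorPatchVersion("2.4") returns Some((2, 4, 0)), and majorMinorPatchVersion("ABC")
+ // returns None.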
}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
index 098f389829ec5..61386114997f6 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
@@ -236,6 +236,18 @@ class BitSet(numBits: Int) extends Serializable {
-1
}
+ /**
+ * Compute bit-wise union with another BitSet and overwrite bits in this BitSet with the result.
+ */
+ def union(other: BitSet): Unit = {
+ require(this.numWords <= other.numWords)
+ var ind = 0
+ while (ind < this.numWords) {
+ this.words(ind) = this.words(ind) | other.words(ind)
+ ind += 1
+ }
+ }
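+
+ // For illustration: if this BitSet has bits {0, 2} set and `other` (of equal or larger size)
+ // has {1, 2} set, then after union(other) this BitSet contains exactly {0, 1, 2}.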
+
/** Return the number of longs it would take to hold numBits. */
private def bit2words(numBits: Int) = ((numBits - 1) >> 6) + 1
}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
index 7f40b469a95e9..731131b688ca7 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
@@ -76,7 +76,7 @@ class ExternalAppendOnlyMap[K, V, C](
mergeValue: (C, V) => C,
mergeCombiners: (C, C) => C,
serializer: Serializer,
- blockManager: BlockManager) {
+ blockManager: BlockManager) = {
this(createCombiner, mergeValue, mergeCombiners, serializer, blockManager, TaskContext.get())
}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
index cc97bbfa7201f..1913637371e31 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
@@ -32,7 +32,7 @@ import org.apache.spark.serializer._
import org.apache.spark.shuffle.ShufflePartitionPairsWriter
import org.apache.spark.shuffle.api.{ShuffleMapOutputWriter, ShufflePartitionWriter}
import org.apache.spark.storage.{BlockId, DiskBlockObjectWriter, ShuffleBlockId}
-import org.apache.spark.util.{Utils => TryUtils}
+import org.apache.spark.util.{CompletionIterator, Utils => TryUtils}
/**
* Sorts and potentially merges a number of key-value pairs of type (K, V) to produce key-combiner
@@ -263,7 +263,7 @@ private[spark] class ExternalSorter[K, V, C](
/**
* Spill contents of in-memory iterator to a temporary file on disk.
*/
- private[this] def spillMemoryIteratorToDisk(inMemoryIterator: WritablePartitionedIterator)
+ private[this] def spillMemoryIteratorToDisk(inMemoryIterator: WritablePartitionedIterator[K, C])
: SpilledFile = {
// Because these files may be read during shuffle, their compression must be controlled by
// spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use
@@ -659,7 +659,7 @@ private[spark] class ExternalSorter[K, V, C](
}
} else {
// Merge spilled and in-memory data
- merge(spills, destructiveIterator(
+ merge(spills.toSeq, destructiveIterator(
collection.partitionedDestructiveSortedIterator(comparator)))
}
}
@@ -672,6 +672,22 @@ private[spark] class ExternalSorter[K, V, C](
partitionedIterator.flatMap(pair => pair._2)
}
+ /**
+ * Insert all records, updates related task metrics, and return a completion iterator
+ * over all the data written to this object, aggregated by our aggregator.
+ * On task completion (success, failure, or cancellation), it releases resources by
+ * calling `stop()`.
+ */
+ def insertAllAndUpdateMetrics(records: Iterator[Product2[K, V]]): Iterator[Product2[K, C]] = {
+ insertAll(records)
+ context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled)
+ context.taskMetrics().incDiskBytesSpilled(diskBytesSpilled)
+ context.taskMetrics().incPeakExecutionMemory(peakMemoryUsedBytes)
+ // Use completion callback to stop sorter if task was finished/cancelled.
+ context.addTaskCompletionListener[Unit](_ => stop())
+ CompletionIterator[Product2[K, C], Iterator[Product2[K, C]]](iterator, stop())
+ }
+
/**
* TODO(SPARK-28764): remove this, as this is only used by UnsafeRowSerializerSuite in the SQL
* project. We should figure out an alternative way to test that so that we can remove this
@@ -734,7 +750,7 @@ private[spark] class ExternalSorter[K, V, C](
// Case where we only have in-memory data
val collection = if (aggregator.isDefined) map else buffer
val it = collection.destructiveSortedWritablePartitionedIterator(comparator)
- while (it.hasNext()) {
+ while (it.hasNext) {
val partitionId = it.nextPartition()
var partitionWriter: ShufflePartitionWriter = null
var partitionPairsWriter: ShufflePartitionPairsWriter = null
@@ -850,18 +866,7 @@ private[spark] class ExternalSorter[K, V, C](
if (hasSpilled) {
false
} else {
- val inMemoryIterator = new WritablePartitionedIterator {
- private[this] var cur = if (upstream.hasNext) upstream.next() else null
-
- def writeNext(writer: PairsWriter): Unit = {
- writer.write(cur._1._2, cur._2)
- cur = if (upstream.hasNext) upstream.next() else null
- }
-
- def hasNext(): Boolean = cur != null
-
- def nextPartition(): Int = cur._1._1
- }
+ val inMemoryIterator = new WritablePartitionedIterator[K, C](upstream)
logInfo(s"Task ${TaskContext.get().taskAttemptId} force spilling in-memory map to disk " +
s"and it will release ${org.apache.spark.util.Utils.bytesToString(getUsed())} memory")
val spillFile = spillMemoryIteratorToDisk(inMemoryIterator)
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ImmutableBitSet.scala b/core/src/main/scala/org/apache/spark/util/collection/ImmutableBitSet.scala
new file mode 100644
index 0000000000000..82413f4317d62
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/collection/ImmutableBitSet.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection
+
+private object ErrorMessage {
+ final val msg: String = "mutable operation is not supported"
+}
+
+// An immutable BitSet that initializes set bits in its constructor.
+class ImmutableBitSet(val numBits: Int, val bitsToSet: Int*) extends BitSet(numBits) {
+
+ // Initialize the set bits.
+ {
+ val bitsIterator = bitsToSet.iterator
+ while (bitsIterator.hasNext) {
+ super.set(bitsIterator.next)
+ }
+ }
+
+ override def clear(): Unit = {
+ throw new UnsupportedOperationException(ErrorMessage.msg)
+ }
+
+ override def clearUntil(bitIndex: Int): Unit = {
+ throw new UnsupportedOperationException(ErrorMessage.msg)
+ }
+
+ override def set(index: Int): Unit = {
+ throw new UnsupportedOperationException(ErrorMessage.msg)
+ }
+
+ override def setUntil(bitIndex: Int): Unit = {
+ throw new UnsupportedOperationException(ErrorMessage.msg)
+ }
+
+ override def unset(index: Int): Unit = {
+ throw new UnsupportedOperationException(ErrorMessage.msg)
+ }
+
+ override def union(other: BitSet): Unit = {
+ throw new UnsupportedOperationException(ErrorMessage.msg)
+ }
+}
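+
+// For illustration: new ImmutableBitSet(5, 1, 3) has bits 1 and 3 set, and any mutating call
+// such as set(0) or clear() throws UnsupportedOperationException.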
diff --git a/core/src/main/scala/org/apache/spark/util/collection/MedianHeap.scala b/core/src/main/scala/org/apache/spark/util/collection/MedianHeap.scala
index 6e57c3c5bee8c..f1a3932bb0e25 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/MedianHeap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/MedianHeap.scala
@@ -37,13 +37,13 @@ private[spark] class MedianHeap(implicit val ord: Ordering[Double]) {
* Stores all the numbers less than the current median in a smallerHalf,
* i.e median is the maximum, at the root.
*/
- private[this] var smallerHalf = PriorityQueue.empty[Double](ord)
+ private[this] val smallerHalf = PriorityQueue.empty[Double](ord)
/**
* Stores all the numbers greater than the current median in a largerHalf,
* i.e median is the minimum, at the root.
*/
- private[this] var largerHalf = PriorityQueue.empty[Double](ord.reverse)
+ private[this] val largerHalf = PriorityQueue.empty[Double](ord.reverse)
def isEmpty(): Boolean = {
smallerHalf.isEmpty && largerHalf.isEmpty
diff --git a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala
index 1983b0002853d..fe488f9cf0daf 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala
@@ -27,7 +27,7 @@ import org.apache.spark.memory.{MemoryConsumer, MemoryMode, TaskMemoryManager}
* has been exceeded.
*/
private[spark] abstract class Spillable[C](taskMemoryManager: TaskMemoryManager)
- extends MemoryConsumer(taskMemoryManager) with Logging {
+ extends MemoryConsumer(taskMemoryManager, MemoryMode.ON_HEAP) with Logging {
/**
* Spills the current in-memory collection to disk, and releases the memory.
*
diff --git a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
index 9624b02cb407c..3472a08cc329c 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
@@ -46,20 +46,9 @@ private[spark] trait WritablePartitionedPairCollection[K, V] {
* This may destroy the underlying collection.
*/
def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
- : WritablePartitionedIterator = {
+ : WritablePartitionedIterator[K, V] = {
val it = partitionedDestructiveSortedIterator(keyComparator)
- new WritablePartitionedIterator {
- private[this] var cur = if (it.hasNext) it.next() else null
-
- def writeNext(writer: PairsWriter): Unit = {
- writer.write(cur._1._2, cur._2)
- cur = if (it.hasNext) it.next() else null
- }
-
- def hasNext(): Boolean = cur != null
-
- def nextPartition(): Int = cur._1._1
- }
+ new WritablePartitionedIterator[K, V](it)
}
}
@@ -87,10 +76,15 @@ private[spark] object WritablePartitionedPairCollection {
* Iterator that writes elements to a DiskBlockObjectWriter instead of returning them. Each element
* has an associated partition.
*/
-private[spark] trait WritablePartitionedIterator {
- def writeNext(writer: PairsWriter): Unit
+private[spark] class WritablePartitionedIterator[K, V](it: Iterator[((Int, K), V)]) {
+ private[this] var cur = if (it.hasNext) it.next() else null
+
+ def writeNext(writer: PairsWriter): Unit = {
+ writer.write(cur._1._2, cur._2)
+ cur = if (it.hasNext) it.next() else null
+ }
- def hasNext(): Boolean
+ def hasNext: Boolean = cur != null
- def nextPartition(): Int
+ def nextPartition(): Int = cur._1._1
}
diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala
index 2c3730de08b5b..8635f1a3d702e 100644
--- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala
@@ -193,7 +193,7 @@ private[spark] object ChunkedByteBuffer {
length: Long): ChunkedByteBuffer = {
// We do *not* memory map the file, because we may end up putting this into the memory store,
// and spark currently is not expecting memory-mapped buffers in the memory store, it conflicts
- // with other parts that manage the lifecyle of buffers and dispose them. See SPARK-25422.
+ // with other parts that manage the lifecycle of buffers and dispose them. See SPARK-25422.
val is = new FileInputStream(file)
ByteStreams.skipFully(is, offset)
val in = new LimitedInputStream(is, length)
diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java
index ee8e38c24b47f..5666bb3e5f140 100644
--- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java
@@ -68,10 +68,10 @@
public class UnsafeShuffleWriterSuite {
static final int DEFAULT_INITIAL_SORT_BUFFER_SIZE = 4096;
- static final int NUM_PARTITITONS = 4;
+ static final int NUM_PARTITIONS = 4;
TestMemoryManager memoryManager;
TaskMemoryManager taskMemoryManager;
- final HashPartitioner hashPartitioner = new HashPartitioner(NUM_PARTITITONS);
+ final HashPartitioner hashPartitioner = new HashPartitioner(NUM_PARTITIONS);
File mergedOutputFile;
File tempDir;
long[] partitionSizesInMergedFile;
@@ -97,8 +97,8 @@ public void tearDown() {
@Before
@SuppressWarnings("unchecked")
- public void setUp() throws IOException {
- MockitoAnnotations.initMocks(this);
+ public void setUp() throws Exception {
+ MockitoAnnotations.openMocks(this).close();
tempDir = Utils.createTempDir(null, "test");
mergedOutputFile = File.createTempFile("mergedoutput", "", tempDir);
partitionSizesInMergedFile = null;
@@ -194,7 +194,7 @@ private void assertSpillFilesWereCleanedUp() {
private List<Tuple2<Object, Object>> readRecordsFromFile() throws IOException {
final ArrayList<Tuple2<Object, Object>> recordsList = new ArrayList<>();
long startOffset = 0;
- for (int i = 0; i < NUM_PARTITITONS; i++) {
+ for (int i = 0; i < NUM_PARTITIONS; i++) {
final long partitionSize = partitionSizesInMergedFile[i];
if (partitionSize > 0) {
FileInputStream fin = new FileInputStream(mergedOutputFile);
@@ -253,7 +253,7 @@ public void writeEmptyIterator() throws Exception {
assertTrue(mapStatus.isDefined());
assertTrue(mergedOutputFile.exists());
assertEquals(0, spillFilesCreated.size());
- assertArrayEquals(new long[NUM_PARTITITONS], partitionSizesInMergedFile);
+ assertArrayEquals(new long[NUM_PARTITIONS], partitionSizesInMergedFile);
assertEquals(0, taskMetrics.shuffleWriteMetrics().recordsWritten());
assertEquals(0, taskMetrics.shuffleWriteMetrics().bytesWritten());
assertEquals(0, taskMetrics.diskBytesSpilled());
@@ -264,7 +264,7 @@ public void writeEmptyIterator() throws Exception {
public void writeWithoutSpilling() throws Exception {
// In this example, each partition should have exactly one record:
final ArrayList<Product2<Object, Object>> dataToWrite = new ArrayList<>();
- for (int i = 0; i < NUM_PARTITITONS; i++) {
+ for (int i = 0; i < NUM_PARTITIONS; i++) {
dataToWrite.add(new Tuple2<>(i, i));
}
final UnsafeShuffleWriter