Merge branch 'master' into SPARK-32201

LantaoJin authored Jul 16, 2020
2 parents 0950e9a + db47c6e commit 42a52a1
Showing 678 changed files with 8,923 additions and 5,754 deletions.
289 changes: 186 additions & 103 deletions .github/workflows/master.yml
@@ -9,148 +9,231 @@ on:
     - master
 
 jobs:
+  # TODO(SPARK-32248): Recover JDK 11 builds
+  # Build: build Spark and run the tests for specified modules.
   build:
-
+    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        java: [ '1.8', '11' ]
-        hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ]
-        hive: [ 'hive-1.2', 'hive-2.3' ]
-        exclude:
-        - java: '11'
-          hive: 'hive-1.2'
-        - hadoop: 'hadoop-3.2'
-          hive: 'hive-1.2'
-    name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }}
-
+        java:
+          - 1.8
+        hadoop:
+          - hadoop3.2
+        hive:
+          - hive2.3
+        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depends on external Amazon kinesis service.
+        # Note that the modules below are from sparktestsupport/modules.py.
+        modules:
+          - |-
+            core, unsafe, kvstore, avro,
+            network-common, network-shuffle, repl, launcher,
+            examples, sketch, graphx
+          - |-
+            catalyst, hive-thriftserver
+          - |-
+            streaming, sql-kafka-0-10, streaming-kafka-0-10,
+            mllib-local, mllib,
+            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
+          - |-
+            pyspark-sql, pyspark-mllib, pyspark-resource
+          - |-
+            pyspark-core, pyspark-streaming, pyspark-ml
+          - |-
+            sparkr
+        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
+        included-tags: [""]
+        excluded-tags: [""]
+        comment: [""]
+        include:
+          # Hive tests
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- slow tests"
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- other tests"
+          # SQL tests
+          - modules: sql
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- slow tests"
+          - modules: sql
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- other tests"
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
+      HADOOP_PROFILE: ${{ matrix.hadoop }}
+      HIVE_PROFILE: ${{ matrix.hive }}
+      # GitHub Actions' default miniconda to use in pip packaging test.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
     steps:
-    - uses: actions/checkout@master
-    # We split caches because GitHub Action Cache has a 400MB-size limit.
-    - uses: actions/cache@v1
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v1
       with:
         path: build
         key: build-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           build-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/com
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/org
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/net
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-
-    - uses: actions/cache@v1
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/io
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }}
+        path: ~/.m2/repository
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-
-    - name: Set up JDK ${{ matrix.java }}
+          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.ivy2/cache
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
+        restore-keys: |
+          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
+    - name: Install JDK ${{ matrix.java }}
       uses: actions/setup-java@v1
       with:
         java-version: ${{ matrix.java }}
-    - name: Build with Maven
-      run: |
-        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
-        export MAVEN_CLI_OPTS="--no-transfer-progress"
-        mkdir -p ~/.m2
-        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install
-        rm -rf ~/.m2/repository/org/apache/spark
-
-  lint:
-    runs-on: ubuntu-latest
-    name: Linters (Java/Scala/Python), licenses, dependencies
-    steps:
-    - uses: actions/checkout@master
-    - uses: actions/setup-java@v1
-      with:
-        java-version: '11'
-    - uses: actions/setup-python@v1
-      with:
-        python-version: '3.x'
-        architecture: 'x64'
-    - name: Scala
-      run: ./dev/lint-scala
-    - name: Java
-      run: ./dev/lint-java
-    - name: Python
-      run: |
-        pip install flake8 sphinx numpy
-        ./dev/lint-python
-    - name: License
-      run: ./dev/check-license
-    - name: Dependencies
-      run: ./dev/test-dependencies.sh
-
-  lintr:
-    runs-on: ubuntu-latest
-    name: Linter (R)
-    steps:
-    - uses: actions/checkout@master
-    - uses: actions/setup-java@v1
-      with:
-        java-version: '11'
-    - uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: '3.6.2'
-    - name: Install lib
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev
-    - name: install R packages
-      run: |
-        sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
-    - name: package and install SparkR
-      run: ./R/install-dev.sh
-    - name: lint-r
-      run: ./dev/lint-r
-
-  docs:
-    runs-on: ubuntu-latest
-    name: Generate documents
-    steps:
-    - uses: actions/checkout@master
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository
-        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          docs-maven-repo-
-    - uses: actions/setup-java@v1
-      with:
-        java-version: '1.8'
-    - uses: actions/setup-python@v1
-      with:
-        python-version: '3.x'
-        architecture: 'x64'
-    - uses: actions/setup-ruby@v1
-      with:
-        ruby-version: '2.7'
-    - uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: '3.6.2'
-    - name: Install lib and pandoc
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev pandoc
-    - name: Install packages
-      run: |
-        pip install sphinx mkdocs numpy
-        gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
-    - name: Run jekyll build
+    # PySpark
+    - name: Install PyPy3
+      # Note that order of Python installations here matters because default python3 is
+      # overridden by pypy3.
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
+      with:
+        python-version: pypy3
+        architecture: x64
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
+      with:
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python 3.8
+      uses: actions/setup-python@v2
+      # We should install one Python that is higher then 3+ for SQL and Yarn because:
+      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      with:
+        python-version: 3.8
+        architecture: x64
+    - name: Install Python packages (Python 3.6 and PyPy3)
+      if: contains(matrix.modules, 'pyspark')
+      # PyArrow is not supported in PyPy yet, see ARROW-2651.
+      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
+      run: |
+        python3.6 -m pip install numpy pyarrow pandas scipy
+        python3.6 -m pip list
+        pypy3 -m pip install numpy pandas
+        pypy3 -m pip list
+    - name: Install Python packages (Python 3.8)
+      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      run: |
+        python3.8 -m pip install numpy pyarrow pandas scipy
+        python3.8 -m pip list
+    # SparkR
+    - name: Install R 3.6
+      uses: r-lib/actions/setup-r@v1
+      if: contains(matrix.modules, 'sparkr')
+      with:
+        r-version: 3.6
+    - name: Install R packages
+      if: contains(matrix.modules, 'sparkr')
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev
+        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        # Show installed packages in R.
+        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
+    # Run the tests.
+    - name: "Run tests: ${{ matrix.modules }}"
+      run: |
+        # Hive tests become flaky when running in parallel as it's too intensive.
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        mkdir -p ~/.m2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
+        rm -rf ~/.m2/repository/org/apache/spark
+
+  # Static analysis, and documentation build
+  lint:
+    name: Linters, licenses, dependencies and documentation generation
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          docs-maven-
+    - name: Install JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python linter dependencies
+      run: |
+        pip3 install flake8 sphinx numpy
+    - name: Install R 3.6
+      uses: r-lib/actions/setup-r@v1
+      with:
+        r-version: 3.6
+    - name: Install R linter dependencies and SparkR
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev
+        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
+        ./R/install-dev.sh
+    - name: Install Ruby 2.7 for documentation generation
+      uses: actions/setup-ruby@v1
+      with:
+        ruby-version: 2.7
+    - name: Install dependencies for documentation generation
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev pandoc
+        pip install sphinx mkdocs numpy
+        gem install jekyll jekyll-redirect-from rouge
+        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+    - name: Scala linter
+      run: ./dev/lint-scala
+    - name: Java linter
+      run: ./dev/lint-java
+    - name: Python linter
+      run: ./dev/lint-python
+    - name: R linter
+      run: ./dev/lint-r
+    - name: License test
+      run: ./dev/check-license
+    - name: Dependencies test
+      run: ./dev/test-dependencies.sh
+    - name: Run documentation build
       run: |
         cd docs
         jekyll build
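
Each matrix entry in the workflow above ultimately becomes one invocation of the existing dev/run-tests script, with the module list and the included/excluded test tags passed through the MODULES_TO_TEST, INCLUDED_TAGS and EXCLUDED_TAGS environment variables. As a rough local equivalent of the "hive - slow tests" matrix entry, the sketch below simply mirrors the env block and the "Run tests" step; it assumes a Spark source checkout and is an illustration, not a documented recipe:

    # Sketch: approximate the "hive - slow tests" matrix entry locally.
    # Profile variables mirror the env block; flags mirror the "Run tests" step above.
    export HADOOP_PROFILE=hadoop3.2
    export HIVE_PROFILE=hive2.3
    export SERIAL_SBT_TESTS=1   # the workflow sets this for Hive to avoid flaky parallel runs
    ./dev/run-tests \
      --parallelism 2 \
      --modules "hive" \
      --included-tags "org.apache.spark.tags.SlowHiveTest"
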
2 changes: 1 addition & 1 deletion R/pkg/tests/fulltests/test_context.R
@@ -139,7 +139,7 @@ test_that("utility function can be called", {
   expect_true(TRUE)
 })
 
-test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
+test_that("getClientModeSparkSubmitOpts() returns spark-submit args from allowList", {
   e <- new.env()
   e[["spark.driver.memory"]] <- "512m"
   ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
8 changes: 4 additions & 4 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -3921,14 +3921,14 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki
   # before creating a SparkSession with enableHiveSupport = T at the top of this test file
   # (filesBefore). The test here is to compare that (filesBefore) against the list of files before
   # any test is run in run-all.R (sparkRFilesBefore).
-  # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs,
+  # sparkRAllowedSQLDirs is also defined in run-all.R, and should contain only 2 allowed dirs,
   # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the
   # same as before any test is run.
-  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
+  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRAllowedSQLDirs[[1]]))
   # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
   # note: as the note above, after running all tests in this file while enableHiveSupport = T, we
-  # check the list of files again. This time we allow both whitelisted dirs to be in the diff.
-  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
+  # check the list of files again. This time we allow both dirs to be in the diff.
+  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRAllowedSQLDirs))
 })
 
 unlink(parquetPath)
4 changes: 2 additions & 2 deletions R/pkg/tests/run-all.R
@@ -35,8 +35,8 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
   install.spark(overwrite = TRUE)
 
   sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
-  sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
-  invisible(lapply(sparkRWhitelistSQLDirs,
+  sparkRAllowedSQLDirs <- c("spark-warehouse", "metastore_db")
+  invisible(lapply(sparkRAllowedSQLDirs,
                    function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
   sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 
4 changes: 2 additions & 2 deletions appveyor.yml
@@ -42,8 +42,8 @@ install:
   # Install maven and dependencies
   - ps: .\dev\appveyor-install-dependencies.ps1
   # Required package for R unit tests
-  - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
-  - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival'); packageVersion('arrow')"
+  - cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
+  - cmd: Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]"
 
 build_script:
   # '-Djna.nosys=true' is required to avoid kernel32.dll load failure.
@@ -155,4 +155,4 @@ server will be able to understand. This will cause the server to close the conne
 attacker tries to send any command to the server. The attacker can just hold the channel open for
 some time, which will be closed when the server times out the channel. These issues could be
 separately mitigated by adding a shorter timeout for the first message after authentication, and
-potentially by adding host blacklists if a possible attack is detected from a particular host.
+potentially by adding host reject-lists if a possible attack is detected from a particular host.